### Some Utility Functions:

1. k_fold_split(df= "Data Frame", fold_number= "Number of folds for the splitting", target_fold_index= "Index of the fold (subset) used for testing").

    Returns the two data frames, train for training, and test for testing.
    
2. pca_knn( train="Training Set", test="Testing Set", pca_comp_n="Number of PCA components to keep", k_neighbors= "Number of neighbors for KNN", bootstrap= "boolean for bootstrap procedure" )
    Returns the correct prediction rate (the fraction of test samples that were classified correctly).

In [3]:
# k_fold_split
import numpy as np

def k_fold_split(df, fold_number, target_fold_index):
    """Split ``df`` into a training part and a testing part for one fold.

    Rows are assigned to folds round-robin by position: the row at position
    ``j`` belongs to fold ``j % fold_number``.  The rows of fold
    ``target_fold_index`` become the test set; every other row goes to the
    training set.

    Parameters
    ----------
    df : sized object supporting boolean-mask indexing
        The data to split (e.g. a pandas DataFrame or a NumPy array).
    fold_number : int
        Total number of folds.  Must be at least 1.
    target_fold_index : int
        Zero-based index of the fold held out for testing.  Must satisfy
        ``0 <= target_fold_index < fold_number``.

    Returns
    -------
    tuple
        ``(train, test)``: the rows outside and inside the selected fold.

    Raises
    ------
    ValueError
        If ``fold_number`` is smaller than 1 or ``target_fold_index`` is
        outside ``range(fold_number)``.
    """
    if fold_number < 1:
        raise ValueError(
            "fold_number must be at least 1, got {!r}".format(fold_number)
        )
    # Out-of-range indices used to be accepted silently: a negative index
    # wrapped around to rows counted from the end, and an index greater than
    # or equal to fold_number produced a fold missing its leading rows.
    if not 0 <= target_fold_index < fold_number:
        raise ValueError(
            "target_fold_index must be in range({!r}), got {!r}".format(
                fold_number, target_fold_index
            )
        )

    # One vectorised comparison replaces the per-row Python loop.
    positions = np.arange(len(df))
    test_mask = positions % fold_number == target_fold_index

    train = df[~test_mask]
    test = df[test_mask]

    return train, test

In [9]:
# pca_knn

import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

def pca_knn(train, test, pca_comp_n=2, k_neighbors=6, bootstrap=True,
            feature_count=54, target_column="Cover_Type", random_state=None):
    """Fit PCA + KNN on ``train`` and return the accuracy on ``test``.

    The training frame is first resampled, its feature columns are projected
    onto ``pca_comp_n`` principal components (fitted on the training data
    only), and a k-nearest-neighbours classifier is trained in that reduced
    space.  The test features go through the same fitted projection before
    prediction.

    Parameters
    ----------
    train, test : data frames
        Must expose ``.columns`` and column selection by label; the first
        ``feature_count`` columns are used as features and
        ``target_column`` as the class label.
    pca_comp_n : int
        Number of PCA components to keep.
    k_neighbors : int
        Number of neighbours used by the KNN classifier.
    bootstrap : bool
        Passed to ``sklearn.utils.resample`` as ``replace``.  With ``True``
        the training rows are drawn with replacement (bootstrap sample);
        with ``False`` they are drawn without replacement.
    feature_count : int
        How many leading columns hold the features (default 54, the value
        that was previously hard-coded).
    target_column : str
        Name of the label column (default ``"Cover_Type"``, the value that
        was previously hard-coded).
    random_state : int, RandomState instance or None
        Forwarded to ``sklearn.utils.resample`` so the resampling can be
        made reproducible.  ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the actual label.
    """
    # Resample the training data (bootstrap when replace=True).
    train = resample(train, replace=bootstrap, random_state=random_state)

    train_features = train[train.columns[:feature_count]]
    train_target = train[target_column]
    test_features = test[test.columns[:feature_count]]
    test_target = test[target_column]

    # Fit the PCA on the training features only, then apply that same
    # projection to both sets so no test information enters the fit.
    pca_space = PCA(n_components=pca_comp_n)
    pca_space.fit(train_features)
    train_transformed = pca_space.transform(train_features)
    test_transformed = pca_space.transform(test_features)

    # Train the KNN classifier in the reduced space.
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(train_transformed, train_target)

    # Make predictions and compare them element-wise with the true labels.
    predicted = knn.predict(test_transformed)
    actual = np.asarray(test_target)
    correct_count = np.count_nonzero(predicted == actual)

    # float() keeps true division even under Python 2 semantics.
    return float(correct_count) / len(actual)