In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

## Setup

In [None]:
# Load the dataset; the first CSV column becomes the row index.
df = pd.read_csv('train_and_test.csv', index_col=0)

# Class label column.
target = df['Cover_Type']

# Predictors: the first 54 columns of the frame.
features = df.loc[:, df.columns[:54]]

# NOTE(review): this holds every column label of `df` (the label column
# included), not only the 54 predictor names -- confirm that is intended.
feature_names = df.columns.tolist()

## PCA

In [None]:
# PCA with default settings (no component limit), fitted on all feature rows.
# `fit` returns the estimator itself, so the chained form keeps the fitted object.
pca_feat = PCA().fit(features)

# Reduced projection: keep only the first `n_comp` principal components and
# project every row of `features` onto them.
n_comp = 5
pca_n = PCA(n_components=n_comp)
features_transformed_pca = pca_n.fit_transform(features)

## Preparing the PCA-transformed data for LDA and QDA (train/test split)

In [None]:
# Seed NumPy's global generator so the random split below is repeatable.
np.random.seed(42)

# Boolean row mask: True -> training (roughly 80%), False -> testing (roughly 20%).
msk = np.random.rand(len(features_transformed_pca)) < 0.8

# NOTE(review): the PCA projection was fitted on all rows before this split,
# so the held-out rows already influenced the components.

# Training partitions; LDA and QDA each get their own selection from the same mask.
train_features_lda, train_target_lda = features_transformed_pca[msk], target[msk]
train_features_qda, train_target_qda = features_transformed_pca[msk], target[msk]

# Held-out partitions (rows where the mask is False).
test_features_lda, test_target_lda = features_transformed_pca[~msk], target[~msk]
test_features_qda, test_target_qda = features_transformed_pca[~msk], target[~msk]

## LDA

In [None]:
# Fit LDA on the training partition (`fit` returns the estimator itself).
lda = LinearDiscriminantAnalysis().fit(train_features_lda, train_target_lda)

# Ground-truth labels of the held-out rows as a plain NumPy array.
actual_lda = np.array(test_target_lda)

# Predicted labels for the held-out rows.
predicted_lda = lda.predict(test_features_lda)

## QDA

In [None]:
# Fit QDA on the training partition (`fit` returns the estimator itself).
qda = QuadraticDiscriminantAnalysis().fit(train_features_qda, train_target_qda)

# Ground-truth labels of the held-out rows as a plain NumPy array.
actual_qda = np.array(test_target_qda)

# Predicted labels for the held-out rows.
predicted_qda = qda.predict(test_features_qda)

## Cross Validation

In [None]:
def k_fold_split(df, fold_number, target_fold_index):
    """Split ``df`` into a train part and a test part for one interleaved fold.

    The test part consists of the rows at positions ``target_fold_index``,
    ``target_fold_index + fold_number``, ``target_fold_index + 2 * fold_number``
    and so on; every other row goes to the train part.

    Parameters
    ----------
    df : pandas.DataFrame (or any object supporting ``len`` and boolean-mask indexing)
        Data to split; row order is preserved in both outputs.
    fold_number : int
        Stride between consecutive test rows, i.e. the number of folds. Must be >= 1.
    target_fold_index : int
        Position of the first test row. Must be >= 0.

    Returns
    -------
    (train, test)
        The rows not selected and the rows selected by the fold mask.

    Raises
    ------
    ValueError
        If ``fold_number`` is smaller than 1 or ``target_fold_index`` is negative.
    """
    if fold_number < 1:
        raise ValueError("fold_number must be a positive integer")
    if target_fold_index < 0:
        raise ValueError("target_fold_index must be non-negative")

    test_mask = np.zeros(len(df), dtype=bool)
    # Strided slice assignment marks every fold_number-th row starting at
    # target_fold_index, replacing the per-row Python loop.
    test_mask[target_fold_index::fold_number] = True

    train = df[~test_mask]
    test = df[test_mask]

    return train, test


# pca_lda
def lda_qda_knn(train, test, lda_huh, pca_comp_n=2, bootstrap=True, random_state=None):
    """Score a PCA + LDA/QDA classifier trained on a resampled training fold.

    Despite the ``knn`` in the name, no nearest-neighbour model is used: the
    classifier is either ``LinearDiscriminantAnalysis`` or
    ``QuadraticDiscriminantAnalysis``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames whose first 54 columns are used as predictors and which contain
        a ``"Cover_Type"`` label column.
    lda_huh : bool
        Truthy selects LDA, falsy selects QDA.
    pca_comp_n : int, default 2
        Number of principal components kept before classification.
    bootstrap : bool, default True
        Passed to ``resample`` as ``replace``: whether the training rows are
        drawn with replacement.
    random_state : optional, default None
        Forwarded to ``resample`` so a bootstrap round can be reproduced;
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the true label.
    """
    # Resample the training rows (requires `from sklearn.utils import resample`).
    train = resample(train, replace=bootstrap, random_state=random_state)

    train_features = train[train.columns[:54]]
    train_target = train["Cover_Type"]
    test_features = test[test.columns[:54]]
    test_target = test["Cover_Type"]

    # Fit the PCA on the training rows only, then project both partitions
    # with that same fitted transform.
    pca_space = PCA(n_components=pca_comp_n)
    pca_space.fit(train_features)
    train_transformed = pca_space.transform(train_features)
    test_transformed = pca_space.transform(test_features)

    # Choose the discriminant model.
    if lda_huh:
        model = LinearDiscriminantAnalysis()
    else:
        model = QuadraticDiscriminantAnalysis()

    model.fit(train_transformed, train_target)

    # Predict with the fitted LDA/QDA model.
    predicted = model.predict(test_transformed)
    actual = np.array(test_target)

    # Accuracy: number of element-wise matches divided by the number of test rows.
    correct_count = np.count_nonzero(predicted == actual)
    correct_rate = float(correct_count) / len(actual)

    return correct_rate

## Printing the results

In [None]:
pca_comp = 5

# Cross validation parameters:
fold = 10
bootstrap_rounds = 10
bootsrap_rounds = bootstrap_rounds  # original (misspelled) name kept as an alias

for j in range(0, 2):
    # j == 0 -> QDA, j == 1 -> LDA
    use_lda = bool(j)
    model_name = "LDA" if use_lda else "QDA"

    # cv[i, k] holds the accuracy of bootstrap round i on fold k.
    cv = np.empty(shape=(bootstrap_rounds, fold))
    for fold_index in range(fold):
        train, test = k_fold_split(df, fold, fold_index)
        for i in range(bootstrap_rounds):
            # Keyword arguments: the positional call used before bound
            # pca_comp to `lda_huh` and j to `pca_comp_n`.
            correct_rate = lda_qda_knn(train, test, lda_huh=use_lda, pca_comp_n=pca_comp)
            cv[i, fold_index] = correct_rate

    cv_flat = cv.flatten()
    bins = np.linspace(np.amin(cv_flat), np.amax(cv_flat), 20)

    # Report the mean instead of computing it and discarding the value.
    print("{} (PCA components = {}): mean accuracy = {:.4f}".format(
        model_name, pca_comp, np.mean(cv_flat)))

    fig, ax = plt.subplots()
    ax.hist(cv_flat, bins=bins)
    ax.set(
        title="{}: accuracy over {} folds x {} bootstrap rounds".format(
            model_name, fold, bootstrap_rounds),
        xlabel="Correct classification rate",
        ylabel="Count",
    )
    plt.show()