# Imports

In [27]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV

#Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Performance metrics report

In [28]:
# Function that computes a confusion matrix which is used to compute the below functions
def cm_maker(y, ypred, n_classes):
    
    low = [1, 2, 3]
    high = [5, 6, 7]
    
    cm = np.zeros((n_classes, n_classes))
    
    for i, j in zip(ypred, y):
        
        if i in low:
            i = i - 1
        if i in high:
            i = i - 2
            
        if j in low:
            j = j - 1
        if j in high:
            j = j - 2
            
        cm[int(i), int(j)] += 1
        
    return cm


# Function computes precision score
def preci(cm, c):
    
    if sum(cm[c,:]) == 0:
        return 0
    else:
        return cm[c,c]/sum(cm[c,:])


# Function computes recall score
def recall(cm, c):
    
    return cm[c,c]/sum(cm[:,c])


# Function computes f1-score
def f1(cm, c):
    if (preci(cm,c) + recall(cm,c)) == 0:
        return 0
    else: 
        return 2 * (preci(cm,c) * recall(cm,c)) / (preci(cm,c) + recall(cm,c))
    

# Function computes weighted f1-score
def weighted_f1(cm, n_classes):
    co_su=cm.sum(axis=0)
    n=cm.sum()
    
    weighted_f1_sum = 0
    
    for c in range(n_classes):
        if co_su[c] != 0:
            weighted_f1_sum += f1(cm, c) * co_su[c] / n

    return round(weighted_f1_sum, 3)
        

# Function computes macro f1-score
def macro_f1(cm, n_classes):
    
    f1_sum = 0
    
    for i in range(n_classes):
        f1_sum += f1(cm, i)
    
    return round(f1_sum / n_classes, 3)


# Function to get accuracy
def accuracy(test_y, ypred):
    
    # Count of times where true labels equal predictions
    true_positives = 0
    for i in range(len(test_y)):
        if test_y[i] == ypred[i]:
            true_positives += 1
    
    return true_positives / len(test_y)


# Combines functions above for a coherent performance report
def performance_report(test_y, ypred, n_classes):
    
    class_labels = [1, 2, 3, 5, 6, 7]
    cm = cm_maker(test_y, ypred, n_classes)
    
    print('\nConfusion matrix for prediction:\n', cm)
    print('\n\nAccuracy for prediction:\n', accuracy(test_y, ypred))
    
    print('\n\nMetrics for classes')
    print('_______________________________________________________________________________')
    print('Class\t|\tPrecision\t|\tRecall\t\t|\tF1 Score')
    print('_______________________________________________________________________________')
    
    for i in range(n_classes):
        
        print('\nClass', class_labels[i],'|\t',round(preci(cm, i), 3),'\t\t|\t',round(recall(cm, i), 3),'\t\t|\t',round(f1(cm, i), 3))
    
    print('\n\nWeighted F1 score:\n', weighted_f1(cm, n_classes))
    print('\nMacro F1 score:\n', macro_f1(cm, n_classes))
    
    

# Principal Component Analysis to tranform data

In [29]:
# Implementation of PCA using numpy
def PCA_dim_reduction(train_file, test_file, n_components):
    
    # Read data
    train = np.loadtxt(train_file, skiprows=1, delimiter=",")
    test = np.loadtxt(test_file, skiprows=1, delimiter=",")

    train_x = train[:,:-1]  
    train_y = train[:,-1]

    test_x = test[:,:-1]
    test_y = test[:,-1]
    
    # Standardize data
    mean = np.mean(train_x, axis=0)
    std = np.std(train_x, axis=0)
    
    train_x = (train_x - mean) / std
    test_x = (test_x - mean) / std
    
    # Get covariance matrix
    covmat = np.cov(train_x.T)
    
    # Get eigenvalues
    eigval, eigvec = np.linalg.eig(covmat)
    
    # Get explained variance (if needed)
    exp_var = []
    for i in range(len(eigval)):
        exp_var.append(eigval[i]/np.sum(eigval))
    exp_var = sorted(exp_var, reverse=True)

    # Pair up eigenvalues and vectors, so the vectors can be ranked by values
    eigenpairs = []
    for i in range(len(eigval)):
        eigenpairs.append((eigval[i]/np.sum(eigval), eigvec[:, i].T))
    eigenpairs = sorted(eigenpairs, reverse=True)
    
    # Put in the vectors from most to least important in a list
    sorted_vectors = []
    for i in range(len(eigenpairs)):
        sorted_vectors.append(eigenpairs[i][1])
    
    # Make new variables to store the transformed data (the n principle coponents)
    new_train_x = np.zeros((len(train_x), n_components))
    new_test_x = np.zeros((len(test_x), n_components))
    
    # Transform the data into n principle components on the new subspace
    for i in range(n_components):
        new_train_x[:,i] = train_x @ sorted_vectors[i]
        new_test_x[:,i] = test_x @ sorted_vectors[i]
    
    # Store transformed data (pc's) in old variables
    train_x = new_train_x
    test_x = new_test_x
    
    # Return the transformed train and test data + class labels
    return train_x, train_y, test_x, test_y

# Finetuned Voting Classifier on Multi-Class PC(n=5) data

In [30]:
train_x, train_y, test_x, test_y = PCA_dim_reduction('df_train.csv', 'df_test.csv', 5)

clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=2, min_samples_split=9)
clf3 = SVC(C=1000, gamma=0.01, probability=True)
clf4 = KNeighborsClassifier(n_neighbors=2)

base_models = [('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)]

meta_model = VotingClassifier(estimators=base_models, voting = 'soft', weights=[3, 3, 5, 3]) # Weight chosen by finetuning

meta_model.fit(train_x, train_y)

pred_y = meta_model.predict(test_x)

performance_report(test_y, meta_model.predict(test_x), 6)


Confusion matrix for prediction:
 [[21.  1.  4.  0.  1.  0.]
 [ 0. 21.  1.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  3.  0.  1.]
 [ 0.  0.  0.  0.  2.  0.]
 [ 0.  0.  0.  0.  0.  8.]]


Accuracy for prediction:
 0.8461538461538461


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.778 		|	 1.0 		|	 0.875

Class 2 |	 0.913 		|	 0.913 		|	 0.913

Class 3 |	 0 		|	 0.0 		|	 0

Class 5 |	 0.6 		|	 0.75 		|	 0.667

Class 6 |	 1.0 		|	 0.667 		|	 0.8

Class 7 |	 1.0 		|	 0.889 		|	 0.941


Weighted F1 score:
 0.814

Macro F1 score:
 0.699


# Finetuned Voting Classifier on Binary (window/non-window) PC(n=5) data

In [11]:
"""
Function to read in the data, transform the data to the first 5 principal components,
and then make the data binary by combining window classes (1) and non-window classes (2)
in two seperate classes.
"""
def binary_classification_data(train_file, test_file):    
    # Read in the top 5 Principal Components
    train_x, train_y, test_x, test_y = PCA_dim_reduction(train_file, test_file, 5)

    # Divide test and train data into features and class labels
    window = [1,2,3]

    for i in range(len(train_y)):
        if train_y[i] in window:
            train_y[i] = 1
        else:
            train_y[i] = 2

    for i in range(len(test_y)):
        if test_y[i] in window:
            test_y[i] = 1
        else:
            test_y[i] = 2
    
    return train_x, train_y, test_x, test_y

In [32]:
# Read in binary train and test data + labels and store in variables
train_x, train_y, test_x, test_y = binary_classification_data('df_train.csv', 'df_test.csv')

# Our base models/classifiers
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier()
clf3 = SVC(probability=True)  # Probability set to True to enable soft voting (prediction probability based voting)
clf4 = KNeighborsClassifier()

# Stored in a list with a label to access them
base_models = [('QDA', clf1), ('DT', clf2), ('SVC', clf3), ('KNN', clf4)]

# Framework for voting classifier with soft voting enabled and our vanilla estimators
clf = VotingClassifier(estimators=base_models, voting='soft')

# A dictionary containing the hyperparameter names and the values we wish to include in the combinations
params = {'DT__max_depth':range(1,10),
          'DT__min_samples_split':range(2,10),
          'SVC__C':[1, 10, 100, 1000],
          'SVC__gamma':[0.1, 0.01, 0.001, 0.0001, 0.00001],
          'KNN__n_neighbors':range(1,15)
}

# 5-fold grid-search with backend print
grid = GridSearchCV(clf, params, cv=5, verbose=1, n_jobs=-1)
grid.fit(train_x, train_y)

# Create a variable for the best estimator
best_model = grid.best_estimator_

# Fit the model to the training data
best_model.fit(train_x, train_y)

# Make the model's prediction on the test data
pred_y = best_model.predict(test_x)

# Print the performance of the grid-searched model
performance_report(test_y, best_model.predict(test_x), 2)

Fitting 5 folds for each of 20160 candidates, totalling 100800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 1812 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 5312 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 10212 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 16512 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done 24212 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 33312 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 43812 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 55712 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 69012 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 83712 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 99812 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100800 out of 100800 | elapsed:  3.4min finished



Confusion matrix for prediction:
 [[47.  1.]
 [ 2. 15.]]


Accuracy for prediction:
 0.9538461538461539


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.979 		|	 0.959 		|	 0.969

Class 2 |	 0.882 		|	 0.938 		|	 0.909


Weighted F1 score:
 0.954

Macro F1 score:
 0.939


# Finetuned Binary Voting Classifier

In [31]:
# Read in binary window/non-window 5 component pca-data
train_x, train_y, test_x, test_y = binary_classification_data('df_train.csv', 'df_test.csv')

# Try model from search above, if none then use earlier model
try:
    clf = best_model 
except NameError:
    clf = VotingClassifier(estimators=
                            [('QDA', QuadraticDiscriminantAnalysis()),
                             ('DT', DecisionTreeClassifier(max_depth=2, min_samples_split=8)),
                             ('SVC', SVC(C=100, gamma=0.1, probability=True)),
                             ('KNN', KNeighborsClassifier(n_neighbors=1))],
                        voting='soft')

# fit the model
clf.fit(train_x, train_y)

# get predictions from the voting classifier
pred_y = clf.predict(test_x)

# print performance metrics from the predictions on the test data
performance_report(test_y, clf.predict(test_x), 2)


Confusion matrix for prediction:
 [[47.  1.]
 [ 2. 15.]]


Accuracy for prediction:
 0.9538461538461539


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.979 		|	 0.959 		|	 0.969

Class 2 |	 0.882 		|	 0.938 		|	 0.909


Weighted F1 score:
 0.954

Macro F1 score:
 0.939


# Search for the best weights (range 1-5 for all weights)

In [14]:
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=2, min_samples_split=9)
clf3 = SVC(C=1000, gamma=0.01, probability=True)
clf4 = KNeighborsClassifier(n_neighbors=2)

base_models = [('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)]

weight = []
i, j, k, l = 1,1,1,1
for i in range(1,6):
    weight.append([i, j, k, l])
    for j in range(1,6):
        weight.append([i, j, k, l])
        for k in range(1,6):
            weight.append([i, j, k, l])
            for l in range(1,6):
                weight.append([i, j, k, l])

highest_score = 0
best_weight = [[0,0,0,0]]
also_best_weights = []

for x in range(10):
    for i in weight:
        train_x, train_y, test_x, test_y = PCA_dim_reduction('df_train.csv', 'df_test.csv', 5)

        meta_model = VotingClassifier(estimators=base_models, voting = 'soft', weights = i)
        meta_model.fit(train_x, train_y)

        if meta_model.score(test_x, test_y) == highest_score:
            also_best_weights.append(i)

        if meta_model.score(test_x, test_y) > highest_score:
            highest_score = meta_model.score(test_x, test_y)
            best_weight[0] = i
            also_best_weights = []

print(best_weight, highest_score, also_best_weights)

[[3, 4, 4, 3]] 0.8615384615384616 [[3, 4, 5, 3]]


# GridSearch for finetuning hyperparameters in Voting Classifier
### The grid search will try all given combinations of hyperparameters and evaluate the best model based on a given scoring metric (in this case accuracy).

In [15]:


# Our base models/classifiers
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=1)
clf3 = SVC(probability=True)  # Probability set to True to enable soft voting (prediction probability based voting)
clf4 = KNeighborsClassifier()

# Stored in a list with a label to access them
base_models = [('QDA', clf1), ('DT', clf2), ('SVC', clf3), ('KNN', clf4)]

# Framework for voting classifier with soft voting enabled and our vanilla estimators
clf = VotingClassifier(estimators=base_models, voting='soft')

# A dictionary containing the hyperparameter names and the values we wish to include in the combinations
params = {'DT__max_depth':range(1,10),
          'DT__min_samples_split':range(1,10),
          'SVC__C':[1, 10, 100, 1000],
          'SVC__gamma':[0.1, 0.01, 0.001, 0.0001, 0.00001],
          'KNN__n_neighbors':range(1,15)
}

best_models = []
grid = GridSearchCV(clf, params, cv=5, verbose=1, n_jobs=-1)

grid.fit(train_x, train_y)

best_model = grid.best_estimator_

best_model.fit(train_x, train_y)
    
best_models.append((best_model.score(test_x, test_y), grid.best_estimator_))

print(best_models)

Fitting 5 folds for each of 22680 candidates, totalling 113400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 340 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 3040 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 7540 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 13840 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 21940 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 31840 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 43540 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 57040 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 72340 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 89440 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 108340 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 113400 out of 113400 | elapsed:  5.9min finished


[(0.7846153846153846, VotingClassifier(estimators=[('QDA', QuadraticDiscriminantAnalysis()),
                             ('DT',
                              DecisionTreeClassifier(max_depth=7,
                                                     min_samples_split=5)),
                             ('SVC', SVC(C=1000, gamma=0.1, probability=True)),
                             ('KNN', KNeighborsClassifier(n_neighbors=1))],
                 voting='soft'))]


# HIGHEST SCORING MODELS

In [16]:
def best_weighting(base_models):
    
    weight = []
    i, j, k, l = 1,1,1,1
    for i in range(1,6):
        weight.append([i, j, k, l])
        for j in range(1,6):
            weight.append([i, j, k, l])
            for k in range(1,6):
                weight.append([i, j, k, l])
                for l in range(1,6):
                    weight.append([i, j, k, l])
                    
    highest_score = 0
    best_weight = [[0,0,0,0]]
    also_best_weights = []

    for x in range(5):
        for i in weight:
            train_x, train_y, test_x, test_y = PCA_dim_reduction('df_train.csv', 'df_test.csv', 5)

            meta_model = VotingClassifier(estimators=base_models, voting = 'soft', weights = i)
            meta_model.fit(train_x, train_y)

            if meta_model.score(test_x, test_y) == highest_score:
                also_best_weights.append(i)

            if meta_model.score(test_x, test_y) > highest_score:
                highest_score = meta_model.score(test_x, test_y)
                best_weight[0] = i
                also_best_weights = []
                
    return (base_models, best_weight, highest_score, also_best_weights)

model1 = best_weighting([('QDA', QuadraticDiscriminantAnalysis()),
                         ('DT', DecisionTreeClassifier(max_depth=2, min_samples_split=8)),
                         ('SVC', SVC(C=1000, gamma=0.01, probability=True)),
                         ('KNN', KNeighborsClassifier(n_neighbors=2))])

model2 = best_weighting([('QDA', QuadraticDiscriminantAnalysis()),
                         ('DT', DecisionTreeClassifier(max_depth=2, min_samples_split=9)),
                         ('SVC', SVC(C=1000, gamma=0.01, probability=True)),
                         ('KNN', KNeighborsClassifier(n_neighbors=2))])

model3 = best_weighting([('QDA', QuadraticDiscriminantAnalysis()),
                         ('DT', DecisionTreeClassifier(max_depth=2, min_samples_split=4)),
                         ('SVC', SVC(C=1000, gamma=0.01, probability=True)),
                         ('KNN', KNeighborsClassifier(n_neighbors=2))])

# Model weight evaluation

In [17]:
train_x, train_y, test_x, test_y = PCA_dim_reduction('df_train.csv', 'df_test.csv', 5)

def average_acc_eval(basemodels, weight, n):
    
    train_x, train_y, test_x, test_y = PCA_dim_reduction('df_train.csv', 'df_test.csv', 5)
    
    sums = 0
    for i in range(n):
        meta_model = VotingClassifier(estimators=base_models, voting = 'soft', weights=[3, 3, 4, 3])

        meta_model.fit(train_x, train_y)

        pred_y = meta_model.predict(test_x)

        correct = 0
        
        for i in range(len(test_y)):
            if pred_y[i] == test_y[i]:
                correct+=1

        acc = correct / len(test_y)

        sums += acc

    return sums/n

# Model 1
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=2, min_samples_split=7)
clf3 = SVC(C=1000, gamma=0.01, probability=True)
clf4 = KNeighborsClassifier(n_neighbors=2)

base_models = [('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)]

weights_model1 = [[3, 3, 4, 3]]

avg_model1 = []
for weight in weights_model1:
    avg_model1.append(average_acc_eval([('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)], weight, 100))


# Model 2
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=2, min_samples_split=8)
clf3 = SVC(C=1000, gamma=0.01, probability=True)
clf4 = KNeighborsClassifier(n_neighbors=2)

base_models = [('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)]

weights_model2 = [[1, 1, 2, 1], [3, 3, 5, 2], [4, 4, 5, 3], [3, 3, 5, 3], [2, 2, 4, 2], [2, 2, 5, 2]]

avg_model2 = []
for weight in weights_model2:
    avg_model2.append(average_acc_eval([('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)], weight, 100))



# Model 3
clf1 = QuadraticDiscriminantAnalysis()
clf2 = DecisionTreeClassifier(max_depth=2, min_samples_split=9)
clf3 = SVC(C=1000, gamma=0.01, probability=True)
clf4 = KNeighborsClassifier(n_neighbors=2)

base_models = [('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)]

weights_model3 = [[2, 2, 3, 2], [1, 1, 3, 1], [2, 2, 4, 2], [3, 3, 5, 3], [2, 3, 3, 2], [3, 3, 4, 3], [2, 2, 4, 2]]

avg_model3 = []
for weight in weights_model3:
    avg_model3.append(average_acc_eval([('QDA', clf1), ('DT', clf2), ('SVM', clf3), ('KNN', clf4)], weight, 100))


#model2=[avg_acc_m2_w1, avg_acc_m2_w2, avg_acc_m2_w3, avg_acc_m2_w4, avg_acc_m2_w5, avg_acc_m2_w6]
#model3=[avg_acc_m3_w1, avg_acc_m3_w2, avg_acc_m3_w3, avg_acc_m3_w4, avg_acc_m3_w5, avg_acc_m3_w6, avg_acc_m3_w7, avg_acc_m3_w8]

print("\n\nMODEL 1:\n")
for i in range(len(avg_model1)):
    print(f"average accuracy for model 1 weight {i+1}: {avg_model1[i]}")
    
print("\n\nMODEL 2:\n")
for i in range(len(avg_model2)):
    print(f"average accuracy for model 2 weight {i+1}: {avg_model3[i]}")
print("\n\nMODEL 3:\n")
for i in range(len(avg_model2)):
    print(f"average accuracy for model 3 weight {i+1}: {avg_model3[i]}")



MODEL 1:

average accuracy for model 1 weight 1: 0.8361538461538452


MODEL 2:

average accuracy for model 2 weight 1: 0.8347692307692303
average accuracy for model 2 weight 2: 0.8361538461538457
average accuracy for model 2 weight 3: 0.8344615384615384
average accuracy for model 2 weight 4: 0.8370769230769227
average accuracy for model 2 weight 5: 0.8346153846153844
average accuracy for model 2 weight 6: 0.8349230769230767


MODEL 3:

average accuracy for model 3 weight 1: 0.8347692307692303
average accuracy for model 3 weight 2: 0.8361538461538457
average accuracy for model 3 weight 3: 0.8344615384615384
average accuracy for model 3 weight 4: 0.8370769230769227
average accuracy for model 3 weight 5: 0.8346153846153844
average accuracy for model 3 weight 6: 0.8349230769230767
