# Loading libraries and data

In [8]:
# Importing libraries
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# Loading data: read both CSV splits into numpy arrays, skipping the first line of each file
train = np.loadtxt("data/df_train.csv", skiprows=1, delimiter=",")
test = np.loadtxt("data/df_test.csv", skiprows=1, delimiter=",")

# Performance Metrics Functions

In [12]:
def cm_maker(y, ypred, n_classes):
    """Build the confusion matrix that the metric functions below operate on.

    Rows are indexed by the predicted label and columns by the true label.
    Labels 1, 2, 3 are stored at indices 0, 1, 2 and labels 5, 6, 7 at
    indices 3, 4, 5; any other label value is used as an index unchanged.

    Parameters:
        y: iterable of true labels.
        ypred: iterable of predicted labels, paired element-wise with ``y``.
        n_classes: side length of the square matrix.

    Returns:
        An ``(n_classes, n_classes)`` float array of counts.
    """

    def to_index(label):
        # Close the gap left by the missing label 4 so indices are contiguous.
        if label in (1, 2, 3):
            return int(label - 1)
        if label in (5, 6, 7):
            return int(label - 2)
        return int(label)

    cm = np.zeros((n_classes, n_classes))

    for predicted, actual in zip(ypred, y):
        cm[to_index(predicted), to_index(actual)] += 1

    return cm


def preci(cm, c):
    """Precision for class index ``c``.

    Divides the diagonal entry by the total of row ``c`` (the row holds all
    samples predicted as ``c``). Returns 0 when that row is empty.
    """
    predicted_total = sum(cm[c, :])
    return cm[c, c] / predicted_total if predicted_total != 0 else 0


def recall(cm, c):
    """Recall for class index ``c``.

    Divides the diagonal entry by the total of column ``c`` (the column holds
    all samples whose true label is ``c``).

    Returns 0 when the column is empty, i.e. the class never occurs among the
    true labels. This mirrors the guard in ``preci`` and avoids a 0/0 ``nan``
    that would otherwise propagate into the F1 scores.
    """
    actual_total = sum(cm[:, c])
    if actual_total == 0:
        return 0
    return cm[c, c] / actual_total


def f1(cm, c):
    """F1 score for class index ``c``: the harmonic mean of precision and recall.

    Returns 0 when precision and recall sum to zero.
    """
    precision_c = preci(cm, c)
    recall_c = recall(cm, c)
    denominator = precision_c + recall_c

    if denominator == 0:
        return 0
    return 2 * (precision_c * recall_c) / denominator
    

def weighted_f1(cm, n_classes):
    """Weighted F1 score, rounded to three decimals.

    Each class's F1 is weighted by its share of the matrix total, where a
    class's weight is its column sum. Classes with an empty column are skipped.
    """
    support = cm.sum(axis=0)
    total = cm.sum()

    weighted_sum = sum(
        f1(cm, idx) * support[idx] / total
        for idx in range(n_classes)
        if support[idx] != 0
    )

    return round(weighted_sum, 3)
        

def macro_f1(cm, n_classes):
    """Macro F1 score: the unweighted mean of the per-class F1 scores, rounded to three decimals."""
    per_class = [f1(cm, idx) for idx in range(n_classes)]
    return round(sum(per_class) / n_classes, 3)


def accuracy(test_y, ypred):
    """Fraction of positions at which the prediction equals the true label.

    Parameters:
        test_y: indexable sequence of true labels; its length sets the denominator.
        ypred: indexable sequence of predictions, compared position by position.
    """
    n_samples = len(test_y)
    # Count the exact matches between the two sequences.
    n_correct = sum(1 for pos in range(n_samples) if test_y[pos] == ypred[pos])
    return n_correct / n_samples


def performance_report(test_y, ypred, n_classes):
    """Print a full evaluation report for one set of predictions.

    The report contains the confusion matrix, the overall accuracy, one line of
    precision / recall / F1 per class, and the weighted and macro F1 scores.

    Parameters:
        test_y: true labels.
        ypred: predicted labels.
        n_classes: number of classes; the first ``n_classes`` entries of the
            label list [1, 2, 3, 5, 6, 7] are used as display names.
    """
    class_labels = [1, 2, 3, 5, 6, 7]
    rule = '_______________________________________________________________________________'
    cm = cm_maker(test_y, ypred, n_classes)

    print('\nConfusion matrix for prediction:\n', cm)
    print('\n\nAccuracy for prediction:\n', accuracy(test_y, ypred))

    print('\n\nMetrics for classes')
    print(rule)
    print('Class\t|\tPrecision\t|\tRecall\t\t|\tF1 Score')
    print(rule)

    for idx in range(n_classes):
        label = class_labels[idx]
        precision_c, recall_c, f1_c = (
            round(metric(cm, idx), 3) for metric in (preci, recall, f1)
        )
        print('\nClass', label, '|\t', precision_c, '\t\t|\t', recall_c, '\t\t|\t', f1_c)

    print('\n\nWeighted F1 score:\n', weighted_f1(cm, n_classes))
    print('\nMacro F1 score:\n', macro_f1(cm, n_classes))

# Cross validation - Multiclass Classification

In [4]:
import splitter as sp

In [5]:
# Partition `train` for cross-validation with the project-local `splitter` helper
# (presumably into 5 folds, going by the first argument - confirm against splitter.n_folder).
# The grid-search cell below unpacks every element of `sets` as (held-out part, training part).
# NOTE(review): no seed is passed here, so with shuffle_before_split=True the folds may
# differ from run to run - confirm whether n_folder can be seeded.
sets = sp.n_folder(5,train,shuffle_before_split=True)

In [6]:
# Exhaustive grid search for the multiclass SVM, scored by the mean accuracy over the
# splits in `sets`.
#   degree : 1..5            (only used by the poly kernel)
#   gamma  : 10, 1, ..., 1e-7
#   C      : 1/3, 2/3, ..., 11/3
#   kernel : 1 = poly, 2 = linear, 3 = rbf
# Each row of `cross_accuracies` is [kernel id, degree, C, gamma, mean CV accuracy].
KERNEL_BY_ID = {1: "poly", 2: "linear", 3: "rbf"}

cross_accuracies = []

# SVC ignores `degree` for every kernel except poly and ignores `gamma` for the linear
# kernel, so grid points that differ only in an ignored parameter give the same model.
# Their cross-validation result is computed once and reused.
cv_cache = {}

for degree_ in range(1, 6):
    for gamma_exp in range(1, 10):
        gamma_ = 100 / (10 ** gamma_exp)

        for c_step in range(1, 12):
            C_ = c_step / 3

            for kernel_id, kernel_ in KERNEL_BY_ID.items():
                cache_key = (
                    kernel_,
                    degree_ if kernel_ == "poly" else None,
                    C_,
                    gamma_ if kernel_ != "linear" else None,
                )

                if cache_key not in cv_cache:
                    cv_acc = []
                    for cross_test, cross_train in sets:
                        cross_svm = Pipeline([
                            ("scaler", StandardScaler()),
                            ("svm_clf", SVC(kernel=kernel_, degree=degree_,
                                            C=C_, gamma=gamma_)),
                        ])

                        # The last column is the label, everything before it is a feature.
                        cross_svm.fit(cross_train[:, :-1], cross_train[:, -1])
                        cross_y_predicted = cross_svm.predict(cross_test[:, :-1])
                        cross_y_actual = cross_test[:, -1]

                        # `accuracy` is the helper defined in the metrics cell above.
                        cv_acc.append(accuracy(cross_y_actual, cross_y_predicted))

                    cv_cache[cache_key] = np.mean(cv_acc)

                cross_accuracies.append(
                    [kernel_id, degree_, C_, gamma_, cv_cache[cache_key]]
                )

# Sort ascending by mean accuracy so the best configurations are printed last.
a = np.array(cross_accuracies)
sorted_a = a[np.argsort(a[:, -1])]

print(sorted_a)

[[1.         2.         3.33333333 0.01       0.26896552]
 [1.         2.         2.66666667 0.01       0.27586207]
 [1.         2.         3.66666667 0.01       0.27586207]
 ...
 [3.         5.         1.66666667 0.1        0.64137931]
 [3.         3.         1.66666667 0.1        0.64137931]
 [3.         4.         1.66666667 0.1        0.64137931]]


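From the cross-validation test we can see that <b>kernel=rbf</b>, <b>degree=n/a</b> (the rbf kernel does not use it), <b>C=4/3</b> and <b>gamma=0.1</b> yielded the best results for multiclass classification, with an <b>accuracy of ~0.66</b>. (Note: the best rows of the table printed above show C≈1.67 and an accuracy of ≈0.64, so the figures quoted here appear to come from a different, reshuffled run.)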

# Final evaluation - Multiclass Classification

In [13]:
# Final multiclass model: standardise the features, then an rbf-kernel SVM with fixed
# hyper-parameters (C=4/3, gamma=0.1).
svm = Pipeline((
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", degree=5, C=4/3, gamma=0.1)),
))

# Fit on the full training set and predict the test set; the last column is the label.
y_actual = test[:, -1]
y_predicted = svm.fit(train[:, :-1], train[:, -1]).predict(test[:, :-1])

performance_report(y_actual, y_predicted, n_classes=6)


Confusion matrix for prediction:
 [[18.  1.  2.  0.  1.  0.]
 [ 3. 19.  3.  1.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  2.  0.  3.  0.  0.]
 [ 0.  1.  0.  0.  2.  0.]
 [ 0.  0.  0.  0.  0.  8.]]


Accuracy for prediction:
 0.7692307692307693


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.818 		|	 0.857 		|	 0.837

Class 2 |	 0.704 		|	 0.826 		|	 0.76

Class 3 |	 0 		|	 0.0 		|	 0

Class 5 |	 0.6 		|	 0.75 		|	 0.667

Class 6 |	 0.667 		|	 0.667 		|	 0.667

Class 7 |	 1.0 		|	 0.889 		|	 0.941


Weighted F1 score:
 0.742

Macro F1 score:
 0.645


# Cross validation - Binary Classification

In [14]:
# Binary relabelling: Window (original classes 1, 2, 3) -> 1, non-Window (everything else) -> 2.
# The labels are 1/2 rather than 1/0 because cm_maker maps labels 1 and 2 to matrix
# indices 0 and 1, whereas labels 1 and 0 would both land on index 0.

# Load fresh copies so the multiclass arrays `train` / `test` stay untouched.
train2 = np.loadtxt("data/df_train.csv", skiprows=1, delimiter=",")
test2 = np.loadtxt("data/df_test.csv", skiprows=1, delimiter=",")

for data in (train2, test2):
    labels = data[:, -1]
    # The mask is a fresh boolean array computed before any label is overwritten.
    is_window = (labels == 1) | (labels == 2) | (labels == 3)
    data[is_window, -1] = 1
    data[~is_window, -1] = 2

In [15]:
# Partition the binary-labelled training data `train2` for cross-validation with the
# project-local `splitter` helper (presumably into 5 folds - confirm against splitter.n_folder).
# This rebinds `sets`, replacing the multiclass splits created earlier in the notebook.
# NOTE(review): no seed is passed here, so with shuffle_before_split=True the folds may
# differ from run to run - confirm whether n_folder can be seeded.
sets = sp.n_folder(5,train2,shuffle_before_split=True)

In [16]:
# Exhaustive grid search for the binary (Window / non-Window) SVM, scored by the mean
# accuracy over the splits in `sets`.
#   degree : 1..5            (only used by the poly kernel)
#   gamma  : 10, 1, ..., 1e-7
#   C      : 1/3, 2/3, ..., 11/3
#   kernel : 1 = poly, 2 = linear, 3 = rbf
# Each row of `cross_accuracies` is [kernel id, degree, C, gamma, mean CV accuracy].
KERNEL_BY_ID = {1: "poly", 2: "linear", 3: "rbf"}

cross_accuracies = []

# SVC ignores `degree` for every kernel except poly and ignores `gamma` for the linear
# kernel, so grid points that differ only in an ignored parameter give the same model.
# Their cross-validation result is computed once and reused.
cv_cache = {}

for degree_ in range(1, 6):
    for gamma_exp in range(1, 10):
        gamma_ = 100 / (10 ** gamma_exp)

        for c_step in range(1, 12):
            C_ = c_step / 3

            for kernel_id, kernel_ in KERNEL_BY_ID.items():
                cache_key = (
                    kernel_,
                    degree_ if kernel_ == "poly" else None,
                    C_,
                    gamma_ if kernel_ != "linear" else None,
                )

                if cache_key not in cv_cache:
                    cv_acc = []
                    for cross_test, cross_train in sets:
                        cross_svm = Pipeline([
                            ("scaler", StandardScaler()),
                            ("svm_clf", SVC(kernel=kernel_, degree=degree_,
                                            C=C_, gamma=gamma_)),
                        ])

                        # The last column is the label, everything before it is a feature.
                        cross_svm.fit(cross_train[:, :-1], cross_train[:, -1])
                        cross_y_predicted = cross_svm.predict(cross_test[:, :-1])
                        cross_y_actual = cross_test[:, -1]

                        # `accuracy` is the helper defined in the metrics cell above.
                        cv_acc.append(accuracy(cross_y_actual, cross_y_predicted))

                    cv_cache[cache_key] = np.mean(cv_acc)

                cross_accuracies.append(
                    [kernel_id, degree_, C_, gamma_, cv_cache[cache_key]]
                )

# Sort ascending by mean accuracy so the best configurations are printed last.
a = np.array(cross_accuracies)
sorted_a = a[np.argsort(a[:, -1])]

print(sorted_a)

KeyboardInterrupt: 

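From the cross-validation test we can see that <b>kernel=rbf</b>, <b>degree=n/a</b> (the rbf kernel does not use it), <b>C=2/3</b> and <b>gamma=0.1</b> yielded the best results for binary classification, with an <b>accuracy of ~0.96</b>. (Note: the saved output of the cell above is an interrupted run, so the table supporting these figures is not shown in this notebook.)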

# Final evaluation - Binary Classification

In [19]:
# Final binary model: standardise the features, then an rbf-kernel SVM with fixed
# hyper-parameters (C=2/3, gamma=0.1).
bi_svm = Pipeline((
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", degree=1, C=2/3, gamma=0.1)),
))

# Fit on the relabelled training set and predict the relabelled test set;
# the last column is the label.
bi_y_actual = test2[:, -1]
bi_y_predicted = bi_svm.fit(train2[:, :-1], train2[:, -1]).predict(test2[:, :-1])

performance_report(bi_y_actual, bi_y_predicted, n_classes=2)


Confusion matrix for prediction:
 [[46.  3.]
 [ 3. 13.]]


Accuracy for prediction:
 0.9076923076923077


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.939 		|	 0.939 		|	 0.939

Class 2 |	 0.812 		|	 0.812 		|	 0.812


Weighted F1 score:
 0.908

Macro F1 score:
 0.876


# Warm-up SVM training (without cross-validation)

In [None]:
# Feature matrix (all columns but the last) and label vector (last column) of the training set.
X, y = train[:, :-1], train[:, -1]

In [None]:
def SupportVectorMachines(X,y):
    """Grid-search a polynomial-kernel SVM with 5-fold cross-validation.

    Tries every combination of C in {0.5, 1.0, ..., 4.5}, coef0 in {2, ..., 5}
    and degree in {2, ..., 6}, prints progress for each one, and records the
    mean ``cross_val_score`` result.

    Parameters:
        X: feature matrix.
        y: label vector.

    Returns:
        scores: list of mean cross-validation scores in iteration order
            (C outermost, then coef0, then degree).
        best_hyp: ``[C, coef0, degree]`` of the best-scoring combination; on a
            tie the combination tried last wins.
    """
    scores = []
    best_hyp = [0, 0, 0]

    for i in range(1, 10):
        C_ = i / 2

        for coef0_ in range(2, 6):
            for degree_ in range(2, 7):
                clf = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svm_clf", SVC(kernel="poly", degree=degree_,
                                    coef0=coef0_, C=C_)),
                ])
                # cross_val_score clones and fits the estimator for every fold itself,
                # so fitting the pipeline on the full data beforehand is wasted work.
                score = cross_val_score(clf, X, y, cv=5, n_jobs=1).mean()
                scores.append(score)
                best_so_far = max(scores)

                print("new iteration: ", C_, coef0_)
                print(score, best_so_far)

                # `score` is already in `scores`, so equality means it is the current best.
                if score >= best_so_far:
                    best_hyp = [C_, coef0_, degree_]

    return scores, best_hyp

In [None]:
# Run the cross-validated poly-kernel search defined in the cell above on the training data.
scores, best_hyp = SupportVectorMachines(X,y)

In [None]:
# Display all mean cross-validation scores in ascending order.
sorted(scores)

In [None]:
# Display the best combination found, stored as [C, coef0, degree].
best_hyp

In [None]:
def SupportVectorMachines(X,y):
    """Score a degree-2 polynomial SVM on the test set over a (C, coef0) grid.

    NOTE: this re-uses the name of the cross-validated search defined earlier
    in the notebook and shadows it from this cell onwards.

    For every C in {0.5, 1.0, ..., 4.5} and coef0 in {2, ..., 5} the pipeline
    is fitted on ``X, y`` and its accuracy is measured on the module-level
    ``test`` array with the ``accuracy`` helper defined in the metrics cell.

    NOTE(review): the scores come from the test set, so picking
    hyper-parameters from them gives an optimistic estimate.

    Parameters:
        X: training feature matrix.
        y: training label vector.

    Returns:
        scores: flat list of test accuracies, C in the outer loop and coef0 in
            the inner loop (``len(C_list) * len(coef_list)`` entries).
        C_list: the C values tried, each listed once.
        coef_list: the coef0 values tried, each listed once.
    """
    C_list = [i / 2 for i in range(1, 10)]
    coef_list = list(range(2, 6))

    # The last column of `test` is the label, everything before it is a feature.
    X_test, y_test = test[:, :-1], test[:, -1]

    scores = []
    for C_ in C_list:
        for coef0_ in coef_list:
            clf = Pipeline([
                ("scaler", StandardScaler()),
                ("svm_clf", SVC(kernel="poly", degree=2, coef0=coef0_, C=C_)),
            ])
            clf.fit(X, y)
            y_predicted = clf.predict(X_test)

            scores.append(accuracy(y_test, y_predicted))

    return scores, C_list, coef_list

In [None]:
# Run the SupportVectorMachines definition from the cell directly above (it shadows the
# earlier cross-validated one); this also overwrites the earlier `scores`.
scores, C_list, coef_list = SupportVectorMachines(X,y)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter

In [None]:
# Create the figure and a 3-D axes for the accuracy surface.
# Figure.gca() no longer accepts keyword arguments in current Matplotlib, so the 3-D
# axes is requested through add_subplot instead.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

In [None]:
# Coordinate grids for the surface plot: C along x, coef0 along y.
# De-duplicate the axis values so the grid has exactly one column per distinct C and
# one row per distinct coef0.
C_axis = sorted(set(C_list))
coef_axis = sorted(set(coef_list))
X_, Y_ = np.meshgrid(C_axis, coef_axis)

# `scores` is a flat list filled with C in the outer loop and coef0 in the inner loop,
# so reshape it to (n_C, n_coef0) and transpose to match meshgrid's (n_coef0, n_C) layout.
Z_ = np.array(scores).reshape(len(C_axis), len(coef_axis)).T

In [None]:
# Inspect the coef0 coordinate grid (second output of np.meshgrid in the cell above).
Y_

In [None]:
# plot_surface needs X, Y and Z as 2-D arrays of identical shape, so the grids are built
# here from the distinct C / coef0 values and the flat `scores` list is reshaped to match.
# `scores` was filled with C in the outer loop and coef0 in the inner loop.
c_values = sorted(set(C_list))
coef_values = sorted(set(coef_list))
c_grid, coef_grid = np.meshgrid(c_values, coef_values)
score_grid = np.array(scores).reshape(len(c_values), len(coef_values)).T

surf = ax.plot_surface(c_grid, coef_grid, score_grid)
ax.set_xlabel("C")
ax.set_ylabel("coef0")
ax.set_zlabel("accuracy");