In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn import svm

In [2]:
# Load the dataset
test_feature = pd.read_csv("question-2-test-features.csv", header=None)
test_label = pd.read_csv("question-2-test-labels.csv", header=None)
train_feature = pd.read_csv("question-2-train-features.csv", header=None)
train_label = pd.read_csv("question-2-train-labels.csv", header=None);

In [3]:
def create_folds(array, count):
    folds = []
    size = np.shape(array)[0]
    fold_size = int(size / count)
    for i in range(10):
        piece1 = array[0:i*fold_size]
        piece2 = array[(i+1)*fold_size:size]
        folds.append(np.concatenate((piece1,piece2)))
    return np.array(folds)

In [4]:
############################################## TRAINING DATA PRE-PROCESSING ####################################################

In [5]:
# Obtain training data and label as whole
x_train = train_feature.values
y_train = train_label.values
y_train[y_train < 190] = 0 
y_train[y_train >= 190] = 1
y_train = np.transpose(y_train)[0]

In [6]:
# Divide data and label into folds
x_train_folds = create_folds(x_train, 10)
y_train_folds = create_folds(y_train, 10)

In [11]:
singular_x_folds = np.array(np.split(x_train, 10))
singular_y_folds = np.array(np.split(y_train,10))

In [19]:
# Obtain test data and label
x_test = test_feature.values
y_test = test_label.values
y_test[y_test < 190] = 0 
y_test[y_test >= 190] = 1
y_test = np.transpose(y_test)[0]

In [20]:
###################################################### LINEAR SVM ##############################################################

In [21]:
C_interval = [0.001,0.01,0.1,1,10,100]
gamma_interval = [1/16,1/8,1/4,1/2,1,2]

In [22]:
# Train model with different C values with 10-fold cross validation
train_accuracy = np.zeros((len(C_interval),10))
for c in range(len(C_interval)):
    classifier = svm.LinearSVC(C=C_interval[c], random_state=0)
    for i in range(10):
        classifier.fit(x_train_folds[i], y_train_folds[i])
        # Train prediction and accuracy
        y_pred = classifier.predict(singular_x_folds[i])
        train_accuracy[c][i] = np.sum(y_pred==singular_y_folds[i]) / len(singular_y_folds[i]) * 100

In [23]:
# Find optimal C value
training_mean_accuracy = np.mean(train_accuracy, axis=1)
optimal_c = C_interval[np.argmax(training_mean_accuracy)]

In [24]:
training_mean_accuracy

array([71.05714286, 75.07142857, 75.32857143, 73.44285714, 65.88571429,
       69.51428571])

In [25]:
# Test prediction and accuracy
classifier = svm.LinearSVC(C=optimal_c, random_state=0)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
test_accuracy = np.sum(y_pred==y_test) / len(y_test) * 100
print("Test accuracy = " + str(test_accuracy))

Test accuracy = 69.41977501480166


In [26]:
# Confusion Matrix values
tp = int(np.sum(y_pred * y_test))
tn = int(np.sum(y_pred+y_test==0))
fp = int(np.sum(y_pred-y_test==1))
fn = int(np.sum(y_test-y_pred==1))

print("True Positive: " + str(tp))
print("True Negative: " + str(tn))
print("False Positive: " + str(fp))
print("False Negative: " + str(fn))
print("Total = " + str(tp+fp+fn+tn))

print()
print("Precision: " + str(tp/(tp+fp)))
print("Recall: " + str(tp/(tp+fn)))
print("NPV: " + str(tn/(tn+fn)))
print("FPR: " + str(fp/(fp+tn)))
print("FDR: " + str(fp/(tp+fp)))
print("F1: " + str((2*tp/(tp+fp))/(tp/(tp+fp)+tp/(tp+fn))))
print("F2: " + str((5*tp/(tp+fp))/(4*tp/(tp+fp)+tp/(tp+fn))))

True Positive: 836
True Negative: 1509
False Positive: 104
False Negative: 929
Total = 3378

Precision: 0.8893617021276595
Recall: 0.4736543909348442
NPV: 0.6189499589827727
FPR: 0.06447613143211407
FDR: 0.11063829787234042
F1: 1.3049907578558226
F2: 1.1031250000000001


In [27]:
##################################################### RBF SVM ##################################################################

In [28]:
# Train model with different C values with 10-fold cross validation
train_accuracy_rbf = np.zeros((len(gamma_interval),10))
for g in range(len(gamma_interval)):
    classifier = svm.SVC(C=1e4,gamma=gamma_interval[g], kernel='rbf')
    for i in range(10):
        classifier.fit(x_train_folds[i], y_train_folds[i])
        # Train prediction and accuracy
        y_pred = classifier.predict(singular_x_folds[i])
        train_accuracy_rbf[g][i] = np.sum(y_pred==singular_y_folds[i]) / len(singular_y_folds[i]) * 100

In [29]:
# Find optimal gamma value
training_mean_accuracy_rbf = np.mean(train_accuracy_rbf, axis=1)
optimal_gamma = gamma_interval[np.argmax(training_mean_accuracy_rbf)]

In [30]:
# Test prediction and accuracy
classifier = svm.SVC(C=1e4, gamma=optimal_gamma, kernel='linear')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
test_accuracy = np.sum(y_pred==y_test) / len(y_test) * 100
print("Test accuracy rbf = " + str(test_accuracy))

Test accuracy rbf = 70.54470100651274


In [31]:
# Confusion Matrix values
tp = int(np.sum(y_pred * y_test))
tn = int(np.sum(y_pred+y_test==0))
fp = int(np.sum(y_pred-y_test==1))
fn = int(np.sum(y_test-y_pred==1))

print("True Positive: " + str(tp))
print("True Negative: " + str(tn))
print("False Positive: " + str(fp))
print("False Negative: " + str(fn))
print("Total = " + str(tp+fp+fn+tn))

print()
print("Precision: " + str(tp/(tp+fp)))
print("Recall: " + str(tp/(tp+fn)))
print("NPV: " + str(tn/(tn+fn)))
print("FPR: " + str(fp/(fp+tn)))
print("FDR: " + str(fp/(tp+fp)))
print("F1: " + str((2*tp/(tp+fp))/(tp/(tp+fp)+tp/(tp+fn))))
print("F2: " + str((5*tp/(tp+fp))/(4*tp/(tp+fp)+tp/(tp+fn))))

True Positive: 843
True Negative: 1540
False Positive: 73
False Negative: 922
Total = 3378

Precision: 0.9203056768558951
Recall: 0.4776203966005666
NPV: 0.6255077173030057
FPR: 0.04525728456292622
FDR: 0.07969432314410481
F1: 1.3166728832525176
F2: 1.1064443329989968
