In [7]:
import numpy as np

X_train = np.loadtxt("Data/X_train_binary.csv", delimiter=",")
X_test = np.loadtxt("Data/X_test_binary.csv", delimiter=",")

y_train = np.loadtxt("Data/y_train1-1.csv", delimiter=",")
y_test = np.loadtxt("Data/y_test1-1.csv", delimiter=",")

In [8]:
print(f"Shape of training data: {X_train.shape}")
print(f"Shape of test data: {X_test.shape}\n")

freq = sum(y_train)
LEN = X_train.shape[0]
print(f"Class frequencies: 1 ({(LEN/2 + freq)/LEN}), -1 ({(LEN/2 - freq)/LEN})")

Shape of training data: (150, 61)
Shape of test data: (164, 61)

Class frequencies: 1 (0.5933333333333334), -1 (0.4066666666666667)


## Normalization
We obtain the mean and variance of each feature/column of the training data and normalize to 0 mean and 1 variance.

In [16]:
def normalize(data, mu, var):
    return (data - mu)/var

X_train_mu = X_train.mean(axis=0)
X_train_std = X_train.std(axis=0)

X_train_norm = normalize(X_train, X_train_mu, X_train_std)
X_test_norm = normalize(X_test, X_train_mu, X_train_std)

Mean and variance of "normalized" test set.

In [12]:
import pandas as pd

X_test_mu_var = np.append(
    X_test_norm.mean(axis=0)[:, np.newaxis], 
    X_test_norm.std(axis=0)[:, np.newaxis], axis=1)

pd.DataFrame(X_test_mu_var, columns=["mean", "variance"])

Unnamed: 0,mean,variance
0,0.090449,1.388890
1,0.165786,2.697616
2,-0.063152,0.886257
3,-0.080217,0.860856
4,-0.037857,0.924941
...,...,...
56,0.137408,1.033535
57,0.136607,1.101414
58,0.041499,1.318110
59,-0.010897,1.367841


## Auxilliary function for 5-fold cross validation

In [5]:
def split(X, y, folds=5):
    pct = len(X) / folds
    
    retval = []
    for i in range(folds):
        indices = np.arange(i*pct, (i+1)*pct, dtype=int)
        
        X_train, y_train = np.delete(X, indices, axis=0), np.delete(y, indices)
        X_val, y_val = X[indices], y[indices]
        
        retval.append([X_train, y_train, X_val, y_val])
        
    return np.array(retval, dtype=object)

# SVM Classifier
svm_clf = SVC(C=1, kernel='rbf', gamma=float)

In [6]:
from sklearn.svm import SVC

C = [0.01, 0.1, 1, 10, 50, 100, 1000]
gamma = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

splits = split(X_train_norm, y_train)

for c in C:
    for g in gamma:
        print(f"C={c}, gamma={g}")
        auc_ = []
        for i, (X, y, X_val, y_val) in enumerate(splits):
            svm_clf = SVC(C=c, kernel='rbf', gamma=g)
            svm_clf.fit(X, y)
            
            auc = svm_clf.score(X_val, y_val)
            auc_.append(auc)
            print("FOLD {} - 0-1 Loss: {:.2f}".format((i + 1),(1 - auc)))
            
        print(f"\n MEAN LOSS: {1 - np.mean(auc_)}\n")
        

C=0.01, gamma=0.0001
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

 MEAN LOSS: 0.45333333333333337

C=0.01, gamma=0.001
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

 MEAN LOSS: 0.45333333333333337

C=0.01, gamma=0.01
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

 MEAN LOSS: 0.45333333333333337

C=0.01, gamma=0.1
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

 MEAN LOSS: 0.45333333333333337

C=0.01, gamma=1
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

 MEAN LOSS: 0.45333333333333337

C=0.01, gamma=10
FOLD 1 - 0-1 Loss: 0.40
FOLD 2 - 0-1 Loss: 0.57
FOLD 3 - 0-1 Loss: 0.30
FOLD 4 - 0-1 Loss: 0.43
FOLD 5 - 0-1 Loss: 0.57

In [7]:
C=50
gamma=0.0001

svm_clf = SVC(C=C, kernel='rbf', gamma=gamma)
svm_clf.fit(X_train_norm, y_train)
test_auc = svm_clf.score(X_test_norm, y_test)

print(f"TEST DATA - 0-1 Loss: {1 - test_auc}")

TEST DATA - 0-1 Loss: 0.2195121951219512


In [9]:
len(svm_clf.support_vectors_)

75