In [1]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import os
import seaborn
seaborn.set()

In [2]:
# Load the encoded feature array from disk; it is passed as X to
# train_test_split and cross_val_score further down.
encoded = np.load("./npy_data/data_encoded_d.npy")

In [3]:
# Load the target array that is paired with `encoded` (passed as y to
# train_test_split and cross_val_score further down).
# NOTE(review): the recorded y_train output shows only 0/1 values, so this
# looks like a binary label vector -- confirm against the data producer.
leftEyeList = np.load("./npy_data/leftEyeList.npy")

In [4]:
# Single hold-out experiment: one split, one fit, to get a first impression.

# Keep 20% of the samples aside for evaluation; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    leftEyeList,
    test_size=0.2,
    random_state=2,
)

# L1-penalised linear SVM with class re-weighting.
svc_test = LinearSVC(
    penalty='l1',
    class_weight='balanced',
    C=0.06,
    dual=False,
    verbose=1,
    max_iter=1000,
)

# Left as the final bare expression so the notebook displays the fitted
# estimator's repr.
svc_test.fit(X_train, y_train)

[LibLinear]

LinearSVC(C=0.06, class_weight='balanced', dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=1)

In [5]:
# Sanity check: show the array shapes of the training and test partitions.
print(X_train.shape, X_test.shape)

((48, 26241110), (13, 26241110))


In [6]:
# Predict labels for the held-out partition with the fitted model.
y_pred = svc_test.predict(X_test)
# Accuracy = fraction of held-out samples whose prediction matches y_test.
# NOTE(review): the recorded shapes show only 13 held-out samples, so this
# single number is a very coarse estimate.
score = accuracy_score(y_test, y_pred)
print(score)

1.0


In [7]:
# Display the training labels produced by the split.
y_train

array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1])

In [8]:
# Persist the fitted classifier so it can be reloaded without refitting.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Left as the final bare expression so the notebook displays the list of
# written file names.
joblib.dump(svc_test, "svc.pkl")

['svc.pkl']

In [9]:
# Retrieve all the non-zero coefficients of the fitted model and pair each
# one with its feature index.
coef_row = svc_test.coef_[0]
nonzeroes = np.nonzero(coef_row)[0]

# Order the (index, coefficient) pairs by coefficient value, largest first.
# sorted() is used rather than zip(...).sort() because zip() only returns a
# list on Python 2; on Python 3 it is an iterator with no .sort() method.
coefs = sorted(
    zip(nonzeroes, coef_row[nonzeroes]),
    key=lambda pair: pair[1],
    reverse=True,
)

for coef in coefs:
    # Call form of print: same tuple output on Python 2, valid on Python 3.
    print(coef)

(20431281, 0.6158805814769369)
(6125509, -0.009057836796319638)
(20431280, -0.49564004547313695)


In [None]:
# Grid search over the regularisation strength C: five values spaced
# logarithmically from 1e-2 to 1e1, each scored by 10-fold cross-validated
# accuracy on the full data set.
crange = np.logspace(-2, 1, 5).tolist()
means = []   # mean fold accuracy per C value
stds = []    # standard deviation of fold accuracy per C value
for Cval in crange:
    # Deliberately NOT named `svc_test`: rebinding that name would replace the
    # fitted hold-out model with an unfitted estimator and break re-running
    # the prediction / coefficient cells that read it.
    svc_cv = LinearSVC(penalty='l1', class_weight='balanced', C=Cval,
                       dual=False, verbose=1, max_iter=1000)
    cv_score = cross_val_score(svc_cv, encoded, leftEyeList, cv=10, scoring='accuracy')

    # Compute the fold statistics once and reuse them for storage and logging.
    fold_mean = cv_score.mean()
    fold_std = cv_score.std()
    means.append(fold_mean)
    stds.append(fold_std)

    # Call form of print with a single formatted string works on both
    # Python 2 and Python 3 and keeps the original space-separated layout.
    print("c: {} mean: {} std: {}".format(Cval, fold_mean, fold_std))

In [None]:
# Plot the mean cross-validated accuracy for each C value, with the standard
# deviation across folds drawn as error bars (markers only, no joining line).
lines = plt.errorbar(crange, means, stds, marker='o', linewidth=0, elinewidth=1, capsize=20)
plt.xscale("log")
plt.title("Grid Search Cross Validation Results")
plt.setp(lines, color='r')
plt.xlabel("C value")
plt.ylabel("Accuracy")

# Label each point with its mean accuracy (2 decimals), slightly above it.
for a, b in zip(crange, means):
    plt.text(a, b+.02, np.around(b, decimals=2))

# lines[1] is the collection of error-bar cap artists returned by errorbar();
# restyle them so the caps are thin and small.
for cap in lines[1]:
    cap.set_markeredgewidth(1)
    cap.set_markersize(5)

plt.tight_layout()
plt.gcf().subplots_adjust(right=1.3)

# Save BEFORE show(): with the inline backend show() renders and closes the
# figure, so a savefig() issued afterwards writes a new, empty canvas.
# bbox_inches='tight' is needed because right=1.3 pushes the axes beyond the
# nominal figure area, which would otherwise be cropped in the saved file.
plt.savefig('grid_search_results.png', dpi=300, bbox_inches='tight')
plt.show()