In [1]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import os
import seaborn
seaborn.set()

In [2]:
# Load the encoded feature array from disk; this is the feature matrix that
# the train_test_split / cross_val_score calls in this notebook consume.
# NOTE(review): presumably one row per sample — confirm shape against leftEyeList.
encoded = np.load("./npy_data/data_encoded_d.npy")

In [3]:
# Load the target array that is paired with `encoded` in every fit in this
# notebook.
# NOTE(review): the name suggests per-sample left-eye labels and the displayed
# y_train contains only 0/1 values — confirm the label meaning with the data owner.
leftEyeList = np.load("./npy_data/leftEyeList.npy")

In [5]:
# Single hold-out experiment with class_weight='balanced':
# split 80/20 with a fixed seed, fit an L1-penalised linear SVM on the
# training part and print the accuracy measured on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    leftEyeList,
    test_size=0.2,
    random_state=2,
)

svc_test = LinearSVC(
    penalty='l1',
    class_weight='balanced',
    C=.06,
    dual=False,  # LinearSVC requires the primal formulation for penalty='l1'
    verbose=1,
    max_iter=1000,
)
svc_test.fit(X_train, y_train)

y_pred = svc_test.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

[LibLinear]0.875


In [4]:
# Single hold-out experiment WITHOUT class re-weighting (class_weight=None):
# same 80/20 split and seed as the balanced run, so the two accuracies are
# measured on the same held-out samples.
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    leftEyeList,
    test_size=0.2,
    random_state=2,
)

svc_test = LinearSVC(
    penalty='l1',
    class_weight=None,
    C=.06,
    dual=False,  # LinearSVC requires the primal formulation for penalty='l1'
    verbose=1,
    max_iter=1000,
)
svc_test.fit(X_train, y_train)

y_pred = svc_test.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

[LibLinear]0.9375


In [7]:
# Show where the fitted weight matrix is non-zero, as a
# (row_indices, column_indices) tuple of index arrays.
# NOTE(review): the execution counts (In [4] ran before In [5]) suggest that
# `svc_test` was the class_weight='balanced' model when the stored output was
# produced, whereas a top-to-bottom run would use the class_weight=None model
# defined just above — confirm with Restart & Run All.
np.nonzero(svc_test.coef_)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([  893330,  2147838,  5467982,  6214939,  8580740, 10654598,
        12110356, 12752822, 13235098, 13933181, 14043795, 14379751,
        15438902, 18635721, 20674455, 24884118, 25117658]))

In [None]:
# Hold-out fit with class_weight=None. This cell repeats the configuration of
# the other class_weight=None cell in this notebook (same split, same seed,
# same hyper-parameters) and rebinds svc_test / y_pred / score.
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    leftEyeList,
    test_size=0.2,
    random_state=2,
)

svc_test = LinearSVC(
    penalty='l1',
    class_weight=None,
    C=.06,
    dual=False,  # LinearSVC requires the primal formulation for penalty='l1'
    verbose=1,
    max_iter=1000,
)
svc_test.fit(X_train, y_train)

y_pred = svc_test.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

In [8]:
# Display the training labels produced by the most recent train_test_split
# call, to eyeball the class mix of the training subset.
y_train

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1])

In [9]:
# Persist the current `svc_test` estimator to svc.pkl so it can be reloaded
# later without refitting.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installations where `joblib` is not importable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump returns the list of file names it wrote; as the last expression
# of the cell that list is shown as the cell output.
joblib.dump(svc_test, "svc.pkl")

['svc.pkl']

In [10]:
# retrieve all the nonzero coefficients and zip them with their respective indices
nonzeroes = np.nonzero(svc_test.coef_[0])[0]
coefs = zip(nonzeroes, svc_test.coef_[0][nonzeroes])

# sort the coefficients by their value, instead of index
coefs.sort(key = lambda x: x[1], reverse=True)

for coef in coefs:
    print coef

(20674455, 0.5919716238171446)
(5467982, 0.08607034243323428)
(13933181, 0.0520436158134867)
(13235098, 0.04390505706562038)
(24884118, 0.038514661882012365)
(12110356, 0.02393485511986009)
(10654598, 0.02001787436042732)
(18635721, 0.0040016483120448665)
(893330, -0.01879369153441716)
(25117658, -0.022580767904878857)
(6214939, -0.03070315360121059)
(8580740, -0.0634672561462652)
(12752822, -0.11724411303521072)
(2147838, -0.11800596215033615)
(15438902, -0.1252833349384073)
(14379751, -0.13125359246731855)
(14043795, -0.16198974801925214)


In [None]:
# Grid search over the regularisation strength C: for each of 5 values spaced
# logarithmically between 1e-2 and 1e1, run 10-fold cross-validation of the
# class-balanced L1 linear SVM and record the mean and standard deviation of
# the fold accuracies. `crange`, `means` and `stds` feed the plot cell.
crange = np.logspace(-2, 1, 5).tolist()
means = []
stds = []
for Cval in crange:
    # NOTE(review): this rebinds `svc_test`, replacing the model fitted by the
    # single-fit cells; re-run one of those before inspecting coef_ again.
    svc_test = LinearSVC(penalty='l1', class_weight='balanced', C=Cval,
                         dual=False, verbose=1, max_iter=1000)
    cv_score = cross_val_score(svc_test, encoded, leftEyeList, cv=10, scoring='accuracy')
    means.append(cv_score.mean())
    stds.append(cv_score.std())
    # One pre-formatted string keeps the output identical on Python 2 and
    # Python 3 (the bare `print a, b` statement is a SyntaxError on Python 3).
    print("c: {} mean: {} std: {}".format(Cval, cv_score.mean(), cv_score.std()))

In [None]:
# Plot the cross-validation results: mean accuracy per C value with error bars
# of one standard deviation, on a logarithmic C axis, and save the figure.
lines = plt.errorbar(crange, means, stds, marker='o', linewidth=0, elinewidth=1, capsize=20)
plt.xscale("log")
plt.title("Grid Search Cross Validation Results")
plt.setp(lines, color='r')
plt.xlabel("C value")
plt.ylabel("Accuracy")

# Annotate each point with its mean accuracy, slightly above the marker.
for a, b in zip(crange, means):
    plt.text(a, b+.02, np.around(b, decimals=2))

# lines[1] holds the cap markers of the error bars; make them thin and small.
for cap in lines[1]:
    cap.set_markeredgewidth(1)
    cap.set_markersize(5)

plt.tight_layout()
plt.gcf().subplots_adjust(right=1.3)

# Save BEFORE show(): with the inline backend, show() renders and closes the
# current figure, so a savefig() issued afterwards writes an empty canvas.
# bbox_inches='tight' is needed because right=1.3 places the axes beyond the
# nominal figure edge, which a plain savefig would crop.
plt.savefig('grid_search_results.png', dpi=300, bbox_inches='tight')
plt.show()