In [1]:
from sklearn.svm import SVC, LinearSVC
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import os
import seaborn
seaborn.set()

In [11]:
def printCoefs(classifier, top_n=50):
    """Print the largest nonzero coefficients of a fitted linear classifier.

    Only the first row of ``classifier.coef_`` is inspected. Each printed line
    is an ``(index, coefficient)`` tuple; lines are ordered by coefficient
    value, largest first.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing a 2-D ``coef_`` array.
    top_n : int, optional
        Maximum number of pairs to print. Defaults to 50.
    """
    weights = classifier.coef_[0]

    # retrieve the indices of all the nonzero coefficients
    nonzeroes = np.nonzero(weights)[0]

    # Pair every index with its coefficient and sort by value, not by index.
    # sorted() is used because zip() returns an iterator without .sort() on
    # Python 3. Native int/float keep the printed tuples independent of the
    # installed numpy version's scalar repr.
    coefs = sorted(
        ((int(idx), float(weights[idx])) for idx in nonzeroes),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for coef in coefs[:top_n]:
        print(coef)

In [2]:
# Feature array; passed as X to train_test_split and cross_val_score in later cells.
encoded = np.load("./npy_data/data_encoded_d.npy")

In [3]:
# Target labels; passed as y to train_test_split and cross_val_score in later cells.
blood_types = np.load('./npy_data/blood_types.npy')

In [4]:
# Hold out 20% of the samples for testing. random_state pins the shuffle so
# the split, and every metric computed from it, is reproducible across kernel
# restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    encoded, blood_types, test_size=0.2, random_state=42
)

In [6]:
# L1-regularised linear SVM; C is the regularization parameter.
classifier = LinearSVC(
    penalty='l1',
    dual=False,  # LinearSVC does not support the l1 penalty with dual=True
    C=.06,
    class_weight='balanced',
    verbose=1,
)
# fit() returns the estimator itself, so `svc` and `classifier` are the same object.
svc = classifier.fit(X_train, y_train)

[LibLinear]

In [7]:
# Predict labels for the held-out samples and report the fraction that match.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9375


In [8]:
# Display the labels predicted for X_test, for comparison with y_test.
y_pred

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0])

In [9]:
# Display the true held-out labels, for comparison with y_pred.
y_test

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0])

In [12]:
# Print the fitted model's nonzero coefficients as (index, value) pairs, largest value first.
printCoefs(svc)

(14151619, 0.722222222222222)
(14151618, -0.7076023391812866)


In [None]:
# 10-fold cross-validated accuracy of an L1-penalised linear SVM on the full
# data set; prints the mean and the standard deviation of the fold scores.
# NOTE(review): C=0.02 here differs from the C=.06 used for `classifier` -- confirm this is intended.
svc_test = LinearSVC(
    C=0.02,
    penalty="l1",
    dual=False,
    class_weight='balanced',
)
cv_scores = cross_val_score(
    svc_test,
    encoded,
    blood_types,
    scoring='accuracy',
    cv=10,
)
print(cv_scores.mean())
print(cv_scores.std())

In [None]:
# Grid search over C and class_weight using dask-ml's GridSearchCV.
# Doesn't work without a significant amount of RAM.
# The import stays in this cell so the rest of the notebook does not require dask-ml.
from dask_ml.model_selection import GridSearchCV

model = LinearSVC(penalty='l1', dual=False, max_iter=1000, verbose=1)

# 10 log-spaced candidate values for C, from 1e-2 to 1e1
crange = np.logspace(-2, 1, 10).tolist()
param_space = dict(C=crange, class_weight=[None, 'balanced'])

search = GridSearchCV(model, param_space, cv=5, n_jobs=5)
search.fit(encoded, blood_types)
print(search.best_params_)