In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
import sklearn

In [2]:
def printCoefs(classifier):
    # retrieve all the nonzero coefficients and zip them with their respective indices
    nonzeroes = np.nonzero(classifier.coef_[0])[0]
    coefs = zip(nonzeroes, classifier.coef_[0][nonzeroes])

    # sort the coefficients by their value, instead of index
    coefs.sort(key = lambda x: x[1], reverse=True)

    for coef in coefs[:50]:
        print coef

In [3]:
encoded = np.load("./npy_data/data_encoded_d.npy")

In [4]:
blood_types = np.load('./npy_data/blood_types.npy')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(encoded, blood_types, test_size=0.2, random_state=2)

In [69]:
sgd = SGDClassifier(penalty='l1', 
                    alpha=.2,
                    l1_ratio=1,
                    shuffle=True,
                    warm_start=True,
                    learning_rate='optimal', 
                    tol=1e-6,
                    max_iter=200,
                    verbose=1)

In [76]:
for i in range(2000):
    sgd.partial_fit(X_train, y_train, classes=np.unique(blood_types))
    randomized = np.arange(X_train.shape[0])
    np.random.shuffle(randomized)
    X_train = X_train[randomized]
    y_train = y_train[randomized]
    y_pred = sgd.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {}%".format(acc * 100))
    if acc == 1.0 and np.nonzero(sgd.coef_[0])[0].shape[0] <= 5:
        print("100% accuracy achieved with less than 10 nonzeros. Stopping after {} epochs".format(i))
        break

-- Epoch 1
Norm: 1099.19, NNZs: 69, Bias: -0.084780, T: 63, Avg. loss: 8450.261111
Total training time: 16.60 seconds.
Accuracy: 93.75%
-- Epoch 1
Norm: 635.91, NNZs: 25, Bias: -0.116245, T: 63, Avg. loss: 3647.409367
Total training time: 15.85 seconds.
Accuracy: 100.0%
-- Epoch 1
Norm: 479.66, NNZs: 56, Bias: -0.117337, T: 63, Avg. loss: 2333.615045
Total training time: 15.82 seconds.
Accuracy: 87.5%
-- Epoch 1
Norm: 274.97, NNZs: 5, Bias: -0.152147, T: 63, Avg. loss: 0.102106
Total training time: 13.21 seconds.
Accuracy: 100.0%
100% accuracy achieved with less than 10 nonzeros. Stopping after 3 epochs


In [77]:
y_pred = sgd.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


In [78]:
np.bincount(y_test)

array([11,  5])

In [79]:
print(zip(y_test, y_pred))

[(0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1)]


In [80]:
printCoefs(sgd)

(14151619, 0.49851491684458876)
(14151706, 0.3392241038972311)
(14151629, 0.2841179224706821)
(14151760, -0.14579304975023014)
(14151618, -0.7066609728496123)
