In [1]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import os
import seaborn
from sklearn.linear_model import SGDClassifier
seaborn.set()

In [2]:
def printCoefs(classifier):
    # retrieve all the nonzero coefficients and zip them with their respective indices
    nonzeroes = np.nonzero(classifier.coef_[0])[0]
    coefs = zip(nonzeroes, classifier.coef_[0][nonzeroes])

    # sort the coefficients by their value, instead of index
    coefs.sort(key = lambda x: x[1], reverse=True)

    for coef in coefs[:50]:
        print coef

In [3]:
encoded = np.load("./npy_data/data_encoded_d.npy")

In [99]:
encoded.shape

(79, 25019446)

In [4]:
blood_types = np.load('./npy_data/blood_types.npy')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(encoded, blood_types, test_size=0.2)

In [6]:
encoded_mapped = np.memmap('./npy_data/encoded_mapped', mode='r', shape=(79, 25019446), dtype=np.float64)
X_train_, X_test_, y_train_, y_test_ = train_test_split(encoded_mapped, blood_types, test_size=0.2)

In [7]:
def randomize_arrays(arr1, arr2):
    randomize = np.arange(arr1.shape[0])
    np.random.shuffle(randomize)
    return arr1[randomize], arr2[randomize]

In [86]:
sgd = SGDClassifier(penalty='l1', 
                    class_weight={0: .85, 1: 1.25},#'balanced', 
                    alpha=.1,
                    l1_ratio=1,
                    learning_rate='optimal', 
                    tol=1e-6,
                    max_iter=200,
                    verbose=1)

In [None]:
iter_size = 20
X_rand, y_rand = randomize_arrays(X_train, y_train)
sgd = SGDClassifier(penalty='l1', 
                    #class_weight={0: .85, 1: 1.25},#'balanced', 
                    alpha=.1,
                    l1_ratio=.6,
                    learning_rate='constant', 
                    tol=1e-6,
                    max_iter=200,
                    eta0=1e-3,
                    verbose=1)

for _ in range(10):
    X_rand, y_rand = randomize_arrays(X_train, y_train)
    for i in range(0, y_rand.shape[0], iter_size):
        final = i + iter_size
        if final > len(y_rand):
            final = len(y_rand) - 1
        x_iter = X_rand[i:final]
        y_iter = y_rand[i:final]
        #print(y_iter)

        class_weights = compute_class_weight('balanced', np.unique(y_iter), y_iter)
        sample_weights = np.copy(y_iter).astype(np.float64)
        if len(class_weights) != 1:
            sample_weights[np.argwhere(sample_weights == 1)] = sample_weights[np.argwhere(sample_weights == 1)] * class_weights[1]
            sample_weights[np.argwhere(sample_weights == 0)] = sample_weights[np.argwhere(sample_weights == 0)] + class_weights[0]
        #print(sample_weights)
        for j in range(3):
            randomize = np.arange(i, final) - i
            np.random.shuffle(randomize)
            x_iter = x_iter[randomize]
            y_iter = y_iter[randomize]
            sgd.partial_fit(x_iter, y_iter, classes=[0, 1], sample_weight=sample_weights)

        print("Finished iteration: {}".format(i / iter_size + 1))
        y_pred = sgd.predict(X_test)
        print("Predicted: {}".format(y_pred))
        print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

-- Epoch 1
Norm: 9.29, NNZs: 1899153, Bias: -0.000110, T: 20, Avg. loss: 1070.016670
Total training time: 6.12 seconds.
-- Epoch 1
Norm: 3.13, NNZs: 85867, Bias: -0.000110, T: 20, Avg. loss: 171.422966
Total training time: 3.65 seconds.
-- Epoch 1
Norm: 8.33, NNZs: 939109, Bias: -0.000110, T: 20, Avg. loss: 150.424334
Total training time: 4.62 seconds.
Finished iteration: 1
Predicted: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.75
-- Epoch 1
Norm: 4.81, NNZs: 167096, Bias: -0.000943, T: 20, Avg. loss: 536.468115
Total training time: 4.54 seconds.
-- Epoch 1
Norm: 9.17, NNZs: 2809467, Bias: 0.000307, T: 20, Avg. loss: 614.968095
Total training time: 5.77 seconds.
-- Epoch 1
Norm: 6.12, NNZs: 798153, Bias: 0.000307, T: 20, Avg. loss: 345.799045
Total training time: 4.64 seconds.
Finished iteration: 2
Predicted: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Accuracy: 0.25
-- Epoch 1
Norm: 9.48, NNZs: 2221046, Bias: -0.000602, T: 20, Avg. loss: 420.406240
Total training time: 5.34 seconds.
-- Epoch 

Predicted: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.75
-- Epoch 1
Norm: 2.98, NNZs: 4990068, Bias: 0.001093, T: 2, Avg. loss: 4431.521191
Total training time: 1.08 seconds.
-- Epoch 1
Norm: 2.25, NNZs: 4735822, Bias: 0.001093, T: 2, Avg. loss: 0.000000
Total training time: 0.82 seconds.
-- Epoch 1
Norm: 1.89, NNZs: 4465512, Bias: 0.001093, T: 2, Avg. loss: 0.000000
Total training time: 0.74 seconds.
Finished iteration: 4
Predicted: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.75
-- Epoch 1
Norm: 10.85, NNZs: 2323347, Bias: 0.000891, T: 20, Avg. loss: 680.669200
Total training time: 6.21 seconds.
-- Epoch 1
Norm: 7.12, NNZs: 1666430, Bias: 0.001800, T: 20, Avg. loss: 288.696069
Total training time: 4.52 seconds.
-- Epoch 1
Norm: 9.45, NNZs: 1434854, Bias: 0.002305, T: 20, Avg. loss: 521.036183
Total training time: 6.19 seconds.
Finished iteration: 1
Predicted: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Accuracy: 0.25
-- Epoch 1
Norm: 10.54, NNZs: 2184182, Bias: 0.001591, T: 20, Avg. loss: 

In [None]:
y_pred = sgd.predict(X_test_)
print(accuracy_score(y_test_, y_pred))

In [92]:
printCoefs(sgd)

(14151706, 2.9469509471640825)
(14151619, 2.868372286699221)
(14151629, 2.868372286699221)
(14151726, 2.0566763369488057)
(23878582, 1.6227016884614516)
(23878585, 1.6227016884614516)
(23878615, 1.6227016884614516)
(23878836, 1.6227016884614516)
(23878871, 1.6227016884614516)
(23878883, 1.6227016884614516)
(21912340, 1.5646148404011448)
(2178373, 1.5190400629885692)
(14151717, 1.3543714928740722)
(19386118, 1.3503082831677589)
(24213205, 1.3397960172012353)
(23879345, 1.276916884019239)
(23879353, 1.276916884019239)
(23879356, 1.276916884019239)
(23879419, 1.276916884019239)
(14151761, 1.2566326723135641)
(2449065, 1.24261948987557)
(23878777, 1.2214605943415713)
(17651383, 1.1715374444304052)
(9221423, 1.1535277219285547)
(2449109, 1.1186419806692816)
(23252907, 1.1007233713065847)
(23879637, 1.096717852771617)
(15363986, 1.0827165466301683)
(24213156, 1.081031007236472)
(1199524, 1.069189938409855)
(1199533, 1.069189938409855)
(1199537, 1.069189938409855)
(9221231, 1.0440445738771011