In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split

import numpy as np 
import pickle

import time
# PCA component counts swept by the experiment loop; each entry gets its own
# PCA fit followed by a classifier fit on the projected data.
n_components_iteration = [2, 5, 25, 48, 50, 75, 100]

In [2]:
def build_pca(n_components, X_train):
    """Create a PCA model and fit it on the given data.

    Args:
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
        X_train: Data the PCA is fitted on.

    Returns:
        The fitted ``PCA`` instance.
    """
    # fit() hands back the estimator itself, so build-and-fit is a single expression.
    return PCA(n_components=n_components).fit(X_train)

In [3]:
def transform_and_normalize(data, clf):
    """Project ``data`` through ``clf`` and normalize the projection.

    Args:
        data: Samples to project.
        clf: Any object exposing ``transform`` (this notebook passes a fitted PCA).

    Returns:
        The result of ``normalize`` (called with its default settings) applied
        to ``clf.transform(data)``.
    """
    projected = clf.transform(data)
    return normalize(projected)

In [4]:
def build_svm(data, labels):
    """Fit a ``LinearSVR`` model on ``data`` / ``labels``.

    Note that, despite the function name, the estimator is a regressor
    (``svm.LinearSVR``), created with ``random_state=0`` and ``tol=1e-5``.

    Args:
        data: Training samples.
        labels: Targets matching ``data``.

    Returns:
        The fitted ``LinearSVR`` instance.
    """
    regressor = svm.LinearSVR(tol=1e-5, random_state=0)
    return regressor.fit(data, labels)

In [5]:
def build_svc(data, labels):
    """Fit an ``SVC`` classifier (``gamma='scale'``) on ``data`` / ``labels``.

    Args:
        data: Training samples.
        labels: Class labels matching ``data``.

    Returns:
        The fitted ``SVC`` instance.
    """
    classifier = svm.SVC(gamma='scale')
    return classifier.fit(data, labels)

In [6]:
def test_svm(clf, test_data):
    y_pred = clf.predict(test_data)
    y_pred_bin = [0 if i < 0.5 else 1 for i in y_pred]
    return y_pred_bin

In [7]:
def save_all(svc=False):
    """Pickle the sweep results held in the notebook's global dictionaries.

    One file is written per dictionary under ``./pca_data/``, named
    ``<dictionary name>.pickle`` (or ``<dictionary name>_svc.pickle``):
    ``pca_classifiers``, ``svm_classifiers``,
    ``X_train_transformed_normalized`` and ``X_test_transformed_normalized``.
    The output directory is created when it does not exist yet.

    Args:
        svc: When true, ``_svc`` is appended to every file stem.

    Raises:
        NameError: If one of the four global dictionaries is not defined yet.
    """
    import os  # local import: the notebook's import cell does not provide os

    # open(..., 'wb') does not create missing parent directories, so make sure
    # the target directory is there before writing anything.
    os.makedirs('./pca_data', exist_ok=True)

    suffix = '_svc' if svc else ''
    # Resolving all four globals up front means a missing one fails before any
    # file has been written.
    artifacts = {
        'pca_classifiers': pca_classifiers,
        'svm_classifiers': svm_classifiers,
        'X_train_transformed_normalized': X_train_transformed_normalized,
        'X_test_transformed_normalized': X_test_transformed_normalized,
    }
    for stem, payload in artifacts.items():
        with open('./pca_data/' + stem + suffix + '.pickle', 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [10]:
# Fixed seed so both subsampling splits below are reproducible across kernel restarts.
SPLIT_SEED = 0

X_train = np.load('./data/X_train.npy')
X_test = np.load('./data/X_test.npy')

# NOTE: despite the .txt extension these label files are read as pickles.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open("train_labels.txt", "rb") as fp:
    train_labels = pickle.load(fp)
with open("test_labels.txt", "rb") as fp:
    test_labels = pickle.load(fp)

# Keep 50000 training samples; the remainder of the training pool is kept as
# the hold-out X_cv / cv_labels.
X_train, X_cv, train_labels, cv_labels = train_test_split(
    X_train, train_labels, train_size=50000, random_state=SPLIT_SEED)

# Keep 10000 test samples. The leftover gets its own names so it no longer
# overwrites X_cv / cv_labels from the training split above.
# NOTE(review): assumes X_cv was meant to come from the training pool - confirm.
X_test, X_test_rest, test_labels, test_labels_rest = train_test_split(
    X_test, test_labels, train_size=10000, random_state=SPLIT_SEED)



In [11]:
# Result containers, keyed by the component count rendered as a string.
pca_classifiers = {}
svm_classifiers = {}
X_train_transformed_normalized = {}
X_test_transformed_normalized = {}

for i in n_components_iteration:
    start = time.time()
    print(i)

    # Step 1: fit PCA on the training split.
    pca_clf = build_pca(i, X_train)
    pca_time = time.time()
    print(f"PCA fit time for {i} components: {pca_time - start}")

    # Step 2: project both splits with the fitted PCA and normalize them.
    X_train_trans_norm = transform_and_normalize(X_train, pca_clf)
    X_test_trans_norm = transform_and_normalize(X_test, pca_clf)
    trans_norm_time = time.time()
    print(f"Transform and Normalization time for {i} components: {trans_norm_time - pca_time}")

    # Step 3: train the SVC on the projected training data.
    svm_clf = build_svc(X_train_trans_norm, train_labels)
    svm_time = time.time()
    print(f"SVM time for {i} components: {svm_time - trans_norm_time}")

    # Step 4: remember models and projections for this component count.
    # The "Save time" below only covers these dictionary stores; nothing is
    # written to disk inside the loop.
    key = str(i)
    pca_classifiers[key] = pca_clf
    svm_classifiers[key] = svm_clf
    X_train_transformed_normalized[key] = X_train_trans_norm
    X_test_transformed_normalized[key] = X_test_trans_norm
    save_time = time.time()
    print(f"Save time for {i} components: {save_time - svm_time}")

    # Step 5: predict on the test split and report accuracy.
    y_pred_bin = test_svm(svm_clf, X_test_trans_norm)
    test_time = time.time()
    print(f"Test time for {i} components: {test_time - save_time}")

    print(f"--- Accuracy for {i} components: {metrics.accuracy_score(test_labels, y_pred_bin)}")
    end = time.time()
    print(f"time for {i} components: {end - start}")
    print("=============")
# Writing the dictionaries to disk is done by the separate save_all(svc=True) cell.

2
PCA fit time for 2 components: 0.23015260696411133
Transform and Normalization time for 2 components: 0.01818251609802246
SVM time for 2 components: 129.164320230484
Save time for 2 components: 0.00018095970153808594
Test time for 2 components: 13.841394186019897
--- Accuracy for 2 components: 0.5193
time for 2 components: 143.26092839241028
5
PCA fit time for 5 components: 0.23808932304382324
Transform and Normalization time for 5 components: 0.01944112777709961
SVM time for 5 components: 120.08865857124329
Save time for 5 components: 0.000179290771484375
Test time for 5 components: 15.445564985275269
--- Accuracy for 5 components: 0.5287
time for 5 components: 135.79828524589539
25
PCA fit time for 25 components: 0.3604600429534912
Transform and Normalization time for 25 components: 0.028342485427856445
SVM time for 25 components: 233.298485994339
Save time for 25 components: 0.00018024444580078125
Test time for 25 components: 28.547765731811523
--- Accuracy for 25 components: 0.52

In [12]:
# Pickle the four result dictionaries filled by the sweep; svc=True selects the
# '_svc' file names under ./pca_data/.
save_all(svc=True)