In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
from plotutils import plot_data, plot_surface
from SVM import kernel_svm_train, kernel_svm_predict
from sklearn import preprocessing
from crossval import cross_validate

In [2]:
# Read the data using pandas
df = pd.read_csv('digit-recognition/digit-recognition/train.csv')
df2 = pd.read_csv('digit-recognition/digit-recognition/test.csv')

#turn the training dataframe into numpy array 
label_data_train = df.to_numpy()

#column vector for the labels (38000,)
label_train = label_data_train[:,0] 

# (38000 x 784 matrix for the training data )
data_train = label_data_train[:, 1:] 
print(label_train.shape)
print(data_train.shape)

# Turn testing dataframe into numpy array 
label_data_test = df2.to_numpy()
label_test = label_data_test[:,0]
data_test = label_data_test[:,1:]

masked_train_labels = np.full(label_train.shape, -1)
masked_test_labels = np.full(label_test.shape, -1)

(38000,)
(38000, 784)


In [3]:
c_vals = 10 ** np.linspace(-3, 1, 5)
sigmas = np.linspace(0.1, 1.5, 15)
orders = [2, 3, 4, 5]
norms = ['l1', 'l2', 'max']
num_folds = 4
sets = len(np.unique(label_train))
test_accuracy = np.zeros(sets)

In [17]:
# Run rbf kernel with no cross validation (random value selection)
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    best_params = {}
    best_params['kernel'] = 'rbf'
    best_params['C'] = 1
    best_params['sigma'] = .1

    split_train_data = np.split(data_train, 50)[0]
    split_train_labels = np.split(masked_train_labels, 50)[0]
    
    split_train_data_normalized = preprocessing.normalize(split_train_data, norm='l2')

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1}".format(i, test_accuracy[i]))
    
print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.9
Test accuracy for 1: 0.8885
Test accuracy for 2: 0.8925
Test accuracy for 3: 0.9075
Test accuracy for 4: 0.90275
Test accuracy for 5: 0.90525
Test accuracy for 6: 0.89675
Test accuracy for 7: 0.89775
Test accuracy for 8: 0.90425
Test accuracy for 9: 0.90475
Overall accuracy: 0.9


In [18]:
# Run polynomial kernel with no cross validation (random value selection)
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    best_params = {}
    best_params['kernel'] = 'polynomial'
    best_params['C'] = .2
    best_params['order'] = 5

    split_train_data = np.split(data_train, 50)[0]
    split_train_labels = np.split(masked_train_labels, 50)[0]
    
    split_train_data_normalized = preprocessing.normalize(split_train_data, norm='l1')

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1}".format(i, test_accuracy[i]))
    
print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.94325
Test accuracy for 1: 0.6005
Test accuracy for 2: 0.97325
Test accuracy for 3: 0.929
Test accuracy for 4: 0.7355
Test accuracy for 5: 0.8505
Test accuracy for 6: 0.9025
Test accuracy for 7: 0.81075
Test accuracy for 8: 0.95425
Test accuracy for 9: 0.73225
Overall accuracy: 0.8431750000000001


In [19]:
# Run Linear kernel with no cross validation (random value selection)
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    best_params = {}
    best_params['kernel'] = 'linear'
    best_params['C'] = 1

    split_train_data = np.split(data_train, 50)[0]
    split_train_labels = np.split(masked_train_labels, 50)[0]
    
    split_train_data_normalized = preprocessing.normalize(split_train_data, norm='l1')

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1}".format(i, test_accuracy[i]))
    
print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.8735
Test accuracy for 1: 0.47175
Test accuracy for 2: 0.83975
Test accuracy for 3: 0.68575
Test accuracy for 4: 0.61125
Test accuracy for 5: 0.68275
Test accuracy for 6: 0.65
Test accuracy for 7: 0.77975
Test accuracy for 8: 0.42025
Test accuracy for 9: 0.2545
Overall accuracy: 0.6269250000000001


In [20]:
# Run rbf with CV to determine best values
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    split_train_data = np.split(data_train, 100)[0]
    split_train_labels = np.split(masked_train_labels, 100)[0]
        
    best_params = []
    best_score = 0
    
    for j in range(len(c_vals)):
        for k in range(len(orders)):
            for l in range(len(norms)):

                params = {
                    'kernel': 'rbf',
                    'C': c_vals[j],
                    'sigma': sigmas[k],
                    'norm': norms[l]
                }

                split_train_data_normalized = preprocessing.normalize(split_train_data, params['norm'])

                cv_score, _ = cross_validate(kernel_svm_train, kernel_svm_predict, split_train_data_normalized.T, split_train_labels, num_folds, params)
                if cv_score > best_score:
                    best_score = cv_score
                    best_params = params
               
    split_train_data_normalized = preprocessing.normalize(split_train_data, best_params['norm'])

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1} (with C={2}, sigma={3}, norm={4})".format(i, test_accuracy[i], best_params['C'], best_params['sigma'], best_params['norm']))

print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.9 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 1: 0.8885 (with C=10.0, sigma=0.4, norm=l2)
Test accuracy for 2: 0.8925 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 3: 0.9075 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 4: 0.90275 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 5: 0.90525 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 6: 0.89675 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 7: 0.10225 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 8: 0.90425 (with C=10.0, sigma=0.1, norm=l1)
Test accuracy for 9: 0.90475 (with C=10.0, sigma=0.4, norm=l2)
Overall accuracy: 0.8204500000000001


In [10]:
# Run polynomial with CV to determine best values
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    split_train_data = np.split(data_train, 100)[0]
    split_train_labels = np.split(masked_train_labels, 100)[0]
        
    best_params = []
    best_score = 0
    
    for j in range(len(c_vals)):
        for k in range(len(orders)):
            for l in range(len(norms)):

                params = {
                    'kernel': 'polynomial',
                    'C': c_vals[j],
                    'order': orders[k],
                    'norm': norms[l]
                }

                split_train_data_normalized = preprocessing.normalize(split_train_data, params['norm'])

                cv_score, _ = cross_validate(kernel_svm_train, kernel_svm_predict, split_train_data_normalized.T, split_train_labels, num_folds, params)
                if cv_score > best_score:
                    best_score = cv_score
                    best_params = params
               
    split_train_data_normalized = preprocessing.normalize(split_train_data, best_params['norm'])

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1} (with C={2}, order={3}, norm={4})".format(i, test_accuracy[i], best_params['C'], best_params['order'], best_params['norm']))

print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.84025 (with C=0.1, order=5, norm=l2)
Test accuracy for 1: 0.927 (with C=0.1, order=4, norm=l2)
Test accuracy for 2: 0.95775 (with C=0.1, order=5, norm=l2)
Test accuracy for 3: 0.69225 (with C=1.0, order=2, norm=l2)
Test accuracy for 4: 0.92125 (with C=0.1, order=5, norm=l2)
Test accuracy for 5: 0.94825 (with C=0.001, order=2, norm=max)
Test accuracy for 6: 0.89125 (with C=1.0, order=3, norm=l2)
Test accuracy for 7: 0.966 (with C=0.1, order=5, norm=l2)
Test accuracy for 8: 0.40975 (with C=1.0, order=3, norm=l2)
Test accuracy for 9: 0.6345 (with C=0.1, order=5, norm=l2)
Overall accuracy: 0.818825


In [12]:
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_train)

# Run rbf kernel with no cross validation (random value selection)
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    best_params = {}
    best_params['kernel'] = 'poly'
    best_params['C'] = 1
    best_params['sigma'] = .1

    split_train_data = np.split(X_scaled, 50)[0]
    split_train_labels = np.split(masked_train_labels, 50)[0]
    
#     split_train_data_normalized = preprocessing.normalize(split_train_data, norm='l2')

    model = kernel_svm_train(split_train_data.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    test_accuracy[i] = np.mean(predictions[0] == masked_test_labels)
    print("Test accuracy for {0}: {1}".format(i, test_accuracy[i]))
    
print("Overall accuracy: {0}".format(np.mean(test_accuracy)))

Test accuracy for 0: 0.9
Test accuracy for 1: 0.8885
Test accuracy for 2: 0.8925
Test accuracy for 3: 0.9075
Test accuracy for 4: 0.90275
Test accuracy for 5: 0.90525
Test accuracy for 6: 0.89675
Test accuracy for 7: 0.89775
Test accuracy for 8: 0.90425
Test accuracy for 9: 0.90475
Overall accuracy: 0.9


In [14]:
# Run polynomial kernel with no cross validation (random value selection)
scores = np.zeros((sets,len(label_test)))
for i in range(sets):
    train_indices = np.where(label_train==i)[0]
    test_indices = np.where(label_test==i)[0]

    masked_train_labels = np.full(label_train.shape, -1)
    masked_train_labels[train_indices] = 1

    masked_test_labels = np.full(label_test.shape, -1)
    masked_test_labels[test_indices] = 1

    best_params = {}
    best_params['kernel'] = 'polynomial'
    best_params['C'] = .2
    best_params['order'] = 5

    split_train_data = np.split(data_train, 50)[0]
    split_train_labels = np.split(masked_train_labels, 50)[0]
    
    split_train_data_normalized = preprocessing.normalize(split_train_data, norm='l1')

    model = kernel_svm_train(split_train_data_normalized.T, split_train_labels, best_params)
    predictions = kernel_svm_predict(data_test.T, model)
    scores[i] = predictions[1]

label_choice = np.argmax(scores, axis=0)

test_accuracy = np.mean(label_choice == label_test)
print("Test accuracy: {0}".format(test_accuracy))

scores shape: (10, 4000)
Test accuracy: 0.74025
