# Final Project - SVM Soft Margin Extension - Scikit-learn

In [1]:
import numpy as np
import csv
import math
from numpy import genfromtxt
from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from cvxopt import matrix, solvers
import matplotlib.pyplot as plt
%matplotlib inline

## 1. MNIST

### Scikit-learn implementation 

In [2]:
digits=load_digits()
X = digits.data
y = digits.target

# Scale training features
X_scale = StandardScaler()
X = X_scale.fit_transform(digits.data)

In [3]:
# Assign X and y the subset of data that describe the numbers 8 and 9

new_X = []
new_y = []
for i in range(len(X)):
    if y[i] == 8:
        new_X.append(X[i])
        new_y.append(y[i])
    elif y[i] == 9:
        new_X.append(X[i])
        new_y.append(y[i])
new_X = np.array(new_X)
new_y = np.array(new_y)

X = new_X
y = new_y

In [4]:
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6,random_state=42)


In [5]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(141, 64)
(141,)
(213, 64)
(213,)


In [6]:
y_train.shape

(141,)

In [7]:
digits=load_digits()
X = digits.data
y = digits.target

# Scale training features
X_scale = StandardScaler()
X = X_scale.fit_transform(digits.data)

# Assign X and y the subset of data that describe the numbers 8 and 9

new_X = []
new_y = []
for i in range(len(X)):
    if y[i] == 8:
        new_X.append(X[i])
        new_y.append(y[i])
    elif y[i] == 9:
        new_X.append(X[i])
        new_y.append(y[i])
new_X = np.array(new_X)
new_y = np.array(new_y)

X = new_X
y = new_y

# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6,random_state=42)


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

y_train.shape

(141, 64)
(141,)
(213, 64)
(213,)


(141,)

In [9]:
clf = svm.SVC(kernel='linear', gamma='auto')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [10]:
y_pred = clf.predict(X_test)

In [11]:
print('Prediction accuracy is {}%'.format(accuracy_score(y_test, y_pred) * 100))

Prediction accuracy is 98.12206572769952%


### Scikit-learn implementation plus extension

In [12]:
# Using cross validation to determine the best value of C

from sklearn.model_selection import KFold
from statistics import mean

kf = KFold(n_splits=20, shuffle=True)
kf.get_n_splits(X)

# Values of C to test
C = [1e-5, 1e-3, 1e-2, 1, 1.5]

avg_acc = []
# Perform cross validation for each value of C
for i in range(len(C)):
    
    acc = []
    for train, test in kf.split(X):
        clf = svm.SVC(kernel='linear', C=C[i], gamma='auto')
        clf.fit(X[train], y[train])
        y_pred = clf.predict(X_test)
        acc.append(accuracy_score(y_test, y_pred) * 100)
    avg_acc.append(mean(acc))
    acc = []
    print("C = %s" %(C[i]))
    print(avg_acc[i])

C = 1e-05
54.31924882629108
C = 0.001
95.14084507042254
C = 0.01
98.7793427230047
C = 1
99.90610328638498
C = 1.5
99.95305164319248


## 2. Fashion-MNIST

### Scikit-learn implementation 

In [13]:
from keras.datasets import fashion_mnist
((trainX, trainY), (testX, testY)) = fashion_mnist.load_data()

Using TensorFlow backend.


In [14]:
X_train = trainX
y_train = trainY
X_test = testX
y_test = testY

In [15]:
# Assign X_train and y_train the subset of data that describe the labels 0 and 2 (T-shirts and pullovers, respectively)

new_X_train = []
new_y_train = []
for i in range(len(X_train)):
    if y_train[i] == 0:
        new_X_train.append(X_train[i])
        new_y_train.append(y_train[i])
    elif y_train[i] == 2:
        new_X_train.append(X_train[i])
        new_y_train.append(y_train[i])
new_X_train = np.array(new_X_train)
new_y_train = np.array(new_y_train)

X_train = new_X_train
y_train = new_y_train

In [16]:
# Assign X_test and y_test the subset of data that describe the labels 0 and 2 (T-shirts and pullovers, respectively)

new_X_test = []
new_y_test = []
for i in range(len(X_test)):
    if y_test[i] == 0:
        new_X_test.append(X_test[i])
        new_y_test.append(y_test[i])
    elif y_test[i] == 2:
        new_X_test.append(X_test[i])
        new_y_test.append(y_test[i])
new_X_test = np.array(new_X_test)
new_y_test = np.array(new_y_test)

X_test = new_X_test
y_test = new_y_test

In [17]:
X_train = np.array([X_train[i].flatten() for i in range(len(X_train))])
X_test = np.array([X_test[i].flatten() for i in range(len(X_test))])

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(12000, 784)
(12000,)
(2000, 784)
(2000,)


In [18]:
# Downsample the data

# Add y_train back as an additional column to X_train
y_train = y_train.reshape((-1,1))
X_train = np.append(X_train, y_train, axis=1)

# Add y_test back as an additional column to X_test
y_test = y_test.reshape((-1,1))
X_test = np.append(X_test, y_test, axis=1)

# Shuffle the data
np.random.shuffle(X_train)
np.random.shuffle(X_test)

# Slice out only the first 141 from X_train and 213 from X_test
X_train = X_train[0:141]
X_test = X_test[0:213]

# Remove the last columns of X_train and X_test and place them back into y_train and y_test
y_train = X_train[:,-1]
y_test = X_test[:,-1]
X_train = X_train[:,0:X_train.shape[1]-1]
X_test = X_test[:,0:X_test.shape[1]-1]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(141, 784)
(141,)
(213, 784)
(213,)


In [19]:
# Scale the dataset

X_scale = StandardScaler()
X_train = X_scale.fit_transform(X_train) 
X_test = X_scale.fit_transform(X_test) 

In [20]:
clf = svm.SVC(kernel='linear', gamma='auto')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
y_pred = clf.predict(X_test)

In [22]:
print('Prediction accuracy is {}%'.format(accuracy_score(y_test, y_pred) * 100))

Prediction accuracy is 95.30516431924883%


### Scikit-learn implementation plus extension

In [24]:
# Using cross validation to determine the best value of C

from sklearn.model_selection import KFold
from statistics import mean

# Combine train and test datasets
X = np.append(X_train, X_test, axis=0)
y = np.append(y_train, y_test, axis=0)

# Initiate sklearn kfold
kf = KFold(n_splits=20, shuffle=True)
kf.get_n_splits(X)

# Values of C to test
C = [1e-5, 1e-3, 1e-2, 1, 1.5]

avg_acc = []
# Perform cross validation for each value of C
for i in range(len(C)):
    
    acc = []
    for train, test in kf.split(X):
        clf = svm.SVC(kernel='linear', C=C[i], gamma='auto')
        clf.fit(X[train], y[train])
        y_pred = clf.predict(X_test)
        acc.append(accuracy_score(y_test, y_pred) * 100)
    avg_acc.append(mean(acc))
    acc = []
    print("C = %s" %(C[i]))
    print(avg_acc[i])

C = 1e-05
52.58215962441315
C = 0.001
96.64319248826291
C = 0.01
99.31924882629107
C = 1
99.64788732394366
C = 1.5
99.69483568075117
