In [1]:
import numpy as np
from sklearn import svm
from math import floor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import warnings
np.random.seed(1)

In [2]:
# Load data from EMNIST
def loadEmnistFromNPY(filename):
    try:
        ret = np.load(filename)
    except FileNotFoundError:
        zipRef = zipfile.ZipFile('../data/EMNIST/balanced-data.zip')
        zipRef.extractall('../data/EMNIST')
        zipRef.close()
        ret = np.load(filename)

    return ret

Only use a small subset of the data rather than the entire dataset. This is done for performance reasons because SVM takes a very long time to fit.

In [3]:
# Read in EMNIST train data from .npy
EMTrainData = loadEmnistFromNPY('../data/EMNIST/balanced-train-data.npy')
EMTrainLabels = loadEmnistFromNPY('../data/EMNIST/balanced-train-labels.npy')

# Read in EMNIST test data from .npy
EMTestData = loadEmnistFromNPY('../data/EMNIST/balanced-test-data.npy')
EMTestLabels = loadEmnistFromNPY('../data/EMNIST/balanced-test-labels.npy')

# Downsample the data randomly
percentTrainSample = .01
percentTestSample = .1

nrowsInTrainData = len(EMTrainData)
nrowsInTestData = len(EMTestData)

trainIndicies = np.random.choice(nrowsInTrainData, floor(percentTrainSample * nrowsInTrainData))
testIndicies = np.random.choice(nrowsInTestData, floor(percentTestSample * nrowsInTestData))

trainData = EMTrainData[trainIndicies]
trainLabels = EMTrainLabels[trainIndicies]
testData = EMTestData[testIndicies]
testLabels = EMTestLabels[testIndicies]

Because the RBF kernel uses a distance metric in the calculation of its infinite dimensional projection of the data, the scaling of features is very important. 
$$exp(−\gamma||x−x′||^2)$$
If one feature varies from 1 to 1000, and another only varies between 1 and 2, the first feature can begin to dominate the calculation of the hyperplane. Although the data is inherently scaled from 1-255, this does not represent a true scaling of each feature (pixel). Some pixels only vary from 1-n where n is less than 255, and keeping it on the 255 scale makes this feature less significant. A min-max scaling is used to scale each feature from 0 to 1. 

In [4]:
with warnings.catch_warnings():  # Silence a data conversion warning that is thrown from scaling.
    warnings.simplefilter("ignore")    
    scaler = MinMaxScaler()
    scaler.fit(trainData)
    trainData = scaler.transform(trainData)
    testData = scaler.transform(testData)

Tune the model using cross-validation based on accuracy measure. Test a list of params for C and Gamma.

In [5]:
tuningParameters = [{'kernel': ['rbf'], 
                    'gamma': [.001, .01, .1, 1, 10],
                     'C': [.001, .01, .1, 1, 10]}]
test = GridSearchCV(svm.SVC(), tuningParameters, cv=3, scoring='accuracy')
test.fit(trainData, trainLabels)

means = test.cv_results_['mean_test_score']
for mean, params in zip(means, test.cv_results_['params']):
    print("Accuracy: %0.3f for %r" % (mean, params))

Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.001, 'gamma': 0.001}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.001, 'gamma': 0.01}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.001, 'gamma': 0.1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.001, 'gamma': 1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.001, 'gamma': 10}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.01, 'gamma': 0.001}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.01, 'gamma': 0.01}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.01, 'gamma': 0.1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.01, 'gamma': 1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.01, 'gamma': 10}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.001}
Accuracy: 0.038 for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.1, 'gamma': 1}
Accuracy: 0.036 for {'kernel': 'rbf', 'C': 0.1, 'gamma': 10}
Accuracy: 0.161 for {'kernel': 'rbf', 'C': 1, 'gamma': 

The highest average cross validation accuracy was found to be 57.9% with gamma=.01 and C=10