In [None]:
# setup state 
import matplotlib.pyplot as plt
from sklearn import datasets
# our support vector machine classifier!
from sklearn import svm

import matplotlib
import numpy

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

# load digits dataset
digits = datasets.load_digits()

#let's ignore python warnings - not a good idea, but simplifies visualization....
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Let's see what this dataset is all about by printing its own description.
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( digits.DESCR )

# The data set contains images of hand-written digits: 10 classes where
# each class refers to a digit.

# 32x32 bitmaps are divided into nonoverlapping blocks of
# 4x4 and the number of on pixels are counted in each block. This generates
# an input matrix of 8x8 where each element is an integer in the range
# 0..16.

In [None]:
# List the keys (fields) that the loaded digits object exposes.
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( digits.keys() )

In [None]:
# How many labels are there, and what do the first 25 look like?
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( len( digits.target ) )
print( digits.target[:25] ) # they seem to be in order

# Per-bin label counts over 10 bins; displayed as the cell's output.
numpy.histogram( digits.target, bins=10 )[0] # each number is pretty evenly distributed in the dataset

In [None]:
# Let's see what classes we want to put digits into.
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( digits.target_names )

In [None]:
# Let's pick one sample (index 1300) and see what its feature row looks like.
# Note: despite the name, `first_digit` holds sample 1300, not sample 0.
first_digit = digits.data[1300]
print( first_digit )

In [None]:
# But what do all of those columns mean!?!?!

# Let's see what digit 1300 was tagged as.
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( digits.target[1300] )

In [None]:
# Now let's see what digits.images[1300] is:
# it's just the first_digit array reshaped as an 8 x 8 matrix.
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( digits.images[1300] )

In [None]:
# Draw sample 1300 as a picture, using the reversed gray colormap and
# nearest-neighbour interpolation so each of the 8 x 8 cells stays a crisp square.
plt.imshow( digits.images[1300], cmap=plt.cm.gray_r, interpolation="nearest" )
plt.show()

In [None]:
# How many data points do we have?
# (A parenthesised, single-argument print runs unchanged on Python 2 and 3.)
print( len( digits.data ) ) # 1797 data points

In [None]:
# Build the training set from everything except the final `leftout` samples
# (the last 1500 of the 1797 data points are held back for validation).
leftout = 1500
x_training = digits.data[:-leftout]
y_training = digits.target[:-leftout]

In [None]:
# Let's define a function to check accuracy for a given classifier on validation data

def check_accuracy_validation( classifier, dataset, lower_bound ):
    """Return the fraction of validation samples the classifier labels correctly.

    The validation set is the tail of the dataset: the samples at indices
    ``lower_bound .. -1``.

    Parameters
    ----------
    classifier : object
        Fitted model whose ``predict`` accepts a 2-D array of samples and
        returns one label per row.
    dataset : object
        Must expose parallel ``data`` (2-D, one row per sample) and
        ``target`` (one label per sample) arrays.
    lower_bound : int
        Negative index of the first validation sample, counted from the end
        (e.g. ``-1500`` scores the last 1500 samples).

    Returns
    -------
    float
        Correct predictions divided by the number of validation samples.

    Raises
    ------
    ValueError
        If ``lower_bound`` does not select at least one existing sample
        (previously this surfaced as an opaque ZeroDivisionError or IndexError).
    """
    num_samples = len( dataset.data )
    if not -num_samples <= lower_bound < 0:
        raise ValueError(
            "lower_bound must be in [-%d, -1], got %r" % ( num_samples, lower_bound )
        )

    expected = dataset.target[lower_bound:]
    # One batched predict call instead of one call per sample.
    predicted = classifier.predict( dataset.data[lower_bound:] )
    correct = sum( 1 for guess, truth in zip( predicted, expected ) if guess == truth )
    return float( correct ) / len( expected )

# Let's define a function to check accuracy for a given classifier on training data
def check_accuracy_training( classifier, dataset, lower_bound ):
    """Return the fraction of training samples the classifier labels correctly.

    The training set is the head of the dataset: the samples at indices
    ``0 .. num_samples + lower_bound - 1``, i.e. everything before the
    held-out tail.

    Parameters
    ----------
    classifier : object
        Fitted model whose ``predict`` accepts a 2-D array of samples and
        returns one label per row.
    dataset : object
        Must expose parallel ``data`` (2-D array with a ``shape``) and
        ``target`` (one label per sample) arrays.
    lower_bound : int
        Non-positive offset; ``-k`` excludes the last ``k`` samples and ``0``
        scores the whole dataset.

    Returns
    -------
    float
        Correct predictions divided by the number of training samples.

    Raises
    ------
    ValueError
        If ``lower_bound`` leaves no samples to score or reaches past the end
        of the dataset (previously an opaque ZeroDivisionError or IndexError).
    """
    num_samples = dataset.data.shape[0]
    num_training = num_samples + lower_bound
    if not 0 < num_training <= num_samples:
        raise ValueError(
            "lower_bound must be in [%d, 0], got %r" % ( 1 - num_samples, lower_bound )
        )

    expected = dataset.target[:num_training]
    # One batched predict call instead of one call per sample.
    predicted = classifier.predict( dataset.data[:num_training] )
    correct = sum( 1 for guess, truth in zip( predicted, expected ) if guess == truth )
    return float( correct ) / len( expected )

Scikit-learn SVM uses a parameter $C$ that "weights" the error term instead of a parameter $\lambda$ that weights the $\ell_2$ regularization terms, so you can think of $C \approx \frac{1}{\lambda}$.

In [None]:
# Let's instantiate our Support Vector Classifier with a linear kernel.
# C is the penalty parameter of the error term.
classifier = svm.SVC( C=1, kernel='linear' )

# Now let's train the classifier on the training data.
classifier.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
print( "Prediction {}".format( classifier.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

# Accuracy on the samples the model was fitted on ...
accuracy = check_accuracy_training( classifier, digits, -leftout )
print( "Linear SVC Accuracy on Training Set: {}".format( accuracy ) )

# ... and on the held-out tail of the dataset.
accuracy = check_accuracy_validation( classifier, digits, -leftout )
print( "Linear SVC Accuracy on Validation Set: {}".format( accuracy ) )

In [None]:
# Let's instantiate our Support Vector Classifier with a degree-10 polynomial kernel.
# C is the penalty parameter of the error term.
classifier2 = svm.SVC( gamma=1, C=1, kernel='poly',degree=10 )

# Now let's train the classifier on the training data.
classifier2.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
print( "Prediction {}".format( classifier2.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

# Accuracy on the samples the model was fitted on ...
accuracy = check_accuracy_training( classifier2, digits, -leftout )
print( "poly SVC Accuracy on Training Set: {}".format( accuracy ) )

# ... and on the held-out tail of the dataset.
accuracy = check_accuracy_validation( classifier2, digits, -leftout )
print( "poly SVC Accuracy on Validation Set: {}".format( accuracy ) )

In [None]:
# Let's instantiate our Support Vector Classifier with a sigmoid kernel.
# C is the penalty parameter of the error term.
classifier3 = svm.SVC( gamma=1, C=1, kernel='sigmoid')

# Now let's train the classifier on the training data.
classifier3.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
print( "Prediction {}".format( classifier3.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

# Accuracy on the samples the model was fitted on ...
accuracy = check_accuracy_training( classifier3, digits, -leftout )
print( "sigmoid SVC Accuracy on Training Set: {}".format( accuracy ) )

# ... and on the held-out tail of the dataset.
accuracy = check_accuracy_validation( classifier3, digits, -leftout )
print( "sigmoid SVC Accuracy on Validation Set: {}".format( accuracy ) )

In [None]:
# Let's instantiate our Support Vector Classifier with an rbf kernel.
# C is the penalty parameter of the error term.
classifier4 = svm.SVC( gamma=1, C=100, kernel='rbf')

# Now let's train the classifier on the training data.
classifier4.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
print( "Prediction {}".format( classifier4.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

# Accuracy on the samples the model was fitted on ...
accuracy = check_accuracy_training( classifier4, digits, -leftout )
print( "rbf SVC Accuracy on Training Set: {}".format( accuracy ) )

# ... and on the held-out tail of the dataset.
accuracy = check_accuracy_validation( classifier4, digits, -leftout )
print( "rbf SVC Accuracy on Validation Set: {}".format( accuracy ) )

In [None]:
# Let's try with Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit( x_training, y_training )

# Let's see what the accuracy is.
# Training accuracy covers everything before the last `leftout` samples;
# validation accuracy covers digits.data[-leftout] through digits.data[-1].
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
accuracy = check_accuracy_training( log_reg, digits, -leftout )
print( "Logistic Regression Accuracy on Training Set: {}".format( accuracy ) )
accuracy = check_accuracy_validation( log_reg, digits, -leftout )
print( "Logistic Regression Accuracy on Validation Set: {}".format( accuracy ) )

### What happens to the accuracy when you change gamma in the rbf SVM to .01? .1? 1? almost 0? And C to be 1? 10? 100? 1000?

In [None]:
# Sweep the rbf kernel over a small grid of gamma and C values, refitting on
# the training split each time and reporting training and validation accuracy.
# Note: this rebinds the name `classifier` on every iteration.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
for gamma in [0.0001, 0.001, 0.01, 0.1, 1]:
    for C in [1, 10, 100, 1000]:
        classifier = svm.SVC( gamma=gamma, C=C, kernel='rbf' )
        classifier.fit( x_training, y_training )
        accuracy = check_accuracy_training( classifier, digits, -leftout )
        print( "Accuracy on training set for gamma = %f, C = %d: \t\t%f" % ( gamma, C, accuracy ) )
        accuracy = check_accuracy_validation( classifier, digits, -leftout )
        print( "Accuracy on validation set for gamma = %f, C = %d: \t\t%f" % ( gamma, C, accuracy ) )

# accuracy decreases as gamma increases; the "best" value seems to be around 0.0001 to 0.001
# accuracy increases a little as C increases, though not by much and not always (note: C cannot be 0!)

### What happens to the accuracy when you train the sigmoid SVM or the rbf SVM on many more samples? Let's say all but the last 300

In [None]:
leftout = 300

# Changing `leftout` alone does not change the training data: the slices
# taken earlier still hold the old, much smaller split. Re-slice here so the
# models below are really fitted on all but the last `leftout` samples, and so
# the "training" accuracy is measured on the samples the models actually saw.
x_training, y_training = digits.data[:-leftout], digits.target[:-leftout]

#sigmoid kernel

# Let's instantiate our Support Vector Classifier with a sigmoid kernel.
# C is the penalty parameter of the error term.
classifier3 = svm.SVC( gamma=1, C=1, kernel='sigmoid')

# Now let's train the classifier on the (larger) training data.
classifier3.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
# With leftout = 300, sample 1300 is part of the training split.
# (Parenthesised, single-argument prints run unchanged on Python 2 and 3.)
print( "Prediction {}".format( classifier3.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

accuracy = check_accuracy_training( classifier3, digits, -leftout )
print( "sigmoid SVC Accuracy on Training Set: {}".format( accuracy ) )

accuracy = check_accuracy_validation( classifier3, digits, -leftout )
print( "sigmoid SVC Accuracy on Validation Set: {}".format( accuracy ) )

#RBF kernel

# Let's instantiate our Support Vector Classifier with an rbf kernel.
# C is the penalty parameter of the error term.
classifier4 = svm.SVC( gamma=1, C=100, kernel='rbf')

# Now let's train the classifier on the (larger) training data.
classifier4.fit( x_training, y_training )

# Spot check: predicted label for sample 1300, followed by its true label.
print( "Prediction {}".format( classifier4.predict( digits.data[1300].reshape(1, -1) ) ) )
print( digits.target[1300] )

accuracy = check_accuracy_training( classifier4, digits, -leftout )
print( "rbf SVC Accuracy on Training Set: {}".format( accuracy ) )

accuracy = check_accuracy_validation( classifier4, digits, -leftout )
print( "rbf SVC Accuracy on Validation Set: {}".format( accuracy ) )