In [1]:
import sklearn 
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from sklearn import datasets

In [2]:
# Load scikit-learn's digits dataset; the cells below read its 'data'
# (features) and 'target' (labels) entries.
digits = datasets.load_digits()

In [3]:
# Peek at the feature matrix (one row per sample).
digits['data']

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [4]:
# Peek at the label vector that pairs with the feature rows.
digits['target']

array([0, 1, 2, ..., 8, 9, 8])

In [5]:
# Fixed hold-out split: the last 100 samples form the test set and
# everything before them is used for training.
X_digits, Y_digits = digits['data'], digits['target']

X_digits_train, X_digits_test = X_digits[:-100], X_digits[-100:]
Y_digits_train, Y_digits_test = Y_digits[:-100], Y_digits[-100:]

In [6]:
from sklearn import svm

# Support vector classifier with a linear kernel and regularisation
# parameter C=1, trained on the hold-out training portion. The cell ends
# on the `fit` call so its return value is displayed.
svc = svm.SVC(kernel='linear', C=1)
svc.fit(X=X_digits_train, y=Y_digits_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [7]:
# Score the fitted classifier on the 100 held-out samples.
svc.score(X_digits_test, Y_digits_test)

0.98

### Cross-validation from scratch

In [8]:
# Shuffle samples and labels with one shared permutation so the folds are
# not tied to the dataset's original ordering, then cut the shuffled data
# into three (near-)equal folds.
#
# The shuffled arrays are rebuilt from the untouched `digits` bunch rather
# than from the current `X_digits` / `Y_digits`. This keeps the cell
# idempotent: re-running it reproduces the same ordering instead of
# permuting data that was already permuted by a previous run.
np.random.seed(0)  # fixed seed -> reproducible permutation
indices = np.random.permutation(digits['data'].shape[0])
X_digits = digits['data'][indices]
Y_digits = digits['target'][indices]

X_folds = np.array_split(X_digits, 3)
Y_folds = np.array_split(Y_digits, 3)

In [9]:
# Manual 3-fold cross-validation: each fold takes one turn as the test
# set while the remaining folds are concatenated into the training set.
scores = []
for k in range(3):
    x_test, y_test = X_folds[k], Y_folds[k]
    x_train = np.concatenate([fold for i, fold in enumerate(X_folds) if i != k])
    y_train = np.concatenate([fold for i, fold in enumerate(Y_folds) if i != k])

    # Refit the shared estimator on this split and record its test score.
    svc.fit(x_train, y_train)
    scores.append(svc.score(x_test, y_test))
print(scores)

[0.9716193656093489, 0.9833055091819699, 0.986644407345576]


## Cross-validation generator

In [10]:
from sklearn.model_selection import KFold, cross_val_score

# Toy illustration of what a splitter yields: each iteration of
# `KFold.split` gives a (train indices, test indices) pair for one fold.
x = list('aaabbccccc')
kfolds = KFold(n_splits=5)
line_template = "Train: {0} | Test: {1}"
for train_index, test_index in kfolds.split(x):
    print(line_template.format(train_index, test_index))

Train: [2 3 4 5 6 7 8 9] | Test: [0 1]
Train: [0 1 4 5 6 7 8 9] | Test: [2 3]
Train: [0 1 2 3 6 7 8 9] | Test: [4 5]
Train: [0 1 2 3 4 5 8 9] | Test: [6 7]
Train: [0 1 2 3 4 5 6 7] | Test: [8 9]


In [11]:
# 5-fold cross-validation driven by KFold index splits: the shared
# estimator is refit on every training split and scored on the matching
# held-out split.
scores = []
for train_indices, test_indices in KFold(n_splits=5).split(X_digits):
    fitted = svc.fit(X_digits[train_indices], Y_digits[train_indices])
    scores.append(fitted.score(X_digits[test_indices], Y_digits[test_indices]))

In [12]:
# One score per fold from the KFold-based evaluation in the previous cell.
scores

[0.9777777777777777,
 0.9833333333333333,
 0.9888579387186629,
 0.9888579387186629,
 0.9832869080779945]

In [13]:
# Same 5-fold evaluation through scikit-learn's helper, which does the
# splitting, fitting and scoring itself and returns one score per fold.
# NOTE(review): the scores differ slightly from the manual KFold run,
# presumably because an integer `cv` selects stratified folds for
# classifiers -- confirm against the scikit-learn documentation.
cross_val_score(svc, X_digits, Y_digits, cv=5)

array([0.97802198, 0.97790055, 0.99164345, 0.98319328, 0.98309859])

## Use cross-validation to tune parameters

In [14]:
def cross_validation_scores(svc, X=None, y=None, n_splits=5):
    """Return the mean K-fold cross-validation score of an estimator.

    Parameters
    ----------
    svc : estimator
        Object whose ``fit(X, y)`` returns something exposing
        ``score(X, y)``. It is refit once per fold.
    X, y : array-like, optional
        Samples and labels to cross-validate on; they must support indexing
        with an integer index array. When omitted, the notebook-level
        ``X_digits`` / ``Y_digits`` are used, which is the original
        behaviour of this helper.
    n_splits : int, default 5
        Number of consecutive folds passed to ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # keep working unchanged.
    if X is None:
        X = X_digits
    if y is None:
        y = Y_digits

    fold_scores = []
    for train_indices, test_indices in KFold(n_splits=n_splits).split(X):
        fitted = svc.fit(X[train_indices], y[train_indices])
        fold_scores.append(fitted.score(X[test_indices], y[test_indices]))
    return np.mean(fold_scores)

In [15]:
# Candidate values for C: 10 points spaced evenly on a log scale from
# 1e-10 up to 1.
C_s = np.logspace(start=-10, stop=0, num=10)

# Mean cross-validation score of a fresh linear SVC for each candidate C.
cv_scores1 = []
for C in C_s:
    cv_scores1.append(cross_validation_scores(svm.SVC(C=C, kernel='linear')))

In [16]:
# Report the candidate C with the highest mean cross-validation score
# (np.argmax picks the first one if several tie).
best_index = np.argmax(cv_scores1)
print("Best param: C= {0}".format(C_s[best_index]))

Best param: C= 0.07742636826811278
