# Classifier scoring and cross validation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; their contents live in sklearn.model_selection.
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.svm import SVC
# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Handwritten-digits data: feature matrix X and integer labels y.
digits = load_digits()
X, y = digits.data, digits.target

# Refit a linear-kernel SVM on five separate train/test splits. No
# random_state is passed to train_test_split, so the reported accuracy is
# expected to vary from one iteration to the next.
clf = SVC(kernel="linear", C=1.0)
for iteration in range(1, 6):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y)
    clf.fit(xtrain, ytrain)
    accuracy = clf.score(xtest, ytest)
    print("Iteration %d" % iteration, "Accuracy: %f" % accuracy)

## Q: What is the true accuracy?
## Introducing cross validation

![](cv.jpg)

### CV Iteration 1: Samples 1-1200 in training, 1201 onwards in testing

In [None]:
# Manual fold 1: the first 1200 rows train the model, the remaining rows test it.
split = 1200
xtrain, ytrain = X[:split], y[:split]
xtest, ytest = X[split:], y[split:]
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

### CV Iteration 2: Samples 601 onwards in training, 1-600 in testing

In [None]:
# Manual fold 2: the first 600 rows are held out for testing, the rest train.
split = 600
xtest, ytest = X[:split], y[:split]
xtrain, ytrain = X[split:], y[split:]
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

### CV Iteration 3: Samples 1-600 and 1201 onwards in training, 601-1200 in testing

In [None]:
# Manual fold 3: rows 600-1199 are held out; the rows before and after them
# are stacked together to form the training set.
held_out = slice(600, 1200)
xtest, ytest = X[held_out], y[held_out]
xtrain = np.concatenate([X[:600], X[1200:]])
ytrain = np.concatenate([y[:600], y[1200:]])
clf.fit(xtrain, ytrain)
print(clf.score(xtest, ytest))

## KFold cross validation

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives in
# sklearn.model_selection, takes n_splits instead of n/n_folds, and yields its
# index pairs from .split() rather than by iterating the object itself.
from sklearn.model_selection import KFold
# Materialise the six shuffled (train_index, test_index) pairs as a list so
# that code iterating over `kfold` directly keeps working.
kfold = list(KFold(n_splits=6, shuffle=True).split(X))

In [None]:
# Fit on each fold's training rows and print the accuracy on its held-out rows.
for train_idx, test_idx in kfold:
    xtrain, ytrain = X[train_idx], y[train_idx]
    xtest, ytest = X[test_idx], y[test_idx]
    clf.fit(xtrain, ytrain)
    print(clf.score(xtest, ytest))

## Exercise: Try KFold cross validation on the following dataset:

In [None]:
# Synthetic, imbalanced two-class data: 900 points centred on (0, 0) with
# label 0, followed by 100 points centred on (0.75, 0.75) with label 1.
#
# A covariance matrix must be symmetric positive semi-definite. The previous
# matrices had zero variances with non-zero off-diagonal terms, which is not a
# valid covariance and makes NumPy emit a RuntimeWarning. The diagonal
# matrices below are valid; they are believed to match the spread NumPy's
# SVD-based sampler was effectively producing from the invalid input.
_x1 = np.random.multivariate_normal(mean=[0, 0], cov=np.array([[0.5, 0], [0, 0.5]]), size=(900,))
_x2 = np.random.multivariate_normal(mean=[0.75, 0.75], cov=np.array([[0.125, 0], [0, 0.125]]), size=(100,))
X = np.r_[_x1, _x2]
y = np.zeros((X.shape[0],))
y[900:] = 1
# NOTE(review): rand_ix is the identity permutation, so the two indexing lines
# below leave X and y in their original (class-sorted) order. Confirm whether
# a shuffle of rand_ix was intended or the ordering is deliberate for the
# exercise.
rand_ix = np.arange(1000)
X = X[rand_ix, :]
y = y[rand_ix]

In [None]:
# enter code here

### Q: What can we do to reduce variation in scores?

In [None]:
# Stratified splitting uses the labels y when building the folds. With the
# sklearn.model_selection API the splitter is configured with n_splits and the
# data/labels are passed to .split(), which yields (train_index, test_index).
skf = StratifiedKFold(n_splits=6, shuffle=True)
for train_index, test_index in skf.split(X, y):
    xtrain = X[train_index, :]
    ytrain = y[train_index]
    xtest = X[test_index, :]
    ytest = y[test_index]
    clf.fit(xtrain, ytrain)
    print(clf.score(xtest, ytest))

## Putting it all together

In [None]:
# cross_val_score moved to sklearn.model_selection (sklearn.cross_validation
# was removed in scikit-learn 0.20).
from sklearn.model_selection import cross_val_score
# cross_val_score runs the fit/score loop for every fold of the given splitter
# and returns one score per fold; the splitter no longer takes y in its
# constructor.
acc = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=6))
print(acc.mean())

## Using Cross Validation to select hyperparameters

In [None]:
# Switch X and y back from the synthetic data to the digits dataset.
X, y = digits.data, digits.target

In [None]:
# Per-fold scores of a linear SVM on the current X, y using six stratified
# folds. StratifiedKFold from sklearn.model_selection takes n_splits and
# receives the labels from cross_val_score rather than through its constructor.
clf = SVC(kernel="linear")
cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=6))

In [None]:
# Sweep C over 20 log-spaced values between 1e-10 and 1e10 and record the mean
# cross-validated score of a linear SVM for each one.
Cs = np.logspace(-10, 10, 20)
accuracies = []
for C in Cs:
    clf = SVC(C=C, kernel="linear")
    acc = cross_val_score(clf, X, y)
    accuracies.append(acc.mean())

# Mean score against C on a logarithmic x axis.
plt.figure(figsize=(8, 6))
plt.semilogx(Cs, accuracies)
# Raw string: "\l" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"$\frac{1}{\lambda}$")
plt.ylabel("Mean score");

In [None]:
# IPython help lookup (trailing "?") for cross_val_score.
cross_val_score?

## Exercise: Find the optimal regularization parameter for `LogisticRegression` on the breast cancer dataset

In [None]:
# Breast cancer dataset for the exercise; X and y now refer to this data.
bc = load_breast_cancer()
X, y = bc.data, bc.target

In [None]:
# enter code here

# Automating Hyperparameter Selection

In [None]:
# IPython help lookup (trailing "?") for LogisticRegression.
LogisticRegression?

In [None]:
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV is
# provided by sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid with a single parameter, C, taking the values already held in Cs.
grid = dict(C=Cs)
print(grid)

In [None]:
# Grid search over `grid` for a default LogisticRegression; n_jobs=-1 is passed
# through to GridSearchCV.
clf = LogisticRegression()
gcv = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1)

In [None]:
# Run the grid search on the current X and y (the breast cancer data when the
# notebook is executed top to bottom).
gcv.fit(X, y)

In [None]:
# Display the best_estimator_ attribute of the fitted search.
gcv.best_estimator_

In [None]:
# Display the best_params_ attribute of the fitted search.
gcv.best_params_

In [None]:
# Display the best_score_ attribute of the fitted search.
gcv.best_score_

## Exercise: Find best parameters for SVC for digits dataset using the following grid:

In [None]:
# Exercise setup: digits data plus a two-parameter grid for SVC.
X = digits.data
y = digits.target
# Ten log-spaced candidate values of C between 1e-4 and 1e4.
Cs = np.logspace(-4, 4, 10)
# The kernel list previously contained 'linear' twice, which makes a grid
# search evaluate the same kernel twice. The duplicate is replaced with 'rbf'.
# NOTE(review): 'rbf' is presumed to be the intended third kernel; confirm.
grid = {"C": Cs, "kernel": ['linear', 'poly', 'rbf']}

In [None]:
# enter code here