# CME 193 - Scientific Python
### Lecture 8 (5/5)
Spring 2016, Stanford University

### Last time
* Building some predictive models

### Today
* We'll learn to recognize handwriting!

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from sklearn import datasets, metrics

In [None]:
# Load the NIST handwritten-digits data set that ships with scikit-learn
# (1797 images, 8x8 pixels each). The returned object is indexed by key in the
# cells below ('images' for the pixel arrays, 'target' for the labels).
digits = datasets.load_digits()

In [None]:
# Preview the first ten digit images side by side in a single row.
plt.figure(figsize=(16, 6))
for slot, image in enumerate(digits['images'][:10], start=1):
    # subplot positions are 1-based, hence enumerate(..., start=1)
    plt.subplot(1, 10, slot)
    plt.imshow(image, cmap='gist_gray_r', interpolation='none')

In [None]:
# Stack of digit images; still one 2-D array per sample at this point.
X = digits.images

In [None]:
# Inspect the array dimensions before flattening (one 8x8 image per sample).
X.shape

In [None]:
# Label for each image (used later as the 'digit' class and as the
# classification target).
y = digits.target

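We'll need to do some data transformation here! `X` is currently a 3-D array (one 8x8 image per sample), but scikit-learn models expect a 2-D matrix with one row per sample. How do we flatten each image into a single row?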

In [None]:
# Flatten every image into a single row so X becomes a 2-D
# (n_samples, n_pixels) matrix; -1 lets NumPy infer the row length from the
# remaining dimensions.
X = X.reshape(X.shape[0], -1)
# print() with one parenthesised argument behaves the same on Python 2 and 3,
# whereas the bare `print X.shape` statement is a SyntaxError on Python 3.
print(X.shape)

Last time, we talked about PCA -- let's use a real PCA library!

Let's visualize how handwritten digits look in a lower-dimensional space.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit PCA on the flattened images. No n_components is passed, so the
# decomposition is not truncated here (the cells below index 64 components,
# one per pixel).
pca = PCA()
pca.fit(X)

In [None]:
# Cumulative share of the total variance captured by the leading components.
# Entry i of the cumulative sum is the variance explained when keeping i + 1
# components, so the x axis must run 1..n rather than 0..n-1. The ratios are
# fractions in [0, 1]; scale by 100 so the values agree with the '%' label.
cumulative_pct = 100 * pca.explained_variance_ratio_.cumsum()
plt.plot(range(1, len(cumulative_pct) + 1), cumulative_pct, '-')
plt.xlabel('Number of components retained')
plt.ylabel('% of variance explained')

Let's visualize what PCA does for handwritten digits!

In [None]:
# Draw each of the 64 principal components as an 8x8 image on an 8x8 grid,
# in the order PCA returned them.
plt.figure(figsize=(10, 10))
for idx in range(64):
    component_image = pca.components_[idx].reshape(8, 8)
    plt.subplot(8, 8, idx + 1)  # subplot positions are 1-based
    plt.imshow(component_image, interpolation='none', cmap='gist_gray_r')

What do you notice?

In [None]:
# Express every sample in the coordinate system of the fitted principal
# components; R has one row per sample and one column per component.
R = pca.transform(X)

In [None]:
# Column labels for the component scores: pc_0, pc_1, ... (one per column of R).
cols = ['pc_%s' % component for component in range(R.shape[-1])]

In [None]:
import pandas as pd

In [None]:
# Tabular view of the component scores, plus the digit label stored as a
# string so it can serve as a group key / legend entry.
pc = pd.DataFrame(R, columns=cols)
pc['digit'] = list(map(str, y))

In [None]:
# Scatter the samples in the plane of the first two principal components,
# one colour per digit class.
plt.figure(figsize=(10, 10))
# Ten colours, one per digit class. zip() stops at the shorter of its inputs,
# so with fewer colours than groups the last digit groups would silently be
# left off the plot.
colors = ['#1abc9c', '#ff0084', '#3498db', '#9b59b6', '#f1c40f',
          '#e67e22', '#e74c3c', '#34495e', '#2ecc71', '#7f8c8d']

for color, (lab, x) in zip(colors, pc.groupby('digit')):
    plt.plot(x.pc_0, x.pc_1, 'o', label=lab, color=color)
plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.legend()
    

In [None]:
from sklearn.linear_model import LogisticRegressionCV
try:
    # Current home of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the original module.
    from sklearn.cross_validation import train_test_split

In [None]:
# Logistic-regression classifier (the cross-validated variant from
# scikit-learn), created with its default settings.
model = LogisticRegressionCV()

In [None]:
# Baseline: fit on the raw pixel features using a 50/50 train/test split and
# report hold-out accuracy. The split is seeded so that re-running the cell
# reproduces the same number.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
model.fit(X_train, y_train)
yhat = model.predict(X_test)
# Mean of the element-wise comparison is the fraction of correct predictions.
# print() with a single argument works on both Python 2 and Python 3.
print('accuracy at {}%'.format(100 * (yhat == y_test).mean()))

In [None]:
# Remove the label column so only the component scores remain.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: running it a second time is a no-op rather than an error
# about the missing 'digit' column.
pc = pc.drop('digit', axis=1, errors='ignore')

In [None]:
# Underlying array of pc. The 'digit' column must already have been dropped so
# that only the numeric component-score columns are left.
P = pc.values

In [None]:
# Shell escape: install tqdm, which provides the progress bar wrapped around
# the training loop further down.
!pip install tqdm

In [None]:
import tqdm

In [None]:
# Hold-out accuracy as a function of how many leading principal components
# are used as features; acc[i] is the accuracy with the first i + 1 components.
acc = []
# range() instead of xrange() so the loop also runs on Python 3.
for i in tqdm.tqdm(range(P.shape[-1])):
    # Same seed on every iteration: the rows are split identically each time,
    # so differences between entries of acc come from the number of
    # components rather than from a different random split.
    X_train, X_test, y_train, y_test = train_test_split(
        P[:, :(i + 1)], y, test_size=0.5, random_state=0)
    model = LogisticRegressionCV()
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)
    acc.append(100 * (yhat == y_test).mean())

In [None]:
# acc[i] was measured with the first i + 1 components, so the x axis starts
# at 1 (a 0-based axis would shift the whole curve by one component).
plt.plot(range(1, len(acc) + 1), acc, '-')
plt.xlabel('Number of components retained')
plt.ylabel('Predictive Accuracy')