# Recognizing Handwritten Digits with scikit-learn

### Load Dataset

In [32]:
# Load scikit-learn's bundled digits dataset. Later cells read its
# .DESCR, .target, .data and .images attributes.
from sklearn import datasets
digits = datasets.load_digits()

### Full description of the dataset

In [33]:
# Print the description text that ships with the dataset object.
print(digits.DESCR)

The numerical values represented by the images, i.e., the targets, are contained in the 
`digits.target` array.

In [34]:
# Display the target array (the digit label associated with each image).
digits.target

### Shape of the dataset

In [35]:
# Display the dimensions of the data array.
digits.data.shape

### Images stored in the form of array

The images of the handwritten digits are contained in the `digits.images` array. Each 
element of this array is an image represented by an 8x8 matrix of numerical values 
that correspond to gray levels ranging from white (value 0) to black (value 15).

In [36]:
# Display the first image as its raw array of pixel values.
digits.images[0]

The images of the handwritten digits are contained in a digits.images array

### Visualizing an array

In [37]:
import matplotlib.pyplot as plt

# Draw the first digit on an explicit figure/axes pair using the reversed
# gray colormap, with no interpolation between pixels.
fig, ax = plt.subplots()
ax.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
ax.set_title('Visualizing an array')
# save the figure
fig.savefig('plot2.png', dpi=100, bbox_inches='tight')

### Visualization of digits

In [38]:
import numpy as np

# Pair every image with its label; only the first ten pairs are drawn below.
images_and_labels = list(zip(digits.images, digits.target))

# 2 x 5 grid of axes, with extra vertical spacing so titles do not collide.
fig, axes = plt.subplots(2, 5, figsize=(15, 4))
fig.subplots_adjust(hspace=0.8)

for ax, (image, label) in zip(axes.flat, images_and_labels[:10]):
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Training: %i' % label, fontsize=12)
# save the figure
fig.savefig('plot1.png', dpi=300, bbox_inches='tight')

### Flatten the input images

In [39]:
# The number of images is the length of the first axis of digits.images.
n = digits.images.shape[0]
print(n)
# Collapse each image into one row; -1 lets NumPy infer the row length.
data = digits.images.reshape(n, -1)

### Split the dataset

### Size of the training set
It was reported that the dataset consists of 1,797 images. We can 
check whether that is true by looking at the number of target labels.

In [40]:
# Number of entries in the target array, to compare against the reported 1,797 images.
digits.target.size

In [41]:
# flatten the images: one row per sample, row length inferred by NumPy
n_samples = digits.images.shape[0]
data = digits.images.reshape(n_samples, -1)

### Test cases:
We test the hypothesis using the following cases, each of which splits the data into training and test sets in a different proportion.
#### case 1: Here we have split the data by assigning 0.01 as test size.

In [42]:
from sklearn.model_selection import train_test_split

# Case 1: hold out 1% of the samples for testing; random_state=0 makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    digits.target,
    test_size=0.01,
    random_state=0,
)

### Training and Prediction

### Support Vector Classifier

In [43]:
from sklearn import svm

# Support vector classifier with hand-picked hyperparameters.
svc = svm.SVC(C=100.0, gamma=0.001)

In [44]:
# Train the classifier on the training split.
svc.fit(x_train, y_train)

In [45]:
# Predict a label for every held-out test sample, then display the predictions.
y_pred = svc.predict(x_test)
y_pred

#### test samples and their predicted digit value

In [46]:
# Pair each flattened test image with the digit the classifier predicted for it.
images_and_predictions = list(zip(x_test, y_pred))

# The grid has n_rows * n_cols = 18 cells. Requesting a 19th subplot raises
# ValueError, so never draw more samples than the grid can hold.
n_rows, n_cols = 2, 9

plt.figure(figsize=(18,5))
for index, (image, prediction) in enumerate(images_and_predictions[:n_rows * n_cols]):
    plt.subplot(n_rows, n_cols, index + 1)
    # Each test row is a flattened image; restore the 8x8 layout for display.
    image = image.reshape(8, 8)
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

# save the figure
plt.savefig('plot3.png', dpi=300, bbox_inches='tight')

### Evaluation 

In [47]:
# Score the fitted classifier on the held-out test split and report it.
score = svc.score(x_test, y_test)
print(f'Accuracy Score: {score}')

In [48]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sn

# Label the matrix with every class that occurs in either the true or the
# predicted labels. With a very small test set, a predicted digit may not
# appear in y_test at all; labelling with np.unique(y_test) alone would then
# give fewer labels than the matrix has rows/columns and the DataFrame
# constructor would fail.
labels = np.union1d(y_test, y_pred)

# Keep the matrix in its own variable so the notebook-level `data`
# (the flattened images) is not overwritten.
cm = confusion_matrix(y_test, y_pred, labels=labels)
df_cm = pd.DataFrame(cm, columns=labels, index=labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'

plt.figure(figsize = (8,8))
sn.set(font_scale=1.4)  # for label size
plt.title('Confusion Matrix')
sn.heatmap(df_cm, annot=True, annot_kws={"size": 12})  # font size of the cell annotations
plt.savefig('plot4.png', dpi=100, bbox_inches='tight')

In [49]:
# Build a text report comparing the true test labels with the predictions.
from sklearn.metrics import classification_report
cr=classification_report(y_test, y_pred)
# print() joins its two arguments with a single space, so the report text
# starts one character in from the left margin.
print("Classification report for SVM classifier:\n\n",cr)

#### case 2: Here we have split the data by assigning 0.7 as test size.

In [50]:
from sklearn import svm
from sklearn.model_selection import train_test_split

# Case 2: hold out 70% of the samples for testing, with a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.7,
    random_state=0,
)

# fit() returns the estimator itself, so construction and training are chained.
svc = svm.SVC(C=100.0, gamma=0.001).fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [51]:
# Score the case 2 classifier on its held-out test split and report it.
score = svc.score(x_test, y_test)
print(f'Accuracy Score: {score}')

#### case 3: Here we have split the data by assigning 0.9 as test size.

In [52]:
from sklearn import svm
from sklearn.model_selection import train_test_split

# Case 3: hold out 90% of the samples for testing, with a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.9,
    random_state=0,
)

# fit() returns the estimator itself, so construction and training are chained.
svc = svm.SVC(C=100.0, gamma=0.001).fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [53]:
# Score the case 3 classifier on its held-out test split and report it.
score = svc.score(x_test, y_test)
print(f'Accuracy Score: {score}')

**Conclusion:**

The SVC model trained on this dataset predicts the digit correctly about 95% of the time.