## 1. A Quick Solution Using SKlearn

In [0]:
# numpy supplies the array helpers used to inspect X and y
import numpy as np

# scikit-learn provides the embedded datasets and the supervised
# learning algorithm (SVM) used in this section
from sklearn import datasets, svm

# start with the hand-written digits dataset
digits = datasets.load_digits()

In [0]:
# explore the dataset: one summary per output line
# (original notes: 1797 samples, float64 values, pixel range 0-16)
print(
    digits.data.shape,   # (n_samples, n_features) of the 2D feature array
    type(digits.data),   # container type of the imported dataset
    digits.data.dtype,   # element dtype
    digits.data.max(),   # largest value in the array
    digits.data,         # the feature array itself
    sep="\n",
)


In [0]:
# observe y: the target array, i.e. the label that goes with each sample in digits.data

print(digits.target)

In [0]:
# define a learning model: a support vector classifier with all default parameters

clf = svm.SVC()

In [0]:
# predefine parameters (this replaces the default-parameter clf);
# use the tab key to see all options or visit the sklearn page
clf = svm.SVC(gamma=0.001, C=100)

In [0]:
# define X (features) and y (labels), leaving the last 10 samples out of training
X = digits.data[:-10]
y = digits.target[:-10]

In [0]:
# make sure dimensions match: X and y must have the same number of rows
print(X.shape, y.shape, sep="\n")


In [0]:
# train the model using model.fit(X, y); X and y are the training slices defined above
clf.fit(X,y)

In [49]:
# test the result using model.predict on one of the samples held out of training
# Exercise: uncomment the next line, read the error message and explain why it fails
# print(clf.predict(digits.data[-5]))
# Hint: digits.data[-5] is a single 1-D sample; reshape(1, -1) turns it into
# a 2-D array with one row, which is the form the working call below passes in.
print(clf.predict(digits.data[-5].reshape(1,-1)))

[9]


In [0]:
# compare the prediction with the true label of the same sample
print("The Target is : ", digits.target[-5])

# draw the matching image so the digit can be checked by eye
import matplotlib.pyplot as plt

plt.imshow(
    digits.images[-5],
    cmap="gray_r",            # same colormap as plt.cm.gray_r, looked up by name
    interpolation="nearest",
)
plt.show()

## 2. Understand the scikit-learn user manual and examples
https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#sphx-glr-auto-examples-classification-plot-digits-classification-py

## 3. Let's try a larger dataset and other supervised learning algorithms

In [0]:
import tensorflow as tf

# MNIST is embedded in Keras; its load_data() function returns
# the (train, test) splits as (images, labels) pairs
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# scale the values to [0, 1]
x_train = x_train / 255.0
x_test = x_test / 255.0

In [0]:
# observe the data: take the second training image and view it as a 28x28 grid
some_digit = x_train[1]
some_digit_image = np.reshape(some_digit, (28, 28))

plt.imshow(some_digit_image, interpolation="nearest", cmap="gray_r")
# uncomment to hide the axes: plt.axis("off")
plt.show()

# see the label that belongs to this image
print(y_train[1])

In [0]:
# dimensions of the training image array (later cells flatten it to 60000 x 784)
x_train.shape

### Try random forest first

In [0]:
# n_estimators is the number of trees in the forest
# n_estimators = 400 --> 221.16 seconds and 0.9712
#                100 -->  54.81 seconds and 0.9704   (default since scikit-learn 0.22)
#                 10 -->   5.47 seconds and 0.9492   (default before 0.22)

# train a random forest classifier (rfc) model

from sklearn.ensemble import RandomForestClassifier
import time


model_rfc = RandomForestClassifier(n_estimators=10
                                , random_state=42)
t0 = time.time()
# Flatten every image into one feature row. The row count is taken from the
# data (and -1 lets numpy infer the feature count) so the cell keeps working
# when x_train is a subset rather than all 60000 images.
model_rfc.fit(x_train.reshape(len(x_train), -1), y_train)
t1 = time.time()
print("Training Used %5.2f seconds!" % (t1-t0))

In [0]:
# predict using the trained model; flatten the test images the same way as the
# training images, with the row count derived from the data
y_pred = model_rfc.predict(x_test.reshape(len(x_test), -1))

# use sklearn to evaluate the model
# Import the metric functions explicitly: `sk.metrics.<name>` only resolves
# when the sklearn.metrics submodule happens to have been imported already.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sklearn as sk  # kept so the `sk` alias stays available to other cells

print(accuracy_score(y_test, y_pred))
print()
print("Classification report for classifier %s:\n%s\n"
      % (model_rfc, classification_report(y_test, y_pred)))
print("Confusion matrix:\n%s" % confusion_matrix(y_test, y_pred))

### How about stochastic gradient descent? 

In [0]:
# train a linear classifier with stochastic gradient descent
from sklearn.linear_model import SGDClassifier

model_sgd = SGDClassifier(random_state=42)
t0 = time.time()
# flatten each image to one feature row; sizes come from the data, not constants
model_sgd.fit(x_train.reshape(len(x_train), -1), y_train)
t1 = time.time()
print("Training Used %5.2f seconds!" % (t1-t0))

In [0]:
# predict on the flattened test images (row count derived from the data)
y_pred = model_sgd.predict(x_test.reshape(len(x_test), -1))

# fraction of test labels predicted correctly
accuracy_score(y_test, y_pred)

### Now we need to experiment with different parameters and really understand the algorithms. Scikit-learn has modules that help search for optimal parameters

In [0]:
# lots of parameters to tune. The kernel and other parameters affect training time
# dimension reduction methods such as PCA are needed to reduce the number of features
# https://scikit-learn.org/stable/modules/svm.html

# recorded runs:
#   first 20000 samples: used 132.22 seconds (93.3 seconds with GPU), accuracy 0.9695
#   all 60000 samples:   used 702.90 seconds (567 seconds with GPU),  accuracy 0.9792
from sklearn import svm
model_svm = svm.SVC(gamma='scale', random_state=42)
t0 = time.time()
# Flatten each image to one feature row. The row count comes from the data so
# the same cell works for a subset run (e.g. the 20000-sample experiment above).
model_svm.fit(x_train.reshape(len(x_train), -1), y_train)
t1 = time.time()
print("Used %5.2f seconds!" % (t1-t0))
y_pred = model_svm.predict(x_test.reshape(len(x_test), -1))

accuracy_score(y_test, y_pred)

Used 540.07 seconds!


0.9792

### Explore the sklearn library and try a decision tree model

In [0]:
##############################################################
##############################################################
## write your code here ######################################
## Define `model_tree` here (for example with sklearn's
## DecisionTreeClassifier); the code below raises a NameError
## until it exists. The graphviz cell further down also uses the
## name `tree`, so `from sklearn import tree` is a convenient import.



#############################################################
#############################################################


t0 = time.time()
# flatten each image to one feature row; sizes come from the data, not constants
model_tree.fit(x_train.reshape(len(x_train), -1), y_train)
t1 = time.time()
print("Used %5.2f seconds!" % (t1-t0))
y_pred = model_tree.predict(x_test.reshape(len(x_test), -1))

accuracy_score(y_test, y_pred)

Used 33.44 seconds!


0.8754

In [0]:
# show the graphic tree
# not feasible on an image dataset (the tree is far too large to read);
# see the example on the Iris dataset instead:
# https://scikit-learn.org/stable/modules/tree.html

import graphviz
# export_graphviz lives in sklearn.tree; import it here so this cell does not
# depend on `tree` having been imported in the exercise cell
from sklearn import tree

# out_file=None makes export_graphviz return the DOT source as a string
dot_data = tree.export_graphviz(model_tree, out_file=None)
graph = graphviz.Source(dot_data)
# render() writes the drawing to disk and returns the path of the output file
graph.render("Tree")

'Tree.pdf'

### A support vector machine is supposed to give us better results

In [0]:
# Don't try svm in class!!! It takes about 10 minutes to reach 97.92% accuracy
# NOTE: this repeats the full-data SVM fit that already appears earlier in the notebook.

from sklearn import svm
model_svm = svm.SVC(gamma='scale', random_state=42)
t0 = time.time()
# flatten each image to one feature row; sizes come from the data, not constants
model_svm.fit(x_train.reshape(len(x_train), -1), y_train)
t1 = time.time()
print("Used %5.2f seconds!" % (t1-t0))
y_pred = model_svm.predict(x_test.reshape(len(x_test), -1))
accuracy_score(y_test, y_pred)

In [0]:
# keep 30 components/features
# recorded run of the SVM on the reduced data: 32.28 seconds, accuracy 0.9756

# use principal component analysis or singular value decomposition
# (this cell uses TruncatedSVD; the `_pca` suffix on the variable names is historical)
from sklearn.decomposition import PCA,TruncatedSVD

# The default TruncatedSVD solver is randomized, so fix the seed like every
# other estimator in this notebook to make the reduced features reproducible.
svd = TruncatedSVD(n_components=30, random_state=42)

# Fit on the flattened training images and project them in one step, then
# apply the same fitted projection to the test images. Row counts come from
# the data instead of hard-coded 60000 / 10000.
x_train_pca = svd.fit_transform(x_train.reshape(len(x_train), -1))
x_test_pca = svd.transform(x_test.reshape(len(x_test), -1))

In [57]:
# SVM with the same settings as before, trained on the reduced features
from sklearn import svm

model_svm = svm.SVC(random_state=42, gamma='scale')

# time only the fit
t0 = time.time()
model_svm.fit(x_train_pca, y_train)
t1 = time.time()
elapsed = t1 - t0
print("Used %5.2f seconds!" % elapsed)

# score on the reduced test features
y_pred = model_svm.predict(x_test_pca)
accuracy_score(y_test, y_pred)

Used 31.28 seconds!


0.976

In [59]:
# A notebook cell only displays its last expression, so print the shape
# explicitly instead of letting it be silently discarded.
print(x_train_pca.shape)

# variance of the training data along each retained component
svd.explained_variance_

array([3.06213767, 4.34791984, 3.74094939, 3.22480228, 2.78546566,
       2.27726443, 1.91210805, 1.53469841, 1.51782657, 1.24591104,
       1.12037884, 1.0810482 , 0.90480583, 0.89910428, 0.83856622,
       0.78521532, 0.71915702, 0.68299594, 0.62713176, 0.60777048,
       0.56214268, 0.53087105, 0.50321474, 0.48212516, 0.46604742,
       0.44151449, 0.42917215, 0.41410764, 0.39150516, 0.36284962])