## MNIST
* Handwritten digit recognition

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [2]:
from sklearn.datasets import fetch_openml
# mnist = fetch_openml("MNIST original")
mnist = fetch_openml('mnist_784', version=1)


KeyboardInterrupt: 

In [None]:
X,y = mnist["data"],mnist["target"]
X.shape, y.shape

In [None]:
a = np.array([1,2,3,4,5,6,7,8,9]).reshape(3,3)
b = np.linspace(1,100,9)
plt.imshow(a) ##(row,column) is a axis and color is actual value

In [None]:
some_digit = X[36000]
some_digit_image = some_digit.reshape(28,28)

plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,interpolation="nearest")
plt.axis("off")
plt.show()

In [None]:
X_train, X_test, y_train, y_test = X[:6000],X[6000:],y[:6000],y[6000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X[shuffle_index], y[shuffle_index]

In [None]:
y_train_5 = (y_train=="5") ##true (for all 5s) or false(others)
y_test_5=(y_test=="5")

## Stochastic Gradient Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

sgd_clf.predict([some_digit])

In [None]:
## Another way of implementing cross val
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3)

for train_idx,test_idx in skfolds.split(X_train,y_train_5):
    clone_clf = clone(sgd_clf)
    x_train_folds = X_train[train_idx]
    y_train_folds = y_train_5[train_idx]
    x_test_fold = X_train[test_idx]
    y_test_fold = y_train_5[test_idx]
    clone_clf.fit(x_train_folds, y_train_folds)
    y_pred = clone_clf.predict(x_test_fold)
    n_correct = sum(y_pred==y_test_fold)
    print(n_correct/len(y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3,scoring="accuracy")

In [None]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X),1),dtype=bool)
    

In [None]:
never_5 = Never5Classifier()
# p = never_5.predict(X_test)
cross_val_score(never_5, X_train,y_train_5,
                cv=3,scoring="accuracy")
##90% accuracy just because 90% of data are non 5
##so it'd much bettter to look at comfusion matrix

In [None]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)


In [None]:
y_train_pred.size,y_train_5.size

In [None]:
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(y_train_5,y_train_pred)

##column represents predicted class(negative,positive)
##first row negative class(actual) 
##second row positive class(actual)
##negative == non5
##(1,1) = correctly classified as non-5 (true negative)
##(1,2) = wrongly classified as 5 (false positive)
##(2,1) = wrongly classified as non-5 (false negative)
##(2,2) = correctly classified as 5 (true positive)
##A perfect classifier would have only true positive and treu negative

In [None]:
confusion_matrix(y_train_5, y_train_5)

### Precision = TP/(TP+FP)    
##### TP+FP = number of instaneces predicted positive

### Recall=Sensitivity=True Positive Rate = TP/(TP+FN)
##### TP+FN = number of instances that are actually positive

In [None]:
from sklearn.metrics import precision_score, recall_score

print(precision_score(y_train_5,y_train_pred),c_m[1,1]/sum(c_m[:,1]))

print(recall_score(y_train_5, y_train_pred), c_m[1,1]/sum(c_m[1,:]))

### F1 score = 2/(1/precision+1/recall) = harmonic mean(weighted average)
###### The smaller (worse) vlaue would have more weights 

In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred), f1_score(y_train_5, y_train_5)

**rejecting many good ones to keep only good ones (high precision and low recall)** 

e.g. cartoons that seem appropriate for kids  

**detecting with low threshold to detect actual positive ones (high recall and low precision)**

e.g. medical test


lowing the threshold increases recall but decreases precision(negative prediction)

（確実にTrueをとらえるためTrueと判断する基準をあまくする。）

higher threshold increases precision but decreases recall(positive prediction)

（確実にTrue以外を除外するためTrueと判断する基準を厳しくする。）



In [None]:
y_scores = sgd_clf.decision_function([some_digit])
y_scores
##set your own threshold
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
##The SGDClassifier uses a threshold equal to 0

#### How to decide which threshold we use

In [None]:
##method = "decision_function" will return decision score
y_scores = cross_val_predict(sgd_clf,X_train,y_train_5, cv=3,
                            method="decision_function")

In [None]:
##calculate precision_recall_curve
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")#blue dash line
    plt.plot(thresholds, recalls[:-1],"g-",label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0,1])
    
    
plot_precision_recall_vs_threshold(precisions,recalls, thresholds)
plt.show()
    

In [None]:
plt.plot(precisions,recalls)
plt.xlabel("precisions")
plt.ylabel("recalls")

In [None]:
y_train_pred_90 = (y_scores > 4000)

In [None]:
precision_score(y_train_5, y_train_pred_90)

In [None]:
recall_score(y_train_5, y_train_pred_90)

## The ROC Curve
##### receiver operating characteristic

Plot recall=True Positive Rate(TP/actual P) against 

False Positive Rate(FP/actual N)=1-True Negative Rate(TN/actual N)

**True Negative Rate** is called *Speificity*
**True Positive Rate(Recall)** is called *Sensitivity* 

In [None]:
##first compute the TPR and FPR for various threshold vals
from sklearn.metrics import roc_curve

fpr,tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
def plot_roc_curve(fpr,tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1],"k--") ##gray dash
    plt.axis([0,1,0,1])##xmin,xmax,ymin,ymax
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    
plot_roc_curve(fpr,tpr)
plt.show()

In [None]:
## to compare classifiers,measuring the area under the curve 
##which would be 1.0 for a perfect classifier and would be 0.5
## for a purely random classifiler

from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)

Whenever the positive class is rare, or when you care more about the
false positives than the false negative (want high precision and 
accept some false negative)->**use precision/recall curve**

Otherwise, **roc curve**

e.g. the roc curve above looks good, but it is because there are few positives(5) compared to the negatives.
but the PR curve shows that there are some room for improvement

## RandomForest to check ROC

(RandomForestClassifier does not have a decision_function() method)

Instead, it has dict_proba() method that returns the probabilities for
being in each class.

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)

y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                   method="predict_proba")
y_probas_forest
##first column = P(X!=5)
##second column = P(X=5)

In [None]:
y_score_forest = y_probas_forest[:,1]##pick P(X=5)
fpr_forest, tpr_forest, thresholds = roc_curve(y_train_5,y_score_forest)

In [None]:
plt.plot(fpr,tpr, "b:",label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="best")
plt.show()

In [None]:
roc_auc_score(y_train_5, y_score_forest)

## use my own threshold
y_train_pred = y_score_forest > 0.3
precision_score(y_train_5,y_train_pred)

In [None]:
recall_score(y_train_5, y_train_pred)

Note: 
1. when using roc_curve and roc_auc_score, use array of scores/probabilities

2. when using recall_score and precision_score, use array of predicted classes

3. convert array of score to array of classes, filter by thresholds, e.g. ary_c=ary_s>100

## Multiclass Classification

In [None]:
#This automatically detects multiclass and implement one-vs-all classification 
some_digit_scores = sgd_clf.fit(X_train, y_train)

In [None]:
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

In [None]:
sgd_clf.predict([some_digit])

In [None]:
np.argmax(some_digit_scores)

In [None]:
sgd_clf.classes_

* To specify one-vs-one or one-vs-all, use OneVsOneClassifier or OneVsRestClassifier

In [None]:
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

In [None]:
len(ovo_clf.estimators_)

In [None]:
##Training RandomForestClassifier is just as easy
##Decision Tree can directly classify instances into multiple classes
forest_clf.fit(X_train,y_train)
forest_clf.predict([some_digit])

In [None]:
forest_clf.predict_proba([some_digit])

In [None]:
print(len(forest_clf.estimators_))

In [None]:
cross_val_score(sgd_clf,X_train,y_train, cv=3, scoring="accuracy")

In [None]:
##scaling could improve the accuracy
##standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf,X_train_scaled,y_train, cv=3, scoring="accuracy")

In [None]:
y_train_pred = cross_val_predict(sgd_clf,X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
##add each columns -> (i,1)+(i,2)+...+(i,n) [ith row]
##== actual number of each digit (row) 
#columns are predicted classes
##e.g. (1,5) = actual class is 1, predicted class is 5
row_sums = conf_mx.sum(axis=1, keepdims=True)

##ratio of missed class to actual class
##e.g. (1,5) is percentage that actual class 1 is misclassified as 5
norm_conf_mx = conf_mx/row_sums

In [None]:
np.fill_diagonal(norm_conf_mx,0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

See images that were classifiled appropriately and mislassifed

In [None]:
def plot_digits(mtrx, images_per_row=5):

    for i in range(len(mtrx)):
        plt.subplot()
        

In [None]:
cl_a, cl_b = "3","5"
X_aa = X_train[(y_train==cl_a) & (y_train_pred==cl_a)] ##correctly classiflied a
X_ab = X_train[(y_train==cl_a) & (y_train_pred==cl_b)] ##a classified as b
X_ba = X_train[(y_train==cl_b) & (y_train_pred==cl_a)] ##b classified as a
X_bb = X_train[(y_train==cl_b) & (y_train_pred==cl_b)] ##correctly classified b

plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()

The results look sensitive to rotation and shifting 

so to preprocess the images to es

In [None]:
X_aa = X_train[(y_train==cl_a) & (y_train_pred==cl_a)] ##correctly classiflied a
len(X_aa[:25])

In [None]:
import matplotlib as mpl
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    print(image.shape)
    plt.imshow(image, cmap = mpl.cm.binary, **options)
    plt.axis("off")

def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")

In [None]:
row_img=[]
aa = X_aa[:25]
aa1 = [a.reshape(28,28) for a in aa[0:5]]
# row_img.append(np.concatenate(aa1, axis=1)
a = np.concatenate(aa1,axis=1)
for i in range(5):
    row_img.append(a)
image = np.concatenate(a, axis=0)
print(np.array(aa1).shape)
np.concatenate(np.array(aa1),axis=1).shape

In [None]:
a = np.arange(0,3920).reshape(5,28,28)
b = np.concatenate(a, axis=0)
c = np.concatenate(a, axis=1)
a.shape

In [None]:
c.shape

In [None]:
5*4*3*2
d = np.arange(0,120).reshape(5,4,3,2)
np.concatenate(d,axis=2).shape

## Multilabel Classification

output would be a tuple

In [None]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= "7")
y_train_odd = (np.int64(y_train) % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd] ##vertical + vertical
y_multilabel ## first col == 7,8,9 second col == odd

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
knn_clf.predict([some_digit])

## metrics to evaluate multilabel classifier
e.g. compute f1 score for each label then simply compute the average score

In [None]:
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
f1_score(y_train, y_train_knn_pred, average="macro") ##average macro== equally likely
##average weighted == the num instances with that target label will be more weighted

## Multioutput Classification
output would be a tuple with non-binary classes

In [None]:
##add random noise
noise1 = rnd.randint(0,100,(len(X_train),784))
noise2 = rnd.randint(0,100,(len(X_test), 784))

##X is original + noise
X_train_mod = X_train+noise1
X_test_mod = X_test+noise2

##y (matrix) is original
y_train_mod = X_train
y_test_mod = X_test

In [None]:
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod][some_index])
plot_digit(clean_digit)

### Numba (nothing to do with this chapter)

In [23]:
from numba import jit
import math

@jit(nopython=True)
def hypot(x, y):
    # Implementation from https://en.wikipedia.org/wiki/Hypot
    x = abs(x);
    y = abs(y);
    t = min(x, y);
    x = max(x, y);
    t = t / x;
    return x * math.sqrt(1+t*t)

In [24]:
hypot(3.0, 4.0)

5.0

In [25]:
hypot.py_func(3.0,4.0)

5.0

In [26]:
%timeit hypot.py_func(3.0,4.0)

2.47 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [30]:
%timeit hypot(3.0,4.0)

571 ns ± 59.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [28]:
# hypot.inspect_types()

In [40]:
from numba import vectorize

@vectorize(['int64(int64, int64)'], target='cuda')
def add_ufunc(x, y):
    return x + y
a = np.arange(0,10, dtype=np.int64)
b = np.arange(0,10, dtype=np.int64)

In [41]:
print('a+b:\n', add_ufunc(a, b))

CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:

In [43]:
import spark

ModuleNotFoundError: No module named 'spark'