## Подключение библиотек

In [None]:
import pandas as pd
import numpy as np
import os

%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the shuffles and noise generated further down
# are reproducible. np.random.seed must be *called*: assigning to it would
# replace the function with an int and leave the generator unseeded.
np.random.seed(42)
# Larger default font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

### Загрузка данных

In [None]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset from OpenML, asking scikit-learn
# to cache the download.
# NOTE(review): the cells below index mnist.data / mnist.target like NumPy
# arrays; confirm this scikit-learn version does not return a DataFrame here.
mnist = fetch_openml('mnist_784', version = 1, cache = True)
# Store the labels as 8-bit integers.
mnist.target = mnist.target.astype(np.int8)

In [None]:
def sort_by_target(mnist, n_train=60000):
    """Sort the train and test partitions of ``mnist`` by label, in place.

    The first ``n_train`` rows form the training partition and the remaining
    rows the test partition; each partition is reordered independently so
    that its labels are ascending. Rows with equal labels keep their
    original relative order.

    Parameters
    ----------
    mnist : object with ``data`` and ``target`` attributes
        ``data`` and ``target`` must be NumPy arrays of equal length; both
        are overwritten with their sorted contents.
    n_train : int, optional
        Number of leading rows that belong to the training partition
        (60000 by default).

    Returns
    -------
    None
    """
    # A stable argsort gives the same order as sorting (target, index) pairs,
    # without building Python tuples, and also copes with an empty partition.
    train_order = np.argsort(mnist.target[:n_train], kind='stable')
    test_order = np.argsort(mnist.target[n_train:], kind='stable') + n_train
    # Fancy indexing returns copies, so each slice assignment reads from an
    # unmodified snapshot of the rows it is about to overwrite.
    mnist.data[:n_train] = mnist.data[train_order]
    mnist.target[:n_train] = mnist.target[train_order]
    mnist.data[n_train:] = mnist.data[test_order]
    mnist.target[n_train:] = mnist.target[test_order]
    
sort_by_target(mnist)

### Просмотр данных

In [None]:
mnist['data'], mnist['target']

In [None]:
X, y = mnist['data'], mnist['target']
print(X.shape)
print(y.shape)

In [None]:
some_digit = X[36000]
some_digit_image = some_digit.reshape(28,28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary, interpolation='nearest')
plt.axis('off')
plt.show()

In [None]:
y[36000]

In [None]:
def plot_digit(data):
    """Draw a single digit on the current matplotlib axes.

    Parameters
    ----------
    data : numpy.ndarray
        Array holding 784 pixel values; it is reshaped to 28x28 before
        being drawn with the binary colour map.
    """
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=mpl.cm.binary, interpolation='nearest')
    # 'off' matches the other plotting cells in this notebook and does not
    # rely on plt.axis() accepting a bool.
    plt.axis('off')

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Draw several digits as one tiled image on the current axes.

    Parameters
    ----------
    instances : sequence of numpy.ndarray
        Each element holds 784 pixel values and is reshaped to 28x28.
    images_per_row : int, optional
        Maximum number of digits per row; capped at ``len(instances)``.
    **options
        Extra keyword arguments forwarded to ``plt.imshow``.
    """
    side = 28
    total = len(instances)
    per_row = min(total, images_per_row)
    n_rows = (total - 1) // per_row + 1
    tiles = [digit.reshape(side, side) for digit in instances]
    # One blank tile, as wide as all the missing digits together, completes
    # the last row so every row strip ends up with the same width.
    n_missing = n_rows * per_row - total
    tiles.append(np.zeros((side, side * n_missing)))
    strips = [
        np.concatenate(tiles[start:start + per_row], axis=1)
        for start in range(0, n_rows * per_row, per_row)
    ]
    mosaic = np.concatenate(strips, axis=0)
    plt.imshow(mosaic, cmap=mpl.cm.binary, **options)
    plt.axis('off')

In [None]:
plt.figure(figsize=(9,9))
example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
plot_digits(example_images, images_per_row = 10)
plt.show()

### Создание обучающего и испытательного наборов

In [None]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

### Обучение двоичного классификатора

In [None]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=2)
sgd_clf.fit(X_train, y_train_5)

In [None]:
sgd_clf.predict([some_digit])

## Оценка производительности

### Перекрестная проверка

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Hand-rolled 3-fold stratified cross-validation of the "is it a 5"
# classifier; prints the accuracy obtained on each held-out fold.
skfolds = StratifiedKFold(n_splits=3)

for fit_idx, eval_idx in skfolds.split(X_train, y_train_5):
    fold_clf = clone(sgd_clf)  # separate copy of the estimator for each fold
    fold_clf.fit(X_train[fit_idx], y_train_5[fit_idx])

    y_pred = fold_clf.predict(X_train[eval_idx])
    n_correct = sum(y_pred == y_train_5[eval_idx])
    print(n_correct/len(y_pred))

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')

In [None]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline estimator that answers "not 5" for every sample.

    It learns nothing from the data; it exists to show what accuracy a
    classifier reaches by always predicting the negative class.
    """

    def fit(self, X, y=None):
        """Ignore the training data and return the estimator itself.

        Returning ``self`` follows the scikit-learn estimator convention,
        so calls such as ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)
    
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')

### Матрица неточностей

In [None]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
y_train_pred

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)

### Точность и полнота

In [None]:
from sklearn.metrics import precision_score, recall_score

print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))

In [None]:
from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)

### Соотношение точность/полнота

In [None]:
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
threshold = 7000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method='decision_function')

In [None]:
from sklearn.metrics import precision_recall_curve

precision, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequence
        Curve values; the last element of each is dropped before plotting
        so the series line up with ``thresholds`` (presumably they are one
        element longer - confirm against the caller).
    thresholds : sequence
        Threshold values used for the x axis.
    """
    curves = (
        (precisions, 'b--', 'Precision'),
        (recalls, 'g-', 'Recall'),
    )
    for values, line_format, name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=name, linewidth=2)
    plt.xlabel('Threshold')
    plt.legend(loc='center left', fontsize=16)
    # Fixed viewing window: scores on x, rates on y.
    plt.ylim([0, 1])
    plt.xlim([-45000, 45000])
    
plt.figure(figsize=(10,5))
plot_precision_recall_vs_threshold(precision, recalls, thresholds)
plt.show()

In [None]:
# Checks that the cross-validated predictions agree everywhere with
# thresholding the decision scores at 0: samples predicted as 5 have a
# score > 0, and samples predicted as "not 5" do not.
(y_train_pred == (y_scores > 0)).all()

In [None]:
y_train_pred_90 = (y_scores > 1000)
print(precision_score(y_train_5, y_train_pred_90))
print(recall_score(y_train_5, y_train_pred_90))

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the current axes.

    Both axes are fixed to the range [0, 1].
    """
    axes = plt.gca()
    axes.plot(recalls, precisions, 'b-', linewidth=2)
    axes.set_xlabel('Recalls', fontsize=16)
    axes.set_ylabel('Precisions', fontsize=16)
    axes.axis([0, 1, 0, 1])
    
plt.figure(figsize=(8,6))
plot_precision_vs_recall(precision, recalls)
plt.show()

## Кривая ROC

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve on the current axes.

    Parameters
    ----------
    fpr, tpr : sequence
        False-positive and true-positive rates, used as x and y values.
    label : str, optional
        Legend label for the curve.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], 'k--')
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('False Positive Rate', fontsize=16)
    axes.set_ylabel('True Positive Rate', fontsize=16)
    
plt.figure(figsize=(8,6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=2)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method='predict_proba')

In [None]:
y_probas_forest

In [None]:
y_scores_forest = y_probas_forest[:, 1]
y_scores_forest

In [None]:
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

In [None]:
plt.figure(figsize=(10,8))
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right', fontsize=20)
plt.axis([0, 1, 0, 1])
plt.show()

In [None]:
roc_auc_score(y_train_5, y_scores_forest)

In [None]:
y_train_pred_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3)
y_train_pred_forest

In [None]:
print(precision_score(y_train_5, y_train_pred_forest))
print(recall_score(y_train_5, y_train_pred_forest))

In [None]:
cross_val_score(forest_clf, X_train, y_train_5, cv=3, scoring='accuracy')

In [None]:
forest_clf.fit(X_train, y_train_5)
forest_clf.score(X_train, y_train_5)

## Многоклассовая классификация

In [None]:
# Classifier: refit the SGD model on the digit labels (y_train) instead of
# the binary "is it a 5" target, then classify the sample digit.

sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

In [None]:
some_digits_scores = sgd_clf.decision_function([some_digit])
some_digits_scores

In [None]:
np.argmax(some_digits_scores)

In [None]:
sgd_clf.classes_

In [None]:
sgd_clf.classes_[5]

In [None]:
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=2))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

In [None]:
len(ovo_clf.estimators_)

In [None]:
# Случайные Леса

forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])

In [None]:
print(forest_clf.base_estimator_,
      forest_clf.classes_,
      forest_clf.n_features_,
      forest_clf.n_outputs_, sep='\n')

In [None]:
print(forest_clf.get_params(),
      forest_clf.predict_proba([some_digit]),
      forest_clf.score(X_train, y_train), sep='\n\n')

In [None]:
# Оценочки

cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy', verbose=6)

In [None]:
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring='accuracy', verbose=6)

In [None]:
# Scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Cast the pixels to float64, then fit the scaler on the training set and
# transform it in one step.
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
#cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')

In [None]:
cross_val_score(forest_clf, X_train_scaled, y_train, cv=3, scoring='accuracy',verbose=6)

## Анализ ошибок

In [None]:
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_m = confusion_matrix(y_train, y_train_pred)
conf_m

In [None]:
def plot_confusion_matrix(matrix):
    """Show ``matrix`` as a colour-coded image with a colour bar.

    A new 8x8-inch figure is created for every call.
    """
    figure = plt.figure(figsize=(8, 8))
    heatmap = figure.add_subplot(1, 1, 1).matshow(matrix)
    figure.colorbar(heatmap)
    
plot_confusion_matrix(conf_m)

In [None]:
plt.matshow(conf_m, cmap=plt.cm.gray)
plt.show()

In [None]:
row_sums = conf_m.sum(axis=1, keepdims=True)
norm_conf_mx = conf_m / row_sums

In [None]:
np.fill_diagonal(norm_conf_mx,0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
plot_confusion_matrix(norm_conf_mx)

In [None]:
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()

## Многозначная классификация

In [None]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
knn_clf.predict([some_digit])

In [None]:
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, verbose=6)
f1_score(y_multilabel, y_train_knn_pred, average='macro')

## Многовыходовая классификация

In [None]:
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

In [None]:
some_index=5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
plt.show()

In [None]:
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)

## Best Classifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are searched below.
knn_clf = KNeighborsClassifier()
# Candidate values: 3 neighbour counts x 2 weighting schemes = 6 combinations.
params = [
          {'n_neighbors' : [3, 4, 5],
          'weights' : ['uniform', 'distance']},
]

# 3-fold cross-validated grid search scored by accuracy.
grid_search = GridSearchCV(knn_clf, params, scoring='accuracy', cv=3, verbose=8)
# NOTE(review): the fit on the scaled data is commented out, so grid_search is
# still unfitted at this point; cells that read best_params_ / cv_results_
# before the later fit presumably fail unless this line is re-enabled -
# confirm the intended execution order.
#grid_search.fit(X_train_scaled, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [None]:
final_model = grid_search.best_estimator_
final_model

In [None]:
grid_search.best_score_

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

In [None]:
# Unscaled dataset: run the grid search on the raw training data.
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [None]:
final_model = grid_search.best_estimator_
final_model

In [None]:
grid_search.best_score_

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

In [None]:
# sklearn.externals.joblib is a deprecated alias that newer scikit-learn
# releases no longer ship; prefer the standalone joblib package and fall
# back to the bundled copy only where joblib itself is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the best estimator found by the grid search to disk.
my_model = grid_search.best_estimator_
joblib.dump(my_model, 'my_model.pkl')

In [None]:
# Random forests

rnd_forest = RandomForestClassifier(random_state=2)

# Search grid: 8 x 2 x 2 x 3 = 96 parameter combinations.
f_params=[
    {'n_estimators':[80, 110, 140, 170, 200, 250, 300, 350],
     'criterion':['gini', 'entropy'],
     'bootstrap':[True, False],
     'class_weight':['balanced', 'balanced_subsample', None]}
]

# With cv=3 every combination is evaluated on three folds. This rebinds
# grid_search, replacing the search object created for the KNN model.
grid_search = GridSearchCV(rnd_forest, f_params, scoring='accuracy', cv=3, verbose=8)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [None]:
grid_search.best_score_

In [None]:
final_model = grid_search.best_estimator_
final_model

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

### Оценка на испытательном наборе

In [None]:
final_model = joblib.load('my_model.pkl')
f_predictions = final_model.predict(X_test)
n_correct = sum(f_predictions == y_test)
print(n_correct / len(y_test))

## Функция предварительной обработки

In [None]:
# Import from the public scipy.ndimage namespace; the
# scipy.ndimage.interpolation submodule is a deprecated alias.
from scipy.ndimage import shift

In [None]:
def shift_image(image, dx, dy):
    """Return a copy of a flattened 28x28 image translated by (dx, dy).

    Parameters
    ----------
    image : numpy.ndarray
        Array holding 784 pixel values.
    dx, dy : int or float
        Offset along the columns (dx) and along the rows (dy).

    Returns
    -------
    numpy.ndarray
        The shifted image flattened to one dimension; pixels that move in
        from outside the frame are filled with 0.
    """
    pixels_2d = image.reshape((28, 28))
    # Offsets are given in axis order (rows first), hence dy before dx.
    moved = shift(pixels_2d, [dy, dx], mode="constant", cval=0)
    return moved.ravel()

In [None]:
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12,3))
plt.subplot(131)
plt.title("Original", fontsize=14)
plt.imshow(image.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(132)
plt.title("Shifted down", fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(133)
plt.title("Shifted left", fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()

In [None]:
# Build the augmented training set: the original images followed by four
# copies, each shifted by one pixel in a different direction. Labels are
# repeated in the same order.
offsets = ((1, 0), (-1, 0), (0, 1), (0, -1))

X_parts = [X_train]
for dx, dy in offsets:
    X_parts.append(np.array([shift_image(digit, dx, dy) for digit in X_train]))

X_train_augmented = np.concatenate(X_parts)
y_train_augmented = np.tile(y_train, len(offsets) + 1)

In [None]:
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [None]:
# Reuse the hyperparameters selected by the most recent grid search. They are
# stored in the best_params_ attribute of grid_search; there is no variable
# called grid_search_best_params_.
rnd_clf = RandomForestClassifier(**grid_search.best_params_)

In [None]:
rnd_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the model that was trained on the augmented data (rnd_clf);
# knn_clf was never fitted on X_train_augmented.
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)