In [None]:
# 2020-10-22 created by Akson

In [None]:
# Code3.1
# Load the MNIST dataset

from sklearn.datasets import fetch_openml

# as_frame=False makes fetch_openml return NumPy arrays. Recent scikit-learn
# releases return a pandas DataFrame for this dataset by default, which breaks
# the positional row access (`X[0]`) and `.reshape(28, 28)` used in later cells.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [None]:
# Code3.2
# Pull the feature matrix and the label vector out of the dataset,
# then report the shape of each

X, y = mnist['data'], mnist['target']
print(X.shape)
print(y.shape)

In [None]:
# Code3.3
# Render the first sample as an image

import matplotlib as mpl
import matplotlib.pyplot as plt

# Reshape the flat sample into a 28x28 grid so it can be shown as a picture.
some_digit = X[0]
some_digit_image = some_digit.reshape((28, 28))

plt.imshow(some_digit_image, cmap='binary')
plt.axis('off')
plt.show()

In [None]:
# Code3.4
# Build the training set, training labels, test set and test labels

import numpy as np

# Convert the labels to an (unsigned 8-bit) integer dtype
y = y.astype(np.uint8)

# The first 60000 rows are used for training; everything after is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [None]:
# Code3.5
# Start with a binary problem, using the digit 5 as the positive class

# Boolean targets: True where the label equals 5
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [None]:
# Code3.6
# Train a stochastic gradient descent classifier on the 5 / not-5 target

from sklearn.linear_model import SGDClassifier

# Fixed random_state so repeated runs give the same model
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

# Cell output: the prediction for the first sample
sgd_clf.predict([some_digit])

In [None]:
# Code3.7
# Measure accuracy with 3-fold cross-validation

from sklearn.model_selection import cross_val_score

cross_val_score(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    scoring='accuracy',
)

In [None]:
# Code3.8
# Confusion matrix (1): collect cross-validated predictions

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
)

In [None]:
# Code3.9
# Confusion matrix (2): compare the true targets with the predictions

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Code3.10
# Precision of the cross-validated predictions

from sklearn.metrics import precision_score

precision_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Code3.11
# Recall of the cross-validated predictions

from sklearn.metrics import recall_score

recall_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Code3.12
# F1 score of the cross-validated predictions

from sklearn.metrics import f1_score

f1_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Code3.13
# Predict by comparing the decision score with a threshold

# Decision score of the first sample, evaluated against two thresholds in turn;
# after the loop `threshold` and `y_some_digit_pred` hold the 8000 case.
y_scores = sgd_clf.decision_function([some_digit])
for threshold in (0.0, 8000):
    y_some_digit_pred = (y_scores > threshold)

# Replace the single score with cross-validated decision scores for the
# whole training set; later cells read this `y_scores`.
y_scores = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    method='decision_function',
)
print(y_scores)

In [None]:
# Code3.14
# Compute precision and recall for every candidate threshold

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(
    y_train_5,
    y_scores,
)

In [None]:
# Code3.15
# Plot precision and recall against the decision threshold

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : array-like
        Curve values. The last element of each is dropped before plotting,
        so both must be one element longer than ``thresholds``.
    thresholds : array-like
        Threshold values used as the x-axis.

    The figure is shown immediately via ``plt.show()``; nothing is returned.
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label='precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='recall')
    # The curves carry labels, so draw the legend; without it the two lines
    # cannot be told apart in the rendered figure.
    plt.xlabel('threshold')
    plt.legend(loc='center right')
    plt.grid(True)
    plt.show()

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

In [None]:
# Code3.16
# Find the threshold for a given target precision

# np.argmax over a boolean array yields the index of its first True entry,
# i.e. the first position where precision reaches 0.90.
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

y_train_pred_90 = y_scores >= threshold_90_precision

print(precision_score(y_true=y_train_5, y_pred=y_train_pred_90))
print(recall_score(y_true=y_train_5, y_pred=y_train_pred_90))

In [None]:
# Code3.17
# ROC curve and AUC

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Store the ROC thresholds under their own name so the `thresholds` array
# produced by precision_recall_curve (read by Code3.16) is not overwritten.
fpr, tpr, roc_thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib figure.

    Parameters
    ----------
    fpr, tpr : array-like
        x and y coordinates of the curve.
    label : str, optional
        Legend label for the curve.

    Does not call ``plt.show()``, so several curves can share one figure.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

plot_roc_curve(fpr, tpr)

plt.show()

# Cell output: area under the ROC curve
roc_auc_score(y_train_5, y_scores)

In [None]:
# Code3.18
# Use a random forest

from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the results are reproducible across runs.
forest_clf = RandomForestClassifier(random_state=42)

# Out-of-fold class probabilities; column 1 is the positive (True) class.
y_probas_forest = cross_val_predict(
    forest_clf, X_train, y_train_5, cv=3, method='predict_proba'
)

# Derive the hard predictions from the same probabilities instead of running
# a second cross-validation: this keeps predictions and scores consistent
# (they come from the same fitted forests) and halves the training cost.
# The forest predicts the class with the highest mean probability, and the
# classes of the boolean target are ordered [False, True].
y_predict_forest = np.argmax(y_probas_forest, axis=1).astype(bool)

In [None]:
# Code3.19
# Evaluate the random forest

# Use the probability of the positive class (column 1) as the score.
y_scores_forest = y_probas_forest[:, 1]
# Separate name for the ROC thresholds so the `thresholds` array from
# precision_recall_curve (Code3.14) is not clobbered.
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Overlay the SGD curve and the random forest curve for comparison.
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right')
plt.show()

print(roc_auc_score(y_train_5, y_scores_forest))

print(precision_score(y_train_5, y_predict_forest))
print(recall_score(y_train_5, y_predict_forest))

In [None]:
# Code3.20
# Multiclass classification with SVC

from sklearn.svm import SVC

svm_clf = SVC()
# Fit on the full digit labels (y_train), not the binary 5 / not-5 target.
# Being the last expression, the value returned by fit is the cell's output.
svm_clf.fit(X_train, y_train)

In [None]:
# Code3.21
# Inspect the SVC result

# Print the prediction: it is not the last expression of the cell, so
# without print() its value would be computed and silently discarded.
print(svm_clf.predict([some_digit]))
some_digit_scores = svm_clf.decision_function([some_digit])

print(some_digit_scores)
best_class_index = np.argmax(some_digit_scores)
print(best_class_index)
print(svm_clf.classes_)
# Look up the class at the winning index rather than a hard-coded position,
# so the cell stays correct if `some_digit` is changed.
print(svm_clf.classes_[best_class_index])


In [None]:
# Code3.22
# Force a specific multiclass strategy (here: one-vs-rest)

from sklearn.multiclass import OneVsRestClassifier

# Wrap a fresh SVC in the one-vs-rest meta-estimator and train it.
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(X_train, y_train)

print(ovr_clf.predict([some_digit]))
# Number of underlying estimators that were fitted
print(len(ovr_clf.estimators_))


In [None]:
# Code3.23
# Train the SGD classifier on the digit labels and test it

# Note: this refits the same `sgd_clf` object that was trained on y_train_5.
sgd_clf.fit(X_train, y_train)
print(sgd_clf.predict([some_digit]))

# Scores the classifier assigns to every label for this sample
print(sgd_clf.decision_function([some_digit]))

# Cross-validation
print(
    cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
)

In [None]:
# Code3.24
# Standardize the data, then test again

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# Evaluate on the scaled features. Passing the raw X_train here would just
# repeat the unscaled cross-validation from Code3.23.
print(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy'))

In [None]:
# Code3.25
# Build the confusion matrix

# Cross-validated predictions on the scaled features
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train_scaled,
    y_train,
    cv=3,
)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
print(conf_mx)

# Show the matrix as a grayscale image
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# Code3.26
# Work with error rates instead of absolute error counts

# Divide every row by its own total (keepdims keeps the sums as a column
# so the division broadcasts row-wise).
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = np.divide(conf_mx, row_sums)

# Zero out the diagonal so only the off-diagonal entries remain
np.fill_diagonal(norm_conf_mx, 0)
print(norm_conf_mx)

plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# Code3.27
# Multilabel classification

from sklearn.neighbors import KNeighborsClassifier

# Two boolean labels per sample: "digit >= 7" and "digit is odd"
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
# Stack the two label vectors as columns
y_multilabel = np.c_[y_train_large, y_train_odd]
# Fixed typo: `y_mutilabel` was undefined and raised NameError
print(y_multilabel.shape)

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)


In [None]:
# Code3.28
# Evaluate the KNN result

print(knn_clf.predict([some_digit]))
# Fixed typos: the target is `y_multilabel` (defined in Code3.27) and the
# metric is `f1_score` (imported in Code3.12); the misspelled names
# raised NameError.
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
# Macro averaging: the F1 of each label is computed, then averaged unweighted
print(f1_score(y_multilabel, y_train_knn_pred, average='macro'))

In [None]:
# Code3.29
# Add noise to the images

# Integer noise in [0, 100) with one value per pixel (784 per sample)
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
# Fixed typo: `x_test` (lowercase) was undefined and raised NameError
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise


In [None]:
# Code3.30
# Show a noisy image

# First noisy training sample, reshaped to a 28x28 grid for display
noise_digit = X_train_mod[0]
noise_digit_image = noise_digit.reshape((28, 28))

plt.imshow(noise_digit_image, cmap='binary')
plt.axis('off')
plt.show()

In [None]:
# Code3.31
# Show the image without noise

# The same training sample as in Code3.30, taken from the unmodified data
commen_digit = X_train[0]
commen_digit_image = commen_digit.reshape((28, 28))

plt.imshow(commen_digit_image, cmap='binary')
plt.axis('off')
plt.show()

In [None]:
# Code3.32
# Clean the image

# NOTE(review): `y_train_mod` was referenced here but never defined in this
# notebook. The noise-free training images are assumed to be the intended
# target (noisy image in, clean image out) - confirm.
y_train_mod = X_train

# Fixed typo: `knn.clf` -> `knn_clf` (there is no object named `knn`).
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[0]])
print(clean_digit)