In [None]:
# Measure accuracy with a hand-rolled cross-validation loop. StratifiedKFold performs
# stratified sampling, so each fold keeps the class proportions of the whole training set.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# random_state only has an effect when shuffle=True; recent scikit-learn versions raise
# a ValueError if random_state is given while shuffle is left at its default (False).
skfolds=StratifiedKFold(n_splits=3,shuffle=True,random_state=42)
for train_index,test_index in skfolds.split(X_train,Y_train_5):
    # Train a fresh, unfitted copy of the classifier on every fold
    clone_clf=clone(sgd_clf)
    X_train_folds=X_train[train_index]
    Y_train_folds=Y_train_5[train_index]
    X_test_folds=X_train[test_index]
    Y_test_folds=Y_train_5[test_index]
    clone_clf.fit(X_train_folds,Y_train_folds)
    Y_pred=clone_clf.predict(X_test_folds)
    # Fraction of correct predictions on the held-out fold
    n_correct=sum(Y_pred==Y_test_folds)
    print(n_correct/len(Y_pred))

In [None]:
# Evaluate the SGDClassifier model with cross_val_score(), using 3-fold cross-validation
# and accuracy as the score for each fold
from sklearn.model_selection import cross_val_score
cross_val_score(
    estimator=sgd_clf,
    X=X_train,
    y=Y_train_5,
    cv=3,
    scoring='accuracy',
)

In [None]:
#然而这组数据的空准确率就高达90%，说明accuracy不适合评估偏斜数据集(skewed dataset)，所以我们要用混淆矩阵，精度、召回率、AUC、ROC、F1分数来评估

In [None]:
# Confusion matrix of the training labels against the predictions in Y_train_pred
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=Y_train_5, y_pred=Y_train_pred)

In [None]:
# Precision, then recall, of Y_train_pred measured against Y_train_5
from sklearn.metrics import precision_score,recall_score,precision_recall_curve
for metric in (precision_score, recall_score):
    print(metric(Y_train_5, Y_train_pred))

In [None]:
# F1 score of Y_train_pred measured against Y_train_5
from sklearn.metrics import f1_score
f1_score(y_true=Y_train_5, y_pred=Y_train_pred)

In [None]:
# (1) Precision/recall trade-off: pushing precision up pulls recall down, and vice versa.
# (2) To understand the trade-off, look at how SGDClassifier makes its decision. For each
#     instance it computes a score with its decision function; if the score is greater than
#     the threshold the instance is assigned to the positive class, otherwise to the negative
#     class. Raising the threshold therefore generally increases precision but lowers recall;
#     lowering the threshold does the opposite.
# (3) sklearn does not let you set the threshold directly, but it exposes the decision
#     scores used for prediction through decision_function().
Y_scores=sgd_clf.decision_function([some_digit])
# A bare expression in the middle of a cell is not displayed, so print the score explicitly
print(Y_scores)
threshold=0 # the threshold sklearn uses by default is 0; lower it to raise recall, raise it to raise precision
Y_some_digit_pred=(Y_scores>threshold)
# Last expression of the cell: show the resulting prediction
Y_some_digit_pred

In [None]:
# How do we decide which threshold to use? First collect a score for every instance in the
# training set with cross_val_predict(), asking it to return decision scores rather than
# predicted labels.
Y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=Y_train_5,
    cv=3,
    method='decision_function',
)
print(Y_scores)

In [None]:
# With these scores available, precision_recall_curve() computes the precision and recall
# for the candidate thresholds.
from sklearn.metrics import precision_recall_curve
(precisions,
 recalls,
 thresholds) = precision_recall_curve(Y_train_5, Y_scores)

In [None]:
#最后，使用matplotlib绘制精度和召回率相对于阈值的函数图
import matplotlib.pyplot as plt
def plot_precision_recall_vs_threshold(precisions,recalls,thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before plotting,
    so both are expected to be one element longer than ``thresholds``.

    Draws on the current matplotlib figure; does not call ``plt.show()``.
    """
    plt.plot(thresholds,precisions[:-1],'b--',label='Precision')
    plt.plot(thresholds,recalls[:-1],'g--',label='Recall')
    plt.xlabel('Threshold')
    plt.legend(loc='upper left')
    # Precision and recall are ratios, so the y axis spans 0 to 1
    # (the previous upper limit of -1 produced an inverted, wrong range).
    plt.ylim([0,1])
# Draw both curves against the thresholds, then render the figure
plot_precision_recall_vs_threshold(
    precisions=precisions,
    recalls=recalls,
    thresholds=thresholds,
)
plt.show()

In [None]:
# Predict positive only when the decision score exceeds a hand-picked high threshold,
# aiming for a precision (not accuracy) above 90%.
# NOTE(review): whether 7000 actually reaches 90% depends on the scale of Y_scores —
# confirm against the precision/recall curve.
Y_train_pred_90=(Y_scores>7000)
# Only the last expression of a cell is displayed, so print both metrics explicitly
print(precision_score(Y_train_5,Y_train_pred_90))
print(recall_score(Y_train_5,Y_train_pred_90))

In [None]:
# ROC curve: compute the false-positive and true-positive rates from the decision scores.
# Note that this rebinds `thresholds`.
from sklearn.metrics import roc_curve
(fpr,
 tpr,
 thresholds) = roc_curve(Y_train_5, Y_scores)
def plot_roc_curve(fpr,tpr,label=None):
    """Plot a ROC curve (true positive rate against false positive rate).

    Parameters:
        fpr: false positive rates, used as x values.
        tpr: true positive rates, used as y values.
        label: optional legend label for the curve (default None).

    Draws on the current matplotlib figure; does not call ``plt.show()``.
    """
    plt.plot(fpr,tpr,linewidth=2,label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line
    plt.plot([0,1],[0,1],'k--')
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
# Draw the ROC curve from the computed rates and render the figure
# (the call previously used the undefined name `plt_roc_curve`).
plot_roc_curve(fpr,tpr)
plt.show()

In [None]:
# Area under the ROC curve for the decision scores
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true=Y_train_5, y_score=Y_scores)