## 分类模型

In [2]:
from sklearn import metrics

# Toy binary-classification example: predicted labels vs. ground-truth labels.
y_pred = [0, 0, 0, 1, 1, 1]
y_true = [0, 1, 0, 1, 1, 0]

# The confusion matrix is printed on its own, without a caption.
print(metrics.confusion_matrix(y_true, y_pred))

# Gather every (caption, value) pair first, then print them in order.
metric_rows = [
    ('准确率：', metrics.accuracy_score(y_true, y_pred)),
    # average=None reports one precision value per class rather than a single number.
    ('类别精度:', metrics.precision_score(y_true, y_pred, average=None)),
    ('宏平均精度:', metrics.precision_score(y_true, y_pred, average="macro")),
    ('微平均召回率:', metrics.recall_score(y_true, y_pred, average='micro')),
    ('加权平均F1:', metrics.f1_score(y_true, y_pred, average='weighted')),
]
for caption, value in metric_rows:
    print(caption, value)

[[2 1]
 [1 2]]
准确率： 0.6666666666666666
类别精度: [0.66666667 0.66666667]
宏平均精度: 0.6666666666666666
微平均召回率: 0.6666666666666666
加权平均F1: 0.6666666666666666


## 回归模型

In [3]:
from sklearn.metrics import explained_variance_score, r2_score

# Small regression example: predictions vs. true targets.
y_pred = [3, -0.3, 2.2, 1.3]
y_true = [2.3, 0, 2.0, 1.0]

# Evaluate both regression metrics on the same pair of vectors.
regression_scorers = (
    ('explained_variance_score', explained_variance_score),
    ('r2', r2_score),
)
for caption, scorer in regression_scorers:
    print(caption, scorer(y_true, y_pred))

explained_variance_score 0.844682478959449
r2 0.7827084927314459


## 虚拟估计器产生基准得分

In [1]:
# Build an imbalanced binary dataset from iris:
# class 1 keeps its label, every other class is relabeled to -1.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target
# NOTE: y is the same array object as iris.target, so this in-place
# relabeling also changes iris.target for every later cell that reads it.
y[y != 1] = -1
# Hold out a test set; random_state=0 keeps the split reproducible.
# (The call was previously misspelled as `train_~test_split`, a syntax error.)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [2]:
# Display the relabeled target vector (only the values -1 and 1 remain).
y

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [3]:
# Compare the score of a linear SVM with that of a dummy (baseline) estimator.
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC

# Linear-kernel SVM: construct first, then fit as a separate step.
svc = SVC(kernel='linear', C=1)
svc.fit(X_train, y_train)
svc_test_score = svc.score(X_test, y_test)
print('liner svc classifier score:', svc_test_score)

# Baseline estimator configured with strategy='most_frequent'.
dummy = DummyClassifier(strategy='most_frequent', random_state=0).fit(X_train, y_train)
dummy_test_score = dummy.score(X_test, y_test)
print('Dummy classifier score:', dummy_test_score)

liner svc classifier score: 0.631578947368421
Dummy classifier score: 0.5789473684210527


In [4]:
# Same train/test split as the linear model, but with an RBF kernel.
svcrbf = SVC(kernel='rbf', C=1)
svcrbf.fit(X_train, y_train)
rbf_test_score = svcrbf.score(X_test, y_test)
print('rbf svc classifier score:', rbf_test_score)

rbf svc classifier score: 0.9736842105263158




## 过拟合与交叉验证

In [5]:
# Random-permutation (shuffle-split) cross-validation.
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(5)
ss = ShuffleSplit(
    n_splits=4,
    test_size=0.4,
    random_state=0,
)
# Each item produced by split() unpacks into (train indices, test indices).
for train_index, test_index in ss.split(X):
    fold = (train_index, test_index)
    print('%s, %s' % fold)

[1 3 4], [2 0]
[1 4 3], [0 2]
[4 0 2], [1 3]
[2 4 0], [3 1]


In [7]:
# Stratified K-fold cross-validation, meant for imbalanced data.
from sklearn.model_selection import StratifiedKFold

X = np.arange(10)
# Four samples of class 0 followed by six samples of class 1.
y = [0] * 4 + [1] * 6
skf = StratifiedKFold(n_splits=3, shuffle=False)
stratified_folds = skf.split(X, y)
for train_index, test_index in stratified_folds:
    print('%s, %s' % (train_index, test_index))

[2 3 6 7 8 9], [0 1 4 5]
[0 1 3 4 5 8 9], [2 6 7]
[0 1 2 4 5 6 7], [3 8 9]


In [10]:
# Leave-P-groups-out splitting.
from sklearn.model_selection import LeavePGroupsOut

X = np.arange(6)
y = [1] * 3 + [2] * 3
# Group id of each sample; three distinct groups of two samples each.
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for split_pair in lpgo.split(X, y, groups):
    train_index, test_index = split_pair
    print('%s, %s' % (train_index, test_index))

[4 5], [0 1 2 3]
[2 3], [0 1 4 5]
[0 1], [2 3 4 5]


In [11]:
# Splitting for time-series data.
from sklearn.model_selection import TimeSeriesSplit

X = np.array([
    [1, 2], [3, 4],
    [1, 2], [3, 4],
    [1, 2], [3, 4],
    [2, 2], [4, 6],
])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
# max_train_size=3 caps how many samples a training window may hold.
tscv = TimeSeriesSplit(
    n_splits=3,
    max_train_size=3,
)
time_folds = tscv.split(X, y)
for train_index, test_index in time_folds:
    print('%s, %s' % (train_index, test_index))

[0 1], [2 3]
[1 2 3], [4 5]
[3 4 5], [6 7]


## 交叉验证的综合评分

In [16]:
from sklearn.model_selection import cross_val_score

# Unfitted linear SVM; cross_val_score receives it together with cv=5.
svc2 = SVC(kernel='linear', C=1)
scores = cross_val_score(
    svc2,
    iris.data,
    iris.target,
    cv=5,
)
print(scores)

# Summarize as the mean score with a band of two standard deviations.
accuracy_summary = (scores.mean(), scores.std() * 2)
print("Accuracy: %0.2f(+/-%0.2f)" % accuracy_summary)

[0.73333333 0.83333333 0.56666667 0.53333333 0.66666667]
Accuracy: 0.67(+/-0.22)


In [18]:
# Choose the scoring method explicitly (here: 'f1_macro').
from sklearn import metrics

scores = cross_val_score(
    svc2,
    iris.data,
    iris.target,
    cv=5,
    scoring='f1_macro',
)
# Bare last expression so the notebook displays the score array.
scores

array([0.58333333, 0.77777778, 0.49935815, 0.52488688, 0.6031746 ])

In [19]:
# Pass an explicit cross-validation iterator instead of an integer fold count.
from sklearn.model_selection import ShuffleSplit

n_samples = len(iris.data)
ss = ShuffleSplit(
    n_splits=3,
    test_size=0.3,
    random_state=0,
)
# Bare last expression so the notebook displays the per-split scores.
cross_val_score(
    svc2,
    iris.data,
    iris.target,
    cv=ss,
)

array([0.64444444, 0.6       , 0.68888889])

In [21]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Evaluate two F1 variants in a single 10-fold cross-validation run.
clf = SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(
    clf,
    iris.data,
    iris.target,
    scoring=['f1_micro', 'f1_macro'],
    cv=10,
    return_train_score=False,
)

# Iterating a dict yields its keys, so this lists the result keys in sorted order.
print(sorted(scores))

# Timing arrays are printed without a caption.
for timing_key in ('fit_time', 'score_time'):
    print(scores[timing_key])

# Test scores are printed with a caption for each metric.
captioned_results = (
    ('f1_micro:', 'test_f1_micro'),
    ('f1_macro:', 'test_f1_macro'),
)
for caption, result_key in captioned_results:
    print(caption, scores[result_key])

['fit_time', 'score_time', 'test_f1_macro', 'test_f1_micro']
[0.00052357 0.00070524 0.00041914 0.00039625 0.00045466 0.0004797
 0.000633   0.0005784  0.0004375  0.00043941]
[0.00075221 0.00062609 0.00066018 0.00144672 0.00065017 0.00109005
 0.00102019 0.00068069 0.0006566  0.00084209]
f1_micro: [0.73333333 0.73333333 0.8        0.86666667 0.73333333 0.4
 0.66666667 0.46666667 0.73333333 0.66666667]
f1_macro: [0.58333333 0.58333333 0.72049689 0.82954545 0.65909091 0.38914027
 0.66063348 0.44444444 0.72222222 0.53416149]
