In [1]:
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

mpl.rcParams['legend.numpoints']=1

In [2]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Generate a blob dataset with a fixed seed, then split it into train and test
# parts (also seeded, so the split is the same on every run).
x,y=make_blobs(random_state=0)
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)

In [3]:
# Fit a logistic regression on the training split; the cell's displayed value
# is its score on the held-out test split (a single train/test evaluation).
logreg=LogisticRegression().fit(x_train,y_train)
logreg.score(x_test,y_test)



0.88

# 交叉验证

In [4]:
from sklearn.datasets import load_iris
iris=load_iris()

In [5]:
from sklearn.model_selection import cross_val_score

In [6]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris=load_iris()
logreg=LogisticRegression()

In [7]:
# No cv argument is given, so cross_val_score falls back to its default
# splitting (the recorded run returned three scores).
scores=cross_val_score(logreg,iris.data,iris.target)



In [8]:
scores

array([0.96078431, 0.92156863, 0.95833333])

In [9]:
# Same evaluation with the number of folds set explicitly to 5.
scores=cross_val_score(logreg,iris.data,iris.target,cv=5)



In [10]:
scores

array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])

## K折交叉验证

In [11]:
from sklearn.model_selection import KFold
# Explicit 5-fold splitter; shuffling is not requested here.
kfold=KFold(n_splits=5)

In [12]:
cross_val_score(logreg,iris.data,iris.target,cv=kfold)



array([1.        , 0.93333333, 0.43333333, 0.96666667, 0.43333333])

In [13]:
# Three folds without shuffling.
# NOTE(review): the recorded scores are all 0. Presumably the iris samples are
# ordered by class, so each test fold contains a class that never appears in
# the corresponding training folds -- confirm against the dataset.
kfold=KFold(n_splits=3)
cross_val_score(logreg,iris.data,iris.target,cv=kfold)



array([0., 0., 0.])

In [14]:
# Same three-fold evaluation, but with shuffling enabled and a fixed seed so
# the fold assignment is repeatable.
kfold=KFold(n_splits=3,shuffle=True,random_state=0)
cross_val_score(logreg,iris.data,iris.target,cv=kfold)



array([0.9 , 0.96, 0.96])

## 留一交叉验证

In [15]:
from sklearn.model_selection import LeaveOneOut
# Leave-one-out cross-validation; the recorded run produced 150 scores, whose
# mean is inspected in the following cells.
loo=LeaveOneOut()
scores=cross_val_score(logreg,iris.data,iris.target,cv=loo)













In [16]:
len(scores)

150

In [17]:
scores.mean()

0.9533333333333334

## 乱序切分

In [18]:
from sklearn.model_selection import ShuffleSplit
# Ten random splits, each using half of the samples for training and half for
# testing. random_state is fixed so the splits (and therefore the scores) are
# identical on every Restart & Run All; outputs recorded from an unseeded run
# will not match exactly.
shuffle_split=ShuffleSplit(test_size=.5,train_size=.5,n_splits=10,random_state=0)

In [19]:
cross_val_score(logreg,iris.data,iris.target,cv=shuffle_split)



array([0.93333333, 0.98666667, 0.96      , 0.96      , 0.96      ,
       0.86666667, 0.94666667, 0.96      , 0.92      , 0.86666667])

# 网格搜索

## 手动遍历超参数

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [21]:
# Seeded train/test split of the iris data; this rebinds the x_train/x_test/
# y_train/y_test names that earlier held the blob dataset.
x_train,x_test,y_train,y_test= train_test_split(iris.data,iris.target,random_state=0)

In [22]:
# Naive grid search: fit an SVC for every (gamma, C) pair and keep the pair
# with the highest score on the test split.
# NOTE(review): the same test split is used both to choose the parameters and
# to report best_score, so that number is presumably optimistic as an estimate
# of performance on unseen data.
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for c in [0.001,0.01,0.1,1,10,100]:
        svm=SVC(gamma=gamma,C=c)
        svm.fit(x_train,y_train)
        score=svm.score(x_test,y_test)
        # Strict '>' keeps the first pair encountered when several pairs tie.
        if score > best_score:
            best_score=score
            best_parameters={'C':c,'gamma':gamma}
print('best score',best_score)
print('best parameters',best_parameters)

best score 0.9736842105263158
best parameters {'C': 100, 'gamma': 0.001}


## 训练集+验证集+测试集

In [23]:
from sklearn.svm import SVC

In [24]:
# First split: set aside a test set; the remainder (train + validation) is what
# model selection is allowed to see.
x_trainval,x_test,y_trainval,y_test=train_test_split(iris.data,iris.target,random_state=0)

In [25]:
# Second split: divide the train+validation portion into a training set and a
# validation set (different seed from the first split).
x_train,x_valid,y_train,y_valid=train_test_split(x_trainval,y_trainval,random_state=1)

In [26]:
# Grid search using the validation set for selection: each (gamma, C) pair is
# fit on the training set and scored on the validation set; the test set is not
# touched in this loop.
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm=SVC(gamma=gamma,C=C)
        svm.fit(x_train,y_train)
        score=svm.score(x_valid,y_valid)
        # Strict '>' keeps the first pair encountered when several pairs tie.
        if score>best_score:
            best_score=score
            best_parameters={'gamma':gamma,'C':C}


In [27]:
best_parameters

{'gamma': 0.001, 'C': 10}

In [28]:
# Rebuild the model with the selected parameters, fit it on the combined
# train+validation data, and evaluate once on the test set.
svm=SVC(**best_parameters)
svm.fit(x_trainval,y_trainval)
test_score=svm.score(x_test,y_test)
# 'best score' is the validation score found during the search above;
# 'test score' is the score on the test set, which the search never used.
print('best score',best_score)
print('best parameters',best_parameters)
print('test score',test_score)

best score 0.9642857142857143
best parameters {'gamma': 0.001, 'C': 10}
test score 0.9210526315789473


## GridSearchCV: grid_search + cross_validation

In [29]:
# Candidate values for the SVC parameters C and gamma: six values each,
# i.e. 36 combinations in total.
param_grid={'C':[0.001,0.01,0.1,1,10,100],
           'gamma':[0.001,0.01,0.1,1,10,100]}

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid search over param_grid for an SVC, using 5-fold cross-validation
# (cv=5) to score the parameter combinations.
grid_search=GridSearchCV(SVC(),param_grid,cv=5)

In [31]:
x_train,x_test,y_train,y_test=train_test_split(iris.data,iris.target,random_state=0)

In [32]:
# Run the search on the training split only; the test split is kept aside and
# is first used by grid_search.score in a later cell.
grid_search.fit(x_train,y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [33]:
grid_search.score(x_test,y_test)

0.9736842105263158

In [34]:
grid_search.best_params_

{'C': 100, 'gamma': 0.01}

In [35]:
grid_search.best_score_

0.9732142857142857

In [36]:
grid_search.best_estimator_

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## 检视交叉验证的结果

In [38]:
grid_search.cv_results_

{'mean_fit_time': array([0.00062242, 0.00053997, 0.00048294, 0.00051036, 0.00050578,
        0.00047007, 0.00055604, 0.00050597, 0.00047884, 0.00046554,
        0.00060997, 0.00054011, 0.00046749, 0.00062881, 0.00043135,
        0.00045958, 0.00056334, 0.00057216, 0.00047231, 0.00041161,
        0.00032482, 0.00040827, 0.00058885, 0.00061393, 0.0003942 ,
        0.00030408, 0.00028505, 0.00034947, 0.00061378, 0.00060248,
        0.00029955, 0.00028429, 0.00032096, 0.00037169, 0.00060759,
        0.00065007]),
 'std_fit_time': array([1.29195647e-04, 3.49851624e-05, 2.18605786e-05, 4.80493830e-05,
        4.96358852e-05, 4.27720539e-06, 9.79095904e-05, 1.02733058e-04,
        2.27849340e-05, 5.03419009e-06, 1.05024897e-04, 3.91060546e-05,
        3.91455137e-05, 1.92128964e-04, 2.95254242e-06, 1.60295320e-05,
        4.28441305e-05, 3.10059306e-05, 3.73336394e-06, 2.39985088e-05,
        7.49954348e-06, 6.20849671e-05, 1.40399193e-05, 2.58043370e-05,
        6.65082034e-05, 2.53038390e-0

In [39]:
# Lay the mean cross-validated test scores out as a 6x6 array
# (one entry per parameter combination); this can be drawn as a heat map.
scores=np.array(grid_search.cv_results_['mean_test_score']).reshape(6,6)

In [40]:
scores

array([[0.36607143, 0.36607143, 0.36607143, 0.36607143, 0.36607143,
        0.36607143],
       [0.36607143, 0.36607143, 0.36607143, 0.36607143, 0.36607143,
        0.36607143],
       [0.36607143, 0.69642857, 0.91964286, 0.95535714, 0.36607143,
        0.36607143],
       [0.69642857, 0.92857143, 0.96428571, 0.94642857, 0.91964286,
        0.50892857],
       [0.92857143, 0.96428571, 0.96428571, 0.9375    , 0.91964286,
        0.57142857],
       [0.96428571, 0.97321429, 0.95535714, 0.94642857, 0.91964286,
        0.57142857]])

## 手动切分

In [41]:
# Nested cross-validation: the estimator passed to cross_val_score is itself a
# GridSearchCV (inner cv=5 over param_grid), evaluated with an outer cv=5.
scores=cross_val_score(GridSearchCV(SVC(),param_grid,cv=5),iris.data,iris.target,cv=5)

In [42]:
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [44]:

def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    """Run nested cross-validation by hand and return the outer test scores.

    For every outer split, each parameter setting is scored by cross-validating
    on the outer training portion only. The best setting is then refit on the
    whole outer training portion and scored on the outer test portion.

    Parameters
    ----------
    X, y : array-like
        Data and labels; must support indexing with an array of integer
        positions (e.g. numpy arrays).
    inner_cv, outer_cv : objects with a ``split(X, y)`` method
        Each must yield ``(train_indices, test_indices)`` pairs whose indices
        are positions within the arrays passed to ``split``.
    Classifier : callable
        ``Classifier(**parameters)`` must return an object that provides
        ``fit(X, y)`` and ``score(X, y)``.
    parameter_grid : iterable of dict
        Keyword-argument dictionaries to try.

    Returns
    -------
    list
        One score per outer split, in the order the splits were produced.
    """
    outer_scores = []
    # Materialise the grid so it can be traversed once per outer split even if
    # a one-shot iterator was passed in.
    parameter_grid = list(parameter_grid)
    # Split the data by hand into training and test parts (outer folds).
    for training_samples, test_samples in outer_cv.split(X, y):
        X_outer_train = X[training_samples]
        y_outer_train = y[training_samples]
        # Reset the best parameters for this outer fold.
        best_params = {}
        best_score = -np.inf
        # Iterate over the candidate parameter settings.
        for parameters in parameter_grid:
            # Scores of this setting across the inner folds.
            cv_scores = []
            # Split the outer training part again into the actual training set
            # and a validation set. The inner indices are positions within the
            # outer training subset, so they must index that subset rather
            # than the full X / y.
            for inner_train, inner_test in inner_cv.split(X_outer_train, y_outer_train):
                clf = Classifier(**parameters)
                clf.fit(X_outer_train[inner_train], y_outer_train[inner_train])
                score = clf.score(X_outer_train[inner_test], y_outer_train[inner_test])
                cv_scores.append(score)
            # Mean score over the inner folds.
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = parameters
        # Refit with the best setting on the whole outer training part and
        # evaluate on the outer test part, which the selection never saw.
        clf = Classifier(**best_params)
        clf.fit(X_outer_train, y_outer_train)
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return outer_scores