## hold out cross validation

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [8]:
# Load the Iris dataset, then display the shapes of the feature matrix and the label vector.
iris = datasets.load_iris()
tuple(arr.shape for arr in (iris.data, iris.target))

((150, 4), (150,))

### 用 train_test_split 来随机划分数据集，其中 40% 用于测试集，有 60 条数据，60% 为训练集，有 90 条数据

In [10]:
# Hold out 40% of the samples for testing; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)
(X_train.shape, y_train.shape)

((90, 4), (90,))

In [11]:
# Shapes of the held-out features and labels.
tuple(part.shape for part in (X_test, y_test))

((60, 4), (60,))

### 用 train 来训练，用 test 来评价模型的分数。

In [12]:
# Build a linear-kernel SVM, fit it on the training split only, and report
# its mean accuracy on the held-out split.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)  # fit() returns the estimator itself, so clf is unchanged by not chaining
clf.score(X_test, y_test)

0.9666666666666667

## k-fold cross validation

### 最简单的方法是直接调用 cross_val_score，这里用了 5 折交叉验证

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
# Pass an unfitted estimator: cross_val_score fits a fresh clone for each of
# the 5 folds and returns one score per fold.
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    cv=5,
)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

### 得到平均分 0.98，以及各折分数的 ±2 倍标准差（这只是对分数波动的粗略估计，并非严格意义上的 95% 置信区间）

In [15]:
# Summarise the per-fold scores as mean +/- two standard deviations.
mean_score = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_sigma))

Accuracy: 0.98 (+/- 0.03)


#### 我们可以直接看一下 K-fold 是怎样划分数据的：
#### X 有四个样本，把它分成 2 折，
#### 输出的是样本的下标（不是样本本身）：每行中后一个数组是测试集下标，前一个是训练集下标，
#### 每一行对应 1 折：

In [16]:
import numpy as np
from sklearn.model_selection import KFold

In [17]:
# Four samples cut into two folds. split() yields arrays of sample indices
# (positions in X), not the samples themselves.
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train_idx, test_idx in kf.split(X):
    print("%s %s" % (train_idx, test_idx))

[2 3] [0 1]
[0 1] [2 3]


#### 换一个同样含四个样本的 X，看看 LeaveOneOut 的划分是什么样子，
#### 它相当于把数据分成 4 折（折数等于样本数），
#### 每行中后一个数组是测试集下标，只有一个元素，前一个是训练集下标，
#### 每一行对应 1 折：

In [20]:
from sklearn.model_selection import LeaveOneOut
# Leave-one-out on four samples: every sample is the single test item once,
# while the remaining three form the training set. Printed values are indices.
X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train_idx, test_idx in loo.split(X):
    print("%s %s" % (train_idx, test_idx))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]
