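K-fold Cross-Validation é uma técnica de validação de modelos que melhora a avaliação do desempenho ao dividir o conjunto de dados em K partes (ou "folds"): o modelo é treinado em K−1 partes e avaliado na parte restante, e o processo é repetido K vezes, de modo que cada parte seja usada uma vez como conjunto de teste.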

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, make_scorer
from sklearn.metrics import recall_score

In [2]:
# Load the Iris dataset and display the shapes of its feature matrix and target vector.
iris = datasets.load_iris()
(iris.data.shape, iris.target.shape)

((150, 4), (150,))

# 4-fold

In [3]:
# Linear-kernel SVM evaluated with 4-fold cross-validation; no `scoring` is
# passed, so the estimator's default scorer is used.
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=4)
scores

array([1.        , 0.97368421, 0.97297297, 0.97297297])

# 5-fold

In [4]:
# Same classifier, now with 5 folds and the 'f1_weighted' scoring string.
cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    cv=5,
    scoring='f1_weighted',
)

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

In [5]:
# Keyword arguments for a metric function.
# NOTE(review): `params` is not referenced by any later cell visible here, and
# the scorers below are built with average='weighted', not 'micro' — confirm
# whether this dict is still needed or should be passed to `make_scorer`.
params = { 
    'average' : 'micro'
}

In [6]:
# Wrap `precision_score` as a scorer object, passing average='weighted' along with it.
scorer = make_scorer(score_func=precision_score, average='weighted')

In [7]:
# 5-fold cross-validation scored with the `scorer` object currently bound.
cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    cv=5,
    scoring=scorer,
)

array([0.96969697, 1.        , 0.96969697, 0.96969697, 1.        ])

In [8]:
# Rebind `scorer` to a scorer built from `recall_score`, again with average='weighted'.
scorer = make_scorer(score_func=recall_score, average='weighted')

In [9]:
# 5-fold cross-validation scored with the `scorer` object currently bound.
cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    cv=5,
    scoring=scorer,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

## Outro Exemplo

In [10]:
from sklearn.model_selection import KFold

In [11]:
# Four-element toy sequence split into 4 folds; each iteration prints the
# train indices followed by the test indices for that fold.
X = ["a", "b", "e", "f"]
kf = KFold(n_splits=4)
# NOTE: `train` and `test` remain bound to the last fold's indices once the
# loop finishes.
for train, test in kf.split(X):
    print("%s %s" % (train, test))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


In [12]:
# Toy dataset: four 2-D samples with binary labels.
X = np.array([[0., 0.],
              [1., 1.],
              [-1., -1.],
              [2., 2.]])
y = np.array([0,
              1,
              0,
              1])

# Select the last fold explicitly instead of relying on the `train`/`test`
# loop variables left over from the earlier `for` loop; this keeps the cell
# correct even if that loop is edited or re-run with different data.
train, test = list(kf.split(X))[-1]
X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

In [13]:
# Feature rows selected by the `train` indices.
X_train

array([[ 0.,  0.],
       [ 1.,  1.],
       [-1., -1.]])

In [14]:
# Feature rows selected by the `test` indices.
X_test

array([[2., 2.]])

In [15]:
# Labels selected by the `train` indices.
y_train

array([0, 1, 0])

In [16]:
# Labels selected by the `test` indices.
y_test

array([1])

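Obs.: se K for igual ao número de amostras (K = n), o K-fold é equivalente à estratégia "Leave-One-Out": cada fold de teste contém exatamente uma amostra, como no exemplo acima (4 amostras, 4 folds).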
