# K-fold cross-validation


## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Breast cancer data
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset and separate the feature matrix from the labels
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples as a test set. stratify=y keeps the class
# proportions of y in both parts; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=1
)

# Pipeline: standardize -> keep 2 principal components -> logistic regression
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
])

## K-Fold cross-validation

In [2]:
from sklearn.model_selection import KFold

# Plain 10-fold splitter. shuffle=False (the default, spelled out here) means
# the folds are taken in index order without shuffling.
kf = KFold(n_splits=10, shuffle=False)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [3]:
# split() returns a lazy generator of (train indices, validation indices)
# pairs; printing it shows the generator object, not the indices themselves.
genSplit = kf.split(X=X_train, y=y_train)
print(genSplit)

<generator object _BaseKFold.split at 0x7fdb0ad02c10>


In [4]:
# A split generator is exhausted after a single pass, so a fresh one
# has to be created before every loop over the folds.
genSplit = kf.split(X=X_train, y=y_train)

# Peek at the first ten training / validation indices of each fold
for idxTrain, idxVal in genSplit:
    print(idxTrain[:10], "...", idxVal[:10], "...")

[40 41 42 43 44 45 46 47 48 49] ... [0 1 2 3 4 5 6 7 8 9] ...
[0 1 2 3 4 5 6 7 8 9] ... [40 41 42 43 44 45 46 47 48 49] ...
[0 1 2 3 4 5 6 7 8 9] ... [80 81 82 83 84 85 86 87 88 89] ...
[0 1 2 3 4 5 6 7 8 9] ... [120 121 122 123 124 125 126 127 128 129] ...
[0 1 2 3 4 5 6 7 8 9] ... [160 161 162 163 164 165 166 167 168 169] ...
[0 1 2 3 4 5 6 7 8 9] ... [200 201 202 203 204 205 206 207 208 209] ...
[0 1 2 3 4 5 6 7 8 9] ... [240 241 242 243 244 245 246 247 248 249] ...
[0 1 2 3 4 5 6 7 8 9] ... [280 281 282 283 284 285 286 287 288 289] ...
[0 1 2 3 4 5 6 7 8 9] ... [320 321 322 323 324 325 326 327 328 329] ...
[0 1 2 3 4 5 6 7 8 9] ... [359 360 361 362 363 364 365 366 367 368] ...


In [5]:
# Fresh generator for this pass over the folds
genSplit = kf.split(X=X_train, y=y_train)

# Per fold: shapes of the training / validation index arrays and the class
# counts of the training part (KFold does not look at the labels when splitting)
for idxTrain, idxVal in genSplit:
    print(idxTrain.shape, idxVal.shape, np.bincount(y_train[idxTrain]))

(358,) (40,) [129 229]
(358,) (40,) [137 221]
(358,) (40,) [133 225]
(358,) (40,) [129 229]
(358,) (40,) [135 223]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [135 223]
(359,) (39,) [135 224]
(359,) (39,) [133 226]


In [6]:
# Fresh generator for this pass over the folds
genSplit = kf.split(X=X_train, y=y_train)

trainScores, valScores = [], []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Slice this fold's training and validation samples once
    XFit, yFit = X_train[idxTrain], y_train[idxTrain]
    XVal, yVal = X_train[idxVal], y_train[idxVal]

    # Refit the whole pipeline on the training part of the fold
    pipe_lr.fit(XFit, yFit)

    # Score on the data used for fitting and on the held-out fold
    trainScore = pipe_lr.score(XFit, yFit)
    valScore = pipe_lr.score(XVal, yVal)
    trainScores.append(trainScore)
    valScores.append(valScore)

    # Per-fold report (folds are numbered from 1)
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

# Mean and standard deviation of the training scores over the folds
print("\nTraining accuracy")
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

# Mean and standard deviation of the validation scores over the folds
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

Fold: 1, Training accuracy: 0.969, Validation accuracy: 0.875
Fold: 2, Training accuracy: 0.961, Validation accuracy: 0.975
Fold: 3, Training accuracy: 0.958, Validation accuracy: 0.975
Fold: 4, Training accuracy: 0.964, Validation accuracy: 0.950
Fold: 5, Training accuracy: 0.964, Validation accuracy: 0.925
Fold: 6, Training accuracy: 0.958, Validation accuracy: 0.900
Fold: 7, Training accuracy: 0.955, Validation accuracy: 0.975
Fold: 8, Training accuracy: 0.961, Validation accuracy: 0.975
Fold: 9, Training accuracy: 0.955, Validation accuracy: 1.000
Fold:10, Training accuracy: 0.958, Validation accuracy: 0.974

Training accuracy
Mean: 0.960
Std: 0.004

Crossed-validation accuracy
Mean: 0.952
Std: 0.038


## Stratified K-Folds cross-validation

In [7]:
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold splitter; shuffle=False is the default, spelled out here
skf = StratifiedKFold(n_splits=10, shuffle=False)
print(skf)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [8]:
# Fresh generator from the stratified splitter
genSplit = skf.split(X=X_train, y=y_train)

# Per fold: shapes of the training / validation index arrays and the class
# counts of the training part, to compare against the plain KFold counts
for idxTrain, idxVal in genSplit:
    print(idxTrain.shape, idxVal.shape, np.bincount(y_train[idxTrain]))

(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(358,) (40,) [133 225]
(359,) (39,) [134 225]
(359,) (39,) [134 225]


In [9]:
# Fresh generator from the stratified splitter
genSplit = skf.split(X=X_train, y=y_train)

trainScores, valScores = [], []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Slice this fold's training and validation samples once
    XFit, yFit = X_train[idxTrain], y_train[idxTrain]
    XVal, yVal = X_train[idxVal], y_train[idxVal]

    # Refit the whole pipeline on the training part of the fold
    pipe_lr.fit(XFit, yFit)

    # Score on the data used for fitting and on the held-out fold
    trainScore = pipe_lr.score(XFit, yFit)
    valScore = pipe_lr.score(XVal, yVal)
    trainScores.append(trainScore)
    valScores.append(valScore)

    # Per-fold report (folds are numbered from 1)
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

# Mean and standard deviation of the training scores over the folds
print("\nTraining accuracy")
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

# Mean and standard deviation of the validation scores over the folds
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

Fold: 1, Training accuracy: 0.966, Validation accuracy: 0.925
Fold: 2, Training accuracy: 0.961, Validation accuracy: 0.950
Fold: 3, Training accuracy: 0.958, Validation accuracy: 0.975
Fold: 4, Training accuracy: 0.966, Validation accuracy: 0.925
Fold: 5, Training accuracy: 0.961, Validation accuracy: 0.950
Fold: 6, Training accuracy: 0.964, Validation accuracy: 0.875
Fold: 7, Training accuracy: 0.958, Validation accuracy: 0.975
Fold: 8, Training accuracy: 0.961, Validation accuracy: 0.975
Fold: 9, Training accuracy: 0.955, Validation accuracy: 1.000
Fold:10, Training accuracy: 0.958, Validation accuracy: 0.974

Training accuracy
Mean: 0.961
Std: 0.004

Crossed-validation accuracy
Mean: 0.952
Std: 0.034


## Stratified k-fold cross-validation with scikit-learn helpers

- `cross_validate` can evaluate several scoring metrics in a single run.

In [10]:
from sklearn.model_selection import cross_validate

# Evaluate accuracy and F1 together over 10 folds; n_jobs=-1 lets
# scikit-learn run the folds in parallel on all available cores.
scores = cross_validate(
    pipe_lr,
    X_train,
    y_train,
    scoring=['accuracy', 'f1'],
    cv=10,
    n_jobs=-1,
)

# The result is a dict of per-fold arrays; print one formatted row per entry
print("\nCrossed-validation")
for k, v in scores.items():
    arrayPrint = ', '.join(f'{el:3.2f}' for el in v)
    print(f"{k:15s}:", arrayPrint)

# Summary of the per-fold accuracy
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(scores['test_accuracy']):6.3f}")
print(f"Std:{np.std(scores['test_accuracy']):6.3f}")



Crossed-validation
fit_time       : 0.08, 0.10, 0.07, 0.05, 0.12, 0.11, 0.05, 0.05, 0.03, 0.01
score_time     : 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
test_accuracy  : 0.93, 0.95, 0.97, 0.93, 0.95, 0.88, 0.97, 0.97, 1.00, 0.97
test_f1        : 0.94, 0.96, 0.98, 0.94, 0.96, 0.89, 0.98, 0.98, 1.00, 0.98

Crossed-validation accuracy
Mean: 0.952
Std: 0.034


- `cross_val_score` returns the per-fold scores for a single scoring metric.

In [11]:
from sklearn.model_selection import cross_val_score

# cross_val_score returns the per-fold scores of a single metric. No scoring
# argument is passed, so the estimator's default scorer is used.
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)

# One formatted entry per fold
arrayPrint = ', '.join(f'{el:3.2f}' for el in scores)
# Plain string literal: the label has no placeholders, so no f-prefix is needed
print("Score:", arrayPrint)

# Mean and standard deviation of the per-fold scores
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(scores):6.3f}")
print(f"Std:{np.std(scores):6.3f}")

Score: 0.93, 0.95, 0.97, 0.93, 0.95, 0.88, 0.97, 0.97, 1.00, 0.97

Crossed-validation accuracy
Mean: 0.952
Std: 0.034
