# K-fold cross-validation


## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Breast cancer data
from sklearn.datasets import load_breast_cancer

# Breast cancer dataset: feature matrix X and class-label vector y
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Pipeline: standardize -> keep 2 principal components -> logistic regression
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
])

## K-Fold cross-validation

In [17]:
from sklearn.model_selection import KFold
# Plain k-fold splitter with 10 folds. Printing it shows the settings it runs
# with; only n_splits is set here, the rest are the library defaults.
kf = KFold(n_splits=10)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [18]:
# split() does not hand back index arrays directly: it returns a generator,
# which is why printing it only shows a generator object.
genSplit = kf.split(X_train,y_train)
print(genSplit)

<generator object _BaseKFold.split at 0x7f3a56902dd0>


In [19]:
# A generator is exhausted once it has been iterated, so a fresh one has to be
# created every time the folds are needed.
genSplit = kf.split(X_train, y_train)

# Peek at the first ten training / validation indices of every fold
for idxTrain, idxVal in genSplit:
    print(idxTrain[:10], "...", idxVal[:10], "...")

[46 47 48 49 50 51 52 53 54 55] ... [0 1 2 3 4 5 6 7 8 9] ...
[0 1 2 3 4 5 6 7 8 9] ... [46 47 48 49 50 51 52 53 54 55] ...
[0 1 2 3 4 5 6 7 8 9] ... [ 92  93  94  95  96  97  98  99 100 101] ...
[0 1 2 3 4 5 6 7 8 9] ... [138 139 140 141 142 143 144 145 146 147] ...
[0 1 2 3 4 5 6 7 8 9] ... [184 185 186 187 188 189 190 191 192 193] ...
[0 1 2 3 4 5 6 7 8 9] ... [230 231 232 233 234 235 236 237 238 239] ...
[0 1 2 3 4 5 6 7 8 9] ... [275 276 277 278 279 280 281 282 283 284] ...
[0 1 2 3 4 5 6 7 8 9] ... [320 321 322 323 324 325 326 327 328 329] ...
[0 1 2 3 4 5 6 7 8 9] ... [365 366 367 368 369 370 371 372 373 374] ...
[0 1 2 3 4 5 6 7 8 9] ... [410 411 412 413 414 415 416 417 418 419] ...


In [20]:
# Fresh generator over the plain k-fold splits
genSplit = kf.split(X_train, y_train)

# Per fold: size of the training part, size of the validation part, and how
# many training samples carry each class label
for idxTrain, idxVal in genSplit:
    classCounts = np.bincount(y_train[idxTrain])
    print(idxTrain.shape, idxVal.shape, classCounts)

(409,) (46,) [151 258]
(409,) (46,) [146 263]
(409,) (46,) [153 256]
(409,) (46,) [154 255]
(409,) (46,) [155 254]
(410,) (45,) [154 256]
(410,) (45,) [155 255]
(410,) (45,) [157 253]
(410,) (45,) [153 257]
(410,) (45,) [152 258]


In [21]:
# Fresh generator over the plain k-fold splits
genSplit = kf.split(X_train, y_train)

trainScores, valScores = [], []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Slice out this fold's fitting and held-out parts once
    X_fit, y_fit = X_train[idxTrain], y_train[idxTrain]
    X_val, y_val = X_train[idxVal], y_train[idxVal]

    # Refit the whole pipeline on the fitting part only
    pipe_lr.fit(X_fit, y_fit)

    # Accuracy on the samples used for fitting and on the held-out samples
    trainScore = pipe_lr.score(X_fit, y_fit)
    valScore = pipe_lr.score(X_val, y_val)
    trainScores.append(trainScore)
    valScores.append(valScore)

    # Per-fold report
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

# Mean and spread of the accuracies over all folds
print("\nTraining accuracy")
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

Fold: 1, Training accuracy: 0.951, Validation accuracy: 0.978
Fold: 2, Training accuracy: 0.956, Validation accuracy: 0.935
Fold: 3, Training accuracy: 0.954, Validation accuracy: 0.913
Fold: 4, Training accuracy: 0.954, Validation accuracy: 0.957
Fold: 5, Training accuracy: 0.949, Validation accuracy: 0.935
Fold: 6, Training accuracy: 0.951, Validation accuracy: 0.911
Fold: 7, Training accuracy: 0.949, Validation accuracy: 0.978
Fold: 8, Training accuracy: 0.956, Validation accuracy: 0.956
Fold: 9, Training accuracy: 0.954, Validation accuracy: 0.956
Fold:10, Training accuracy: 0.951, Validation accuracy: 0.978

Training accuracy
Mean: 0.952
Std: 0.002

Crossed-validation accuracy
Mean: 0.950
Std: 0.024


## Stratified K-Folds cross-validation

In [22]:
from sklearn.model_selection import StratifiedKFold
# Stratified 10-fold splitter. Printing it shows the settings it runs with;
# only n_splits is set here, the rest are the library defaults.
skf = StratifiedKFold(n_splits=10)
print(skf)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [23]:
# Fresh generator over the stratified splits (the labels are needed here,
# since they are what the stratification is based on)
genSplit = skf.split(X_train, y_train)

# Per fold: size of the training part, size of the validation part, and how
# many training samples carry each class label
for idxTrain, idxVal in genSplit:
    classCounts = np.bincount(y_train[idxTrain])
    print(idxTrain.shape, idxVal.shape, classCounts)

(409,) (46,) [153 256]
(409,) (46,) [153 256]
(409,) (46,) [153 256]
(409,) (46,) [153 256]
(409,) (46,) [153 256]
(410,) (45,) [153 257]
(410,) (45,) [153 257]
(410,) (45,) [153 257]
(410,) (45,) [153 257]
(410,) (45,) [153 257]


In [24]:
# Fresh generator over the stratified splits
genSplit = skf.split(X_train, y_train)

trainScores, valScores = [], []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Slice out this fold's fitting and held-out parts once
    X_fit, y_fit = X_train[idxTrain], y_train[idxTrain]
    X_val, y_val = X_train[idxVal], y_train[idxVal]

    # Refit the whole pipeline on the fitting part only
    pipe_lr.fit(X_fit, y_fit)

    # Accuracy on the samples used for fitting and on the held-out samples
    trainScore = pipe_lr.score(X_fit, y_fit)
    valScore = pipe_lr.score(X_val, y_val)
    trainScores.append(trainScore)
    valScores.append(valScore)

    # Per-fold report
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

# Mean and spread of the accuracies over all folds
print("\nTraining accuracy")
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

Fold: 1, Training accuracy: 0.954, Validation accuracy: 0.978
Fold: 2, Training accuracy: 0.956, Validation accuracy: 0.935
Fold: 3, Training accuracy: 0.954, Validation accuracy: 0.957
Fold: 4, Training accuracy: 0.954, Validation accuracy: 0.935
Fold: 5, Training accuracy: 0.951, Validation accuracy: 0.913
Fold: 6, Training accuracy: 0.946, Validation accuracy: 0.956
Fold: 7, Training accuracy: 0.956, Validation accuracy: 0.933
Fold: 8, Training accuracy: 0.954, Validation accuracy: 0.956
Fold: 9, Training accuracy: 0.959, Validation accuracy: 0.933
Fold:10, Training accuracy: 0.951, Validation accuracy: 0.978

Training accuracy
Mean: 0.953
Std: 0.003

Crossed-validation accuracy
Mean: 0.947
Std: 0.020


## Stratified k-fold cross-validation (scikit-learn helpers)

In [56]:
from sklearn.model_selection import cross_validate

# Let scikit-learn drive the folds: an integer cv on a classifier selects
# stratified k-fold, and both metrics are computed on every fold.
scores = cross_validate(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy','f1'],
    n_jobs=-1,
)

# One row per entry of the returned dict (timings and per-fold test scores)
print("\nCrossed-validation")
for k, v in scores.items():
    arrayPrint = ', '.join(f'{el:3.2f}' for el in v)
    print(f"{k:15s}:", arrayPrint)

# Mean and spread of the per-fold accuracies
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(scores['test_accuracy']):6.3f}")
print(f"Std:{np.std(scores['test_accuracy']):6.3f}")



Crossed-validation
fit_time       : 0.01, 0.00, 0.00, 0.01, 0.01, 0.01, 0.00, 0.01, 0.01, 0.01
score_time     : 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
test_accuracy  : 0.98, 0.93, 0.96, 0.93, 0.91, 0.96, 0.93, 0.96, 0.93, 0.98
test_f1        : 0.98, 0.95, 0.97, 0.95, 0.93, 0.97, 0.95, 0.96, 0.95, 0.98

Crossed-validation accuracy
Mean: 0.947
Std: 0.020


In [61]:
from sklearn.model_selection import cross_val_score
# Single-metric shortcut: the result is just the array of per-fold scores
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)

# Per-fold scores rounded to two decimals
arrayPrint = ', '.join(f'{el:3.2f}' for el in scores)
print("Score:", arrayPrint)

# Mean and spread over the folds
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(scores):6.3f}")
print(f"Std:{np.std(scores):6.3f}")

Score: 0.98, 0.93, 0.96, 0.93, 0.91, 0.96, 0.93, 0.96, 0.93, 0.98

Crossed-validation accuracy
Mean: 0.947
Std: 0.020


In [58]:
# Show the per-fold scores unrounded.
# NOTE(review): `scores` is assigned by both the cross_validate cell (a dict)
# and the cross_val_score cell (an array), so what is displayed here depends
# on which of the two ran last.
scores

array([0.97826087, 0.93478261, 0.95652174, 0.93478261, 0.91304348,
       0.95555556, 0.93333333, 0.95555556, 0.93333333, 0.97777778])