# K-fold cross-validation


## Setting up

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Breast cancer data
from sklearn.datasets import load_breast_cancer

# Load data
dataObj = load_breast_cancer()
X = dataObj.data
y = dataObj.target

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.20,
    stratify=y,
    random_state=1)

# Constructing a pipeline object
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(random_state=1))])

## K-Fold cross-validation

In [4]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, random_state=1)
print(kf)

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [None]:
genSplit = kf.split(X_train,y_train)
print(genSplit)

In [None]:
#Note that generator needs to be created since it is exhausted after used.
genSplit = kf.split(X_train,y_train)

for idxTrain, idxVal in genSplit:
    print(idxTrain[0:10],"...", idxVal[0:10], "...") 

In [None]:
genSplit = kf.split(X_train,y_train)

for idxTrain, idxVal in genSplit:
    print(idxTrain.shape, idxVal.shape, np.bincount(y_train[idxTrain])) 

In [None]:
genSplit = kf.split(X_train,y_train)

trainScores = []
valScores = []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Training
    pipe_lr.fit(X_train[idxTrain], y_train[idxTrain])
    # Traning score
    trainScore = pipe_lr.score(X_train[idxTrain], y_train[idxTrain])
    trainScores.append(trainScore)
    # Validation score
    valScore = pipe_lr.score(X_train[idxVal], y_train[idxVal])
    valScores.append(valScore)
    # Printing
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

print("\nTraining accuracy")  
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

print("\nCrossed-validation accuracy")  
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

## Stratified K-Folds cross-validation

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, random_state=1)
print(skf)

In [None]:
genSplit = skf.split(X_train,y_train)

for idxTrain, idxVal in genSplit:
    print(idxTrain.shape, idxVal.shape, np.bincount(y_train[idxTrain])) 

In [None]:
genSplit = skf.split(X_train,y_train)

trainScores = []
valScores = []
for count, (idxTrain, idxVal) in enumerate(genSplit):
    # Training
    pipe_lr.fit(X_train[idxTrain], y_train[idxTrain])
    # Traning score
    trainScore = pipe_lr.score(X_train[idxTrain], y_train[idxTrain])
    trainScores.append(trainScore)
    # Validation score
    valScore = pipe_lr.score(X_train[idxVal], y_train[idxVal])
    valScores.append(valScore)
    # Printing
    print(f"Fold:{count+1:2d}, Training accuracy:{trainScore:6.3f}, Validation accuracy:{valScore:6.3f}")

print("\nTraining accuracy")  
print(f"Mean:{np.mean(trainScores):6.3f}")
print(f"Std:{np.std(trainScores):6.3f}")

print("\nCrossed-validation accuracy")  
print(f"Mean:{np.mean(valScores):6.3f}")
print(f"Std:{np.std(valScores):6.3f}")

## Stratified k-fold cross-validation (SKL)

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)

print("\nCrossed-validation accuracy")  
print(f"Mean:{np.mean(scores):6.3f}")
print(f"Std:{np.std(scores):6.3f}")