# K-fold cross-validation


## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Breast cancer data
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset shipped with scikit-learn and pull out the
# feature matrix and the target vector.
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=1,
)

# Classifier: logistic regression with default hyper-parameters and a fixed seed.
# NOTE(review): the features are passed unscaled; the default solver may warn
# about convergence on this data — confirm.
lr = LogisticRegression(random_state=1)

## Stratified k-fold cross-validation (SKL)

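- `cross_validate` can evaluate several scoring metrics in a single run; it returns a dictionary holding the per-fold fit times, score times, and one `test_<metric>` array per requested metric.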

In [10]:
from sklearn.model_selection import cross_validate

# Run 10-fold cross-validation on the training split only, collecting both
# accuracy and F1 for every fold; n_jobs=-1 lets the folds run in parallel.
scores = cross_validate(
    estimator=lr,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'f1'],
    n_jobs=-1,
)

# One line per entry of the result dictionary (timings and per-fold scores),
# each value formatted with two decimals.
print("\nCrossed-validation")
for name, values in scores.items():
    formatted = ', '.join(f'{value:3.2f}' for value in values)
    print(f"{name:15s}:", formatted)

# Summarise the per-fold accuracy by its mean and standard deviation.
fold_accuracy = scores['test_accuracy']
print("\nCrossed-validation accuracy")
print(f"Mean:{np.mean(fold_accuracy):6.3f}")
print(f"Std:{np.std(fold_accuracy):6.3f}")



Crossed-validation
fit_time       : 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01
score_time     : 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
test_accuracy  : 0.93, 0.95, 0.97, 0.93, 0.95, 0.88, 0.97, 0.97, 1.00, 0.97
test_f1        : 0.94, 0.96, 0.98, 0.94, 0.96, 0.89, 0.98, 0.98, 1.00, 0.98

Crossed-validation accuracy
Mean: 0.952
Std: 0.034
