# Cross-Validation

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Cross-Validation Basics

In [None]:
# Plotting helper from the project-local `helpers` package (defined outside this notebook).
from helpers.plot_cross_validation import plot_cross_validation

# NOTE(review): presumably draws an illustration of how cross-validation
# splits the data into folds -- confirm against the helper's source.
plot_cross_validation()

## Cross-Validation in scikit-learn

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the iris dataset and create the classifier evaluated in the cells below.
iris = load_iris()
logreg = LogisticRegression(max_iter=10_000)

# cv is not passed, so cross_val_score falls back to its default splitter.
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target)
print(f"Cross-validation scores: {scores}")

In [None]:
# Repeat the evaluation with an explicit fold count of three.
scores = cross_val_score(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    cv=3,
)
print(f"Cross-validation scores: {scores}")

In [None]:
# Collapse the per-fold scores currently held in `scores` into a single mean,
# shown with two decimals. When cells run in order, `scores` is the cv=3 result.
print(f"Average cross-validation score: {scores.mean():.2f}")

## Benefits of Cross-Validation

## Stratified k-Fold Cross-Validation and Other Strategies

In [None]:
# Duplicate of the earlier import; harmless, and keeps this cell runnable on its own.
from sklearn.datasets import load_iris

iris = load_iris()
# Bare expression at the end of the cell: the notebook displays the label array.
# NOTE(review): the point is presumably to show how the class labels are
# ordered in the dataset -- confirm against the rendered output.
iris.target

In [None]:
# Plotting helper from the project-local `helpers` package (defined outside this notebook).
# NOTE(review): presumably contrasts plain k-fold with stratified k-fold
# splitting -- confirm against the helper's source.
from helpers.plot_cross_validation import plot_stratified_cross_validation
plot_stratified_cross_validation()

### More control over cross-validation

In [None]:
from sklearn.model_selection import KFold

# Explicit splitter object with five folds; it is handed to cross_val_score
# through the `cv` argument in the following cell.
kfold = KFold(n_splits=5)

In [None]:
# Evaluate with the KFold splitter object instead of an integer fold count.
print(
    "Cross-validation scores:\n"
    f"{cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold)}"
)

In [None]:
# Rebind the splitter to three folds; no shuffling is requested.
kfold = KFold(n_splits=3)

print(
    "Cross-validation scores:\n"
    f"{cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold)}"
)

In [None]:
# Three folds again, but with the samples shuffled before splitting.
# The fixed seed makes the shuffle repeatable across runs.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

print(
    "Cross-validation scores:\n"
    f"{cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=kfold)}"
)

### Leave-one-out cross-validation

In [None]:
from sklearn.model_selection import LeaveOneOut

# The leave-one-out splitter takes no fold count; cross_val_score returns one
# score per split the splitter produces, so len(scores) is the number of
# cross-validation iterations that were run.
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)

# Formatted as an f-string like the other cells. Passing the count as a second
# print argument would insert an extra space after the label, because print's
# default separator is added on top of the label's trailing space.
print(f"Number of cv iterations: {len(scores)}")
print(f"Mean accuracy: {scores.mean():.2f}")

### Shuffle-split cross-validation

In [None]:
# Plotting helper from the project-local `helpers` package (defined outside this notebook).
# NOTE(review): presumably illustrates shuffle-split sampling of train and
# test sets -- confirm against the helper's source.
from helpers.plot_cross_validation import plot_shuffle_split
plot_shuffle_split()

In [None]:
from sklearn.model_selection import ShuffleSplit

# Ten random splits, each using half of the data for training and half for
# testing. The fixed seed keeps the printed scores identical from run to run,
# in line with the seeded KFold and make_blobs calls elsewhere in this
# notebook; without it every execution of the cell reports different numbers.
shuffle_split = ShuffleSplit(
    n_splits=10, test_size=0.5, train_size=0.5, random_state=0
)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)

print(f"Cross-validation scores:\n{scores}")

### Cross-validation with groups

In [None]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import GroupKFold

# Small synthetic dataset: 12 samples, generated with a fixed seed.
X, y = make_blobs(n_samples=12, random_state=0)

# One group label per sample, in sample order: group 0 covers the first
# three samples, group 1 the next four, group 2 the next two and group 3
# the last three.
groups = [0] * 3 + [1] * 4 + [2] * 2 + [3] * 3

# The group labels are forwarded to the splitter through `groups`.
scores = cross_val_score(
    estimator=logreg,
    X=X,
    y=y,
    groups=groups,
    cv=GroupKFold(n_splits=3),
)
print(f"Cross-validation scores:\n{scores}")

In [None]:
# Plotting helper from the project-local `helpers` package (defined outside this notebook).
# NOTE(review): presumably visualises how GroupKFold assigns groups to
# folds -- confirm against the helper's source.
from helpers.plot_cross_validation import plot_group_kfold
plot_group_kfold()