## Understanding Cross Validation

K-fold cross validation - for regression

Stratified K-fold cross validation - for classification (each fold keeps approximately the same class proportions as the full dataset)

In each of the K iterations, K-1 folds are used as the training set and the remaining 1 fold is used as the validation set.

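`cv=4` and `cv=KFold(n_splits=4)` both request 4 folds (`n_splits` must be at least 2), and for a regressor they are equivalent. We prefer passing a `KFold(...)` object for reproducibility, because it lets us set `shuffle=True` together with a fixed `random_state`. Note that for a classifier, an integer `cv` makes scikit-learn use `StratifiedKFold` instead of plain `KFold`.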

In [3]:
import pandas as pd 
import numpy as np

In [None]:
from sklearn.datasets import make_classification

# Synthetic binary-classification data to experiment with:
#   - 1000 samples, 20 feature columns, 2 classes
#   - 5 informative features plus 5 redundant ones (linear combinations of the
#     informative features)
#   - flip_y=0.05 assigns the class of about 5% of the samples at random (label noise)
#   - random_state=42 fixes the generator so every run produces the same data
X, y = make_classification(
    n_samples=1000, n_classes=2,
    n_features=20, n_informative=5, n_redundant=5,
    flip_y=0.05,
    random_state=42,
)

In [None]:
# Wrap the raw feature matrix in a DataFrame whose columns are labelled
# feature_1 ... feature_N (1-based), one label per column of X.
X_df = pd.DataFrame(
    data=X,
    columns=[f'feature_{i+1}' for i in range(X.shape[1])],
)

In [7]:
# Show the feature DataFrame; the saved output below is truncated to the first and last rows.
X_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20
0,1.716802,-1.878981,3.921550,-0.100696,0.838014,1.567688,1.174814,0.451348,-0.041660,-1.473126,0.027661,0.915208,1.565791,1.924172,1.735241,-1.181194,-0.946314,-0.072822,-0.327795,2.579122
1,-0.601438,-0.245848,4.619435,1.900649,1.152913,-0.432987,1.106457,0.362996,0.612172,-1.465273,-1.111363,1.648790,0.604572,1.581122,3.387507,-1.173630,0.393295,-0.993921,-1.337674,1.219754
2,1.202382,-0.679889,2.508501,0.144310,-0.319826,0.071811,-0.833598,0.073863,0.191890,-1.486362,-1.718181,-0.750171,-0.100933,2.478345,2.170866,1.193587,-1.833706,-1.340489,-0.795970,0.018092
3,-2.686179,0.347916,-1.706339,2.735960,0.695166,-0.323054,-0.629886,-0.866571,1.829563,4.660337,-1.707848,1.715499,0.177424,-4.444163,0.194818,0.349012,0.568770,3.285724,1.780639,1.804136
4,-3.563403,-0.314691,-1.612481,0.530683,-0.482984,-2.173445,0.037575,-1.207759,-2.380739,3.626901,-2.243869,0.860520,-1.343523,-5.531902,-0.555122,0.809113,-0.663436,-0.849721,0.129368,1.600213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.378890,-0.045044,1.280273,-0.480214,1.195177,0.262212,0.209349,-1.492102,0.392456,-0.982001,-0.315629,-0.963783,-0.196828,-0.865926,-1.222709,1.295207,0.268851,-0.162062,-0.432660,2.706923
996,-1.859479,0.396320,0.447962,0.596081,0.393991,-0.882742,-1.218176,-0.819715,-1.570475,0.389776,-0.442821,1.356427,0.026493,0.991942,2.335046,0.336263,-0.950105,0.290038,-0.630232,-2.127964
997,-0.661205,0.894930,3.915663,-0.435711,-0.262234,-0.279091,0.040308,0.049095,-0.156409,-1.263866,-0.698762,1.713996,0.716293,1.870986,3.286673,-1.351708,-0.032874,0.685890,0.591091,0.440655
998,-0.390342,1.023057,-0.514006,-1.348177,0.193379,0.324984,1.900268,-0.979926,0.391128,1.392815,0.086694,0.550763,0.318231,-2.100771,-0.920630,-1.957339,0.015682,-0.668143,1.643542,1.743802


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

# cross_val_score is a function in the sklearn.model_selection module. It evaluates
# the estimator once per cross-validation split and returns an array of per-fold scores.
#
# Parameters worth knowing:
#   cv      - the cross-validation splitting strategy, i.e. how the data is divided
#             into training/validation folds. An integer asks for that many folds;
#             for a classifier scikit-learn then uses StratifiedKFold.
#   scoring - how each fold is evaluated. When left as None (the default), the
#             estimator's own score method is used.

ldr = LinearDiscriminantAnalysis(solver='svd')
scores = cross_val_score(estimator=ldr, X=X, y=y, cv=5)

# Mean and standard deviation of the 5 per-fold scores, printed one per line.
print(scores.mean(), scores.std(), sep='\n')

0.812
0.03558089374931437
