Cross-Validation strategies

In [1]:
# Deal with FutureWarning messages: install a filter that ignores every
# warning of that category.
from warnings import simplefilter

simplefilter('ignore', FutureWarning)


In [2]:
# Using the iris data
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression #for classification problem

In [3]:
# Load the iris data and create the classifier that will be evaluated.
iris = load_iris()
lr = LogisticRegression()

# cross_val_score receives the model we want to evaluate, followed by the
# feature matrix and the target labels.
score = cross_val_score(lr, iris.data, iris.target)
scores_report = 'Cross Validation scores: {}'
print(scores_report.format(score))

Cross Validation scores: [0.96078431 0.92156863 0.95833333]


In [4]:
# Summarise the cross-validation scores by their mean, shown to two decimals.
average_score = score.mean()
print('Average CV score: {:.2f}'.format(average_score))

Average CV score: 0.95


In [6]:
# Show the full target vector so the ordering of the class labels is visible.
labels = iris.target
print('iris labels:\n{}'.format(labels))

iris labels:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [7]:
# Stratified k-fold CV could be used so that the class proportions are the
# same in every fold. Here, however, we use standard k-fold by importing the
# KFold splitter and passing it as the cv argument.
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
fold_scores = cross_val_score(lr, iris.data, iris.target, cv=kfold)
print('CV score:\n{}'.format(fold_scores))

CV score:
[1.         0.93333333 0.43333333 0.96666667 0.43333333]


In [8]:
# Repeat the standard k-fold evaluation, this time with three folds.
kfold = KFold(n_splits=3)
fold_scores = cross_val_score(lr, iris.data, iris.target, cv=kfold)
print('CV score:\n{}'.format(fold_scores))

CV score:
[0. 0. 0.]


In [9]:
# In the preceding snippet each fold corresponds to one class of the iris
# dataset, so nothing is learned. To resolve this we shuffle the data
# before splitting (with a fixed random_state).
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
shuffled_scores = cross_val_score(lr, iris.data, iris.target, cv=kfold)
print('CV scores:\n{}'.format(shuffled_scores))

CV scores:
[0.9  0.96 0.96]


In [10]:
# Leave-one-out is another CV method: think of it as k-fold CV in which
# every fold is a single sample.
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
score = cross_val_score(lr, iris.data, iris.target, cv=loo)
n_iterations = len(score)
mean_accuracy = score.mean()
print('CV iterations: ', n_iterations)
print('Mean accuracy: {:.2f}'.format(mean_accuracy))

CV iterations:  150
Mean accuracy: 0.95


In [14]:
# Shuffle-split is another CV strategy: the splitter is configured with
# n_splits iterations, each using train_size of the samples for training
# and test_size for testing.
from sklearn.model_selection import ShuffleSplit

# random_state is fixed so that the random splits, and therefore the scores
# printed by this cell, are identical on every run -- consistent with the
# seeded, shuffled KFold used earlier in this notebook.
ss = ShuffleSplit(n_splits=10, test_size=.5, train_size=.5, random_state=0)
score = cross_val_score(lr, iris.data, iris.target, cv=ss)
print('CV score:\n{}'.format(score))

CV score:
[0.96       0.97333333 0.94666667 0.92       0.93333333 0.97333333
 0.94666667 0.93333333 0.82666667 0.94666667]


Cross-validation is a statistical method for evaluating generalization performance; it is considered more stable and thorough than a single split into training and test sets.
Choose a specific cross-validation strategy depending on the problem and on the goal you want to achieve.