In [9]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import LeavePOut, StratifiedKFold, ShuffleSplit, cross_val_score, train_test_split

In [10]:
# load data from sklearn 

iris = datasets.load_iris()
X = iris.data
y = iris.target
model = LogisticRegression()

In [11]:
# Monte carlo cross validation

shuffle = ShuffleSplit(test_size=0.2, n_splits=200)
val_score1 = cross_val_score(model, X, y, cv=shuffle).mean()
print("Monte carlo cross validation score is : ", val_score1)

Monte carlo cross validation score is :  0.9490000000000001


In [12]:
# leave p out cross validation

lp = LeavePOut(p=2)
lp.get_n_splits(X)
val_score2 = cross_val_score(model, X, y, cv=lp).mean()
print("Leave P Out cross validation score is : ", val_score2)

Leave P Out cross validation score is :  0.9553467561521253


In [13]:
# Stratified 3-fold cross-validation

s = StratifiedKFold(n_splits=3)
s.get_n_splits(X)
val_score3 = cross_val_score(model, X, y, cv=s).mean()
print("Stratified 3 fold cross validation score is : ", val_score3)

Stratified 3 fold cross validation score is :  0.9468954248366014


In [14]:
# Hold out cross validation

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.7, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
val_score4 = accuracy_score(y_pred, y_test)
print("Cross validation on Hold out cross validation is : ", val_score4)

Cross validation on Hold out cross validation is :  0.8333333333333334


In [15]:
# create table of cross validation score

df = pd.DataFrame([val_score1, val_score2, val_score3, val_score4], 
                  index=['Monte Carlo', 'Leave P-Out', 'Stratified 3-fold', 'Hold out'],
                  columns=['Cross Validation score'])
df

Unnamed: 0,Cross Validation score
Monte Carlo,0.949
Leave P-Out,0.955347
Stratified 3-fold,0.946895
Hold out,0.833333


## Analysis

In [16]:
# Leave p-out cross method works well on small dataset because it use p samples as validation and n-p sample as training set. 
# In leave p out cross validation all samples get used as training and validation sample.
# It is exhaustive cross-validation technique

# Monte carlo cross validation randomly split training and validation data.
# it is also called shuffled split cross validation.
# it is not suitable for imbalance dataset because  may happen that the training set may don’t have the class of data
# that is in the test set, and the model won’t be able to generalize for unseen data.

# Stratified k-fold cross validation works for imbalance dataset.
# In this technique, each fold will have the same ratio of instances of target variable as in the whole datasets.