# Harish Practice: Stratified K Fold Cross Validation

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Generating a synthetic dataset

In [3]:
# Synthetic binary-classification data with a deliberate class imbalance, so
# that the difference between plain and stratified splitting is visible later.
X, y = make_classification(
    # size of the problem
    n_samples=1000,
    n_features=10,
    # feature make-up: 8 informative + 2 redundant + 0 repeated = 10 features
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    # target: two classes requested at roughly 90% / 10%
    n_classes=2,
    weights=[0.9, 0.1],
    # fixed seed so the same dataset is generated on every run
    random_state=42,
)

# Splitting data into training and testing sets

In [4]:
# Hold out 35% of the samples for testing. The target is imbalanced
# (about 90/10), so `stratify=y` is passed to keep the class ratio the same
# in the training and testing partitions; a purely random split does not
# guarantee that. `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, stratify=y, random_state=45
)

In [5]:
from collections import Counter

# Count how many samples carry each class label in the full target vector,
# to see how imbalanced the generated dataset actually is.
Counter(y)

Counter({0: 897, 1: 103})

In [7]:
from sklearn.model_selection import KFold

# Plain 5-fold splitter: rows are shuffled with a fixed seed and then cut
# into folds without taking the class labels into account.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
# Print the class counts of each test fold produced by plain K-Fold.
# Only the test-fold labels are needed for this check, so nothing else is
# materialised, and a fold-local name is used so the hold-out variables
# (X_train, X_test, y_train, y_test) created earlier are not overwritten.
# KFold does not use the labels when splitting, so only X is passed.
for _, test_index in kf.split(X):
    y_fold_test = y[test_index]
    print(Counter(y_fold_test))

Counter({0: 177, 1: 23})
Counter({0: 179, 1: 21})
Counter({0: 183, 1: 17})
Counter({0: 181, 1: 19})
Counter({0: 177, 1: 23})


In [9]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter: like the shuffled K-Fold above, but the labels
# passed to `split` are used so every fold keeps roughly the same class ratio.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Print the class counts of each stratified test fold.
# A fold-local name is used so the hold-out variables
# (X_train, X_test, y_train, y_test) created earlier are not overwritten,
# and only the test-fold labels are materialised because nothing else is used.
for _, test_index in skf.split(X, y):
    y_fold_test = y[test_index]
    print(Counter(y_fold_test))

Counter({0: 180, 1: 20})
Counter({0: 180, 1: 20})
Counter({0: 179, 1: 21})
Counter({0: 179, 1: 21})
Counter({0: 179, 1: 21})


# Cross Validation on Logistic Regression

In [12]:
from sklearn.model_selection import cross_val_score

# Per-fold accuracy of a default logistic regression, evaluated on the
# stratified folds defined by `skf`.
cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring="accuracy",
    cv=skf,
)

array([0.915, 0.91 , 0.895, 0.895, 0.895])

# Cross Validation on Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Per-fold accuracy of a decision tree on the stratified folds in `skf`.
# The tree is seeded so the scores are the same on every run; without
# `random_state` the fitted tree (and therefore the scores) can vary.
cross_val_score(
    DecisionTreeClassifier(random_state=42), X, y, cv=skf, scoring="accuracy"
)

array([0.91 , 0.905, 0.87 , 0.85 , 0.9  ])

# Cross Validation on Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Per-fold accuracy of a random forest on the stratified folds in `skf`.
# The forest is seeded so the scores are reproducible; an unseeded forest
# gives slightly different numbers each time the cell is run.
cross_val_score(
    RandomForestClassifier(random_state=42), X, y, cv=skf, scoring="accuracy"
)

array([0.92 , 0.925, 0.92 , 0.91 , 0.92 ])

In [16]:
# Same random-forest evaluation, but passing an integer for `cv` instead of a
# splitter object. The forest is seeded so the scores are reproducible and
# any difference from the `skf` run comes from the folds, not from the model.
cross_val_score(
    RandomForestClassifier(random_state=42), X, y, cv=5, scoring="accuracy"
)

array([0.925, 0.905, 0.92 , 0.925, 0.93 ])

When you pass an integer for `cv` and the estimator is a classifier, `cross_val_score` uses Stratified K-Fold by default, with k set to the integer you specified. Note that this default splitter does not shuffle the data, so its folds differ from those of the shuffled `skf` object used above.