# Harish practice: K-Fold Cross Validation

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Generating a synthetic dataset using make_classification

In [5]:
# Synthetic binary-classification data: 1000 samples with 10 features,
# of which 8 are informative and 2 are redundant (8 + 2 + 0 repeated = 10).
# random_state pins the generated data so every run sees the same samples.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    random_state=42,
)

# Splitting data into training and testing sets

In [9]:
# Single hold-out split: 75% of the rows for training, 25% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
from sklearn.metrics import classification_report

# Baseline: fit logistic regression on the hold-out training split
# (fit returns the estimator itself, so the calls can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Per-class precision / recall / f1 on the hold-out test split.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69       130
           1       0.66      0.74      0.70       120

    accuracy                           0.70       250
   macro avg       0.70      0.70      0.70       250
weighted avg       0.70      0.70      0.70       250



In [12]:
from sklearn.model_selection import KFold

# 5 shuffled folds; the fixed random_state keeps the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=45)

# Toy illustration on five values: each iteration yields the positions
# (indices, not the values themselves) used for training and for testing.
toy_samples = [50, 60, 70, 80, 90]
for train_index, test_index in kf.split(toy_samples):
    print(train_index, test_index)

[0 2 3 4] [1]
[0 1 2 3] [4]
[1 2 3 4] [0]
[0 1 3 4] [2]
[0 1 2 4] [3]


In [13]:
# Manual K-fold evaluation: for each of the folds produced by `kf`, fit the
# model on the training indices and print its accuracy on the held-out indices.
#
# Fold-local names (X_fold_*, y_fold_*) are used on purpose: reusing
# X_train / X_test / y_train / y_test here would overwrite the hold-out split
# created by train_test_split, leaving those names pointing at the last fold.
model = LogisticRegression()
for train_index, test_index in kf.split(X, y):
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    # fit() re-estimates the coefficients from scratch on every call
    # (warm_start is off by default), so folds do not leak into each other.
    model.fit(X_fold_train, y_fold_train)
    print(model.score(X_fold_test, y_fold_test))

0.705
0.675
0.675
0.72
0.66


### Cross validation on Logistic Regression

In [14]:
from sklearn.model_selection import cross_val_score

# Same evaluation as a manual loop over kf.split, done in one call:
# one accuracy value per fold of `kf`.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring="accuracy",
    cv=kf,
)

array([0.705, 0.675, 0.675, 0.72 , 0.66 ])

### Cross validation on Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Decision trees are randomized (features are permuted at each split), so an
# unseeded tree gives slightly different fold scores on every run.
# Fixing random_state makes the per-fold accuracies repeatable.
cross_val_score(
    DecisionTreeClassifier(random_state=42),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)

array([0.795, 0.825, 0.78 , 0.78 , 0.825])

### Cross validation on Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees. The forest bootstraps samples and subsamples
# features at random, so random_state is fixed to make the per-fold
# accuracies repeatable from run to run.
cross_val_score(
    RandomForestClassifier(n_estimators=10, random_state=42),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)

array([0.855, 0.855, 0.845, 0.82 , 0.895])

### Cross validation to evaluate the same model with different parameters

Random Forest with 20 trees

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 20 trees. random_state is fixed so that a change in the
# fold scores reflects the change in n_estimators rather than random
# variation between unseeded runs.
cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)

array([0.855, 0.85 , 0.9  , 0.855, 0.895])

Random Forest with 30 trees

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 30 trees. random_state is fixed so that a change in the
# fold scores reflects the change in n_estimators rather than random
# variation between unseeded runs.
cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)

array([0.88 , 0.88 , 0.87 , 0.87 , 0.875])

### Using cross_validate to evaluate multiple metrics

In [24]:
from sklearn.model_selection import cross_validate

# cross_validate accepts several scorers at once and returns a dict holding
# fit/score timings plus one "test_<metric>" array per requested metric.
cross_validate(
    LogisticRegression(),
    X,
    y,
    scoring=["accuracy", "roc_auc"],
    cv=kf,
)

{'fit_time': array([0.00697947, 0.00498748, 0.00598431, 0.00398946, 0.00598407]),
 'score_time': array([0.21542358, 0.00499725, 0.0039885 , 0.00299191, 0.00199509]),
 'test_accuracy': array([0.705, 0.675, 0.675, 0.72 , 0.66 ]),
 'test_roc_auc': array([0.75888889, 0.75157642, 0.77941325, 0.79626656, 0.73191919])}