<a href="https://colab.research.google.com/github/kaybrian/ML_/blob/main/K_Fold_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
digits = load_digits()

# stop the warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [12]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=101)

In [13]:
# lets start with the logistic classifier to see
lr = LogisticRegression()
lr.fit(X_train, y_train)

# get the score of the classifier
lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9648148148148148

In [14]:
#  using the svm (support vector machines )
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.9888888888888889

In [15]:
# using the random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9777777777777777

In [16]:
# using the k fold
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [17]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [18]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [21]:
# get the scores of the other models
print("Logistic Regression score: ", get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
print("SVM score: ", get_score(SVC(), X_train, X_test, y_train, y_test))
print("Random Forest score: ", get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))


Logistic Regression score:  0.9648148148148148
SVM score:  0.9888888888888889
Random Forest score:  0.9796296296296296


In [26]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
folds = StratifiedKFold(n_splits=5)

scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    scores_logistic.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=50), X_train, X_test, y_train, y_test))

# print the scores
print("Logistic Regression score: ", scores_logistic)
print("SVM score: ", scores_svm)
print("Random Forest score: ", scores_rf)


Logistic Regression score:  [0.9222222222222223, 0.8694444444444445, 0.9415041782729805, 0.9387186629526463, 0.8969359331476323]
SVM score:  [0.9611111111111111, 0.9444444444444444, 0.9832869080779945, 0.9888579387186629, 0.9387186629526463]
Random Forest score:  [0.9305555555555556, 0.9222222222222223, 0.9554317548746518, 0.9610027855153204, 0.9220055710306406]


In [28]:
# using the cross val score
from sklearn.model_selection import cross_val_score

print(cross_val_score(LogisticRegression(), digits.data, digits.target, cv=3))
print(cross_val_score(SVC(), digits.data, digits.target, cv=3))
print(cross_val_score(RandomForestClassifier(), digits.data, digits.target, cv=3))

[0.92153589 0.94156928 0.91652755]
[0.96494157 0.97996661 0.96494157]
[0.93989983 0.95492487 0.91819699]


In [30]:
# you can also compare the same classifer with different parameter
print(cross_val_score(RandomForestClassifier(n_estimators=10), digits.data, digits.target, cv=3))
print(cross_val_score(RandomForestClassifier(n_estimators=20), digits.data, digits.target, cv=3))
print(cross_val_score(RandomForestClassifier(n_estimators=30), digits.data, digits.target, cv=3))
print(cross_val_score(RandomForestClassifier(n_estimators=40), digits.data, digits.target, cv=3))
print(cross_val_score(RandomForestClassifier(n_estimators=50), digits.data, digits.target, cv=3))

[0.89816361 0.92320534 0.89315526]
[0.93489149 0.93989983 0.9148581 ]
[0.91819699 0.93155259 0.92487479]
[0.92988314 0.94824708 0.91318865]
[0.94323873 0.94824708 0.93155259]
