In [1]:
# cross val classifier 

import pandas as pd 
import numpy as np 

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import root_mean_squared_error, classification_report
from sklearn.model_selection import cross_val_score, KFold

In [2]:
# Load the breast-cancer dataset once and reuse the returned bunch for both
# the features and the target (the original called the loader twice).
dataset = load_breast_cancer(as_frame=True)
X = dataset['data']  # feature table as a pandas DataFrame
# With as_frame=True the target is a pandas Series; convert it to a NumPy
# array so `y` keeps the array type that load_breast_cancer()['target'] gave
# and positional indexing such as y[train_index] keeps working.
y = dataset['target'].to_numpy()

In [3]:
# Mean F1 over 10 cross-validation folds for a seeded random forest classifier.
classification = RandomForestClassifier(random_state=23)
final_class = cross_val_score(
    classification,
    X,
    y,
    scoring="f1",
    cv=10,
    n_jobs=1,
    verbose=5,  # log each fold's score and timing
).mean()

final_class

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.986) total time=   0.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] END ................................ score: (test=0.901) total time=   0.4s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV] END ................................ score: (test=0.959) total time=   0.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s


[CV] END ................................ score: (test=0.960) total time=   0.4s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.1s remaining:    0.0s


[CV] END ................................ score: (test=1.000) total time=   0.5s
[CV] END ................................ score: (test=0.986) total time=   0.3s
[CV] END ................................ score: (test=0.986) total time=   0.3s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.971) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.2s finished


0.9721273857159236

### Simple `cross_val_score` with a regressor

In [4]:
# Mean RMSE over 10 cross-validation folds for a seeded random forest regressor.
regressor = RandomForestRegressor(random_state=23)

# The 'neg_root_mean_squared_error' scorer reports negated RMSE per fold,
# so average first and then flip the sign to get a positive RMSE.
neg_rmse_mean = cross_val_score(
    regressor,
    X,
    y,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=1,
    verbose=5,
).mean()
final_rmse = -neg_rmse_mean

final_rmse

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ............................... score: (test=-0.224) total time=   0.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] END ............................... score: (test=-0.246) total time=   0.6s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] END ............................... score: (test=-0.259) total time=   0.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.1s remaining:    0.0s


[CV] END ............................... score: (test=-0.164) total time=   0.6s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.8s remaining:    0.0s


[CV] END ............................... score: (test=-0.161) total time=   0.7s
[CV] END ............................... score: (test=-0.159) total time=   0.6s
[CV] END ............................... score: (test=-0.179) total time=   0.6s
[CV] END ............................... score: (test=-0.145) total time=   0.6s
[CV] END ............................... score: (test=-0.181) total time=   0.6s
[CV] END ............................... score: (test=-0.132) total time=   0.6s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.0s finished


0.18494344364567955

### KFold and StratifiedKFold from scratch

In [5]:
from sklearn.metrics import f1_score

# Manual K-fold cross-validation: split the rows, then fit and score the
# classifier on every fold by hand.
folds = 8
rfr_kfold = KFold(n_splits=folds, shuffle=True, random_state=23)
scores = []  # test F1 score of each fold, in fold order
classifier = RandomForestClassifier(random_state=23, n_estimators=2)

# enumerate(start=1) numbers the folds for the progress message, replacing a
# hand-maintained counter variable.
for fold, (train_index, test_index) in enumerate(rfr_kfold.split(X=X), start=1):
    # The split yields row positions: use .iloc on X and index y with the
    # same positions.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit on this fold's training rows, then predict both partitions so the
    # train and test F1 can be compared side by side.
    classifier.fit(X_train, y_train)
    test_preds = classifier.predict(X_test)
    train_preds = classifier.predict(X_train)
    score = f1_score(y_test, test_preds)
    train_score = f1_score(y_train, train_preds)
    scores.append(score)
    print(f'Completed fold: {fold}/{folds}... test score: {score}... Train Score: {train_score}')

print(f'Score: {scores}')
print('=' * 65)
print(f'Final score: {sum(scores)/len(scores)}')

Completed fold: 1/8... test score: 0.946236559139785... Train Score: 0.9818780889621087
Completed fold: 2/8... test score: 0.9438202247191011... Train Score: 0.9701986754966887
Completed fold: 3/8... test score: 0.8888888888888888... Train Score: 0.9873817034700315
Completed fold: 4/8... test score: 0.9195402298850575... Train Score: 0.9820554649265906
Completed fold: 5/8... test score: 0.9397590361445783... Train Score: 0.9787928221859706
Completed fold: 6/8... test score: 0.9111111111111111... Train Score: 0.9768211920529801
Completed fold: 7/8... test score: 0.9382716049382716... Train Score: 0.9585406301824212
Completed fold: 8/8... test score: 0.8941176470588236... Train Score: 0.9855072463768116
Score: [0.946236559139785, 0.9438202247191011, 0.8888888888888888, 0.9195402298850575, 0.9397590361445783, 0.9111111111111111, 0.9382716049382716, 0.8941176470588236]
Final score: 0.9227181627357022


### KFold with `cross_val_score`, without shuffling

In [7]:

# Unshuffled 9-fold cross-validation: no shuffle argument is passed to KFold.
rf_kfold = KFold(n_splits=9)

# NOTE: despite its name, `regressor` holds a RandomForestClassifier here.
# The name is kept unchanged in case other cells refer to it.
regressor = RandomForestClassifier(random_state=23)
# Pass the splitter object itself as `cv` instead of rf_kfold.split(X):
# the generator from .split() can only be consumed once, while the splitter
# can be reused and yields the same folds.
final_score = cross_val_score(estimator=regressor, X=X, y=y,
                scoring="f1", cv=rf_kfold, n_jobs=1,
                verbose=3).mean()
final_score

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.889) total time=   0.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] END ................................ score: (test=0.921) total time=   0.2s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] END ................................ score: (test=0.949) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.966) total time=   0.2s
[CV] END ................................ score: (test=0.988) total time=   0.2s
[CV] END ................................ score: (test=0.970) total time=   0.2s
[CV] END ................................ score: (test=0.979) total time=   0.3s
[CV] END ................................ score: (test=1.000) total time=   0.4s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.9s finished


0.9607395775699012

### STRATIFIED 