In [2]:
import pandas as pd
import numpy as np

In [3]:
# Iris measurements (four numeric columns plus a species label), read from a
# gist URL that embeds a revision hash in its raw path.
iris_csv_url = (
    'https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/'
    '639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv'
)
data = pd.read_csv(iris_csv_url)

In [4]:
# Preview the first five rows to confirm the columns parsed as expected.
data.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Split the frame by position: every column except the last holds a predictor,
# the last column (species) is the class label. Both are stored as arrays so
# they can be indexed with the row positions produced by the CV splitters.
predictor_columns = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
features = predictor_columns.values
label = target_column.values

In [None]:
# Guideline
# Cross-validation is run on the entire dataset before modelling (as part of EDA) in order to:
# 1. Get the benchmark (minimum threshold) score
# 2. Understand the best score that is realistically achievable
# 3. Extract the training sample (split) most likely to deliver that best score

In [None]:
# Classification algorithms covered so far
# 1. LogisticRegression
# 2. KNeighborsClassifier
# 3. DecisionTreeClassifier
# 4. RandomForestClassifier
# 5. BaggingClassifier with LogisticRegression
# 6. BaggingClassifier with KNeighborsClassifier
# 7. BaggingClassifier with SVC
# 8. SVC


# Expected output: a pandas DataFrame with the columns
# Algorithm | CV value | CL | SL | Optimal Score

# Evaluate every algorithm with CV=5 and CV=10

In [6]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Silence only the solver's convergence notices. A blanket
# warnings.filterwarnings("ignore") would hide every warning raised anywhere in
# the notebook for the rest of the session (deprecations, dtype problems, ...),
# which makes real issues invisible.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Baseline estimator with default hyper-parameters.
model = LogisticRegression()

# One score per fold. cv may be any integer >= 2; 5 and 10 are the values
# most commonly used in practice.
scores = cross_val_score(model, features, label, cv=5)

In [7]:
# Show the array returned by cross_val_score: one score per fold (cv=5).
scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [None]:
# Benchmark Score
print("Minimum Threshold score (CL) : ", scores.mean())
print("Suggested SL value for the given dataset is ", 1 - scores.mean())

In [None]:
# Benchmark Score
print("Minimum Threshold score (CL) : ", scores.min())
print("Suggested SL value for the given dataset is ", 1 - scores.min())

In [None]:
# Optimal Score
print("Possible Optimal Score is ", scores.max())

In [None]:
# Seed NumPy's global generator so the draw below is repeatable, then sample
# a single integer with low=1 and high=10 to demonstrate the effect of seeding.
np.random.seed(seed=1)
np.random.randint(low=1, high=10, size=1)

In [8]:
# 3. To extract the best training sample that may provide the optimal score

# Threshold a split has to reach: the mean of the cross-validation scores.
CL = scores.mean()

# Step 1: the estimator that is refit on every split
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Step 2: shuffled K-fold splitter. n_splits mirrors the cv value passed to
# cross_val_score; the fixed random_state makes the splits reproducible.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Step 3: walk through the splits, numbering them from 1 in `tracker`.
# split() yields row positions, which index straight into the arrays.
for tracker, (train, test) in enumerate(kfold.split(features), start=1):
    X_train, y_train = features[train], label[train]
    X_test, y_test = features[test], label[test]

    model.fit(X_train, y_train)

    # Report only the splits whose held-out score reaches the threshold.
    test_score = model.score(X_test, y_test)
    if test_score >= CL:
        train_score = model.score(X_train, y_train)
        print("Test Score {} Train Score {} for Sample Split {}".format(test_score, train_score, tracker))



Test Score 1.0 Train Score 0.9666666666666667 for Sample Split 5


In [9]:
# Extract Best Sample

# Step 1: fresh, unfitted estimator for the chosen sample
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Step 2: rebuild the splitter with the same settings (n_splits, shuffle,
# random_state) as the search cell so the splits come out identical.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Step 3: keep the train/test arrays of split number 5, the split the search
# printed as reaching the threshold. split() yields row positions.
for tracker, (train, test) in enumerate(kfold.split(features), start=1):
    if tracker != 5:
        continue
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]



In [None]:
# Preview the first rows only; echoing the whole training array floods the
# cell output without adding information.
X_train[:5]

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Compare several classifiers under 5- and 10-fold cross-validation and collect
# the mean score of every (model, cv) pair in a single DataFrame.
#
# random_state is pinned on the tree and bagging estimators so that re-running
# the cell reproduces the same table (1 matches the seed used for the
# splitters elsewhere in this notebook).
# NOTE(review): RandomForestClassifier and Bagging(SVC) appear in the algorithm
# list in the notes cell but are not evaluated here - confirm whether intended.
results = []
models = [
    {'model_name': 'LogisticRegression', 'instance': LogisticRegression()},
    {'model_name': 'KNeighborsClassifier', 'instance': KNeighborsClassifier()},
    {'model_name': 'DecisionTreeClassifier', 'instance': DecisionTreeClassifier(random_state=1)},
    {'model_name': 'SVC', 'instance': SVC()},
    {'model_name': 'Bagging(LogisticRegression)',
     'instance': BaggingClassifier(LogisticRegression(), random_state=1)},
    {'model_name': 'Bagging(KNeighborsClassifier)',
     'instance': BaggingClassifier(KNeighborsClassifier(), random_state=1)},
]
cvs = [5, 10]

# The loop uses its own names (`candidate`, `cv_scores`) on purpose: reusing
# `model` / `scores` would overwrite the notebook-level variables that other
# cells read (e.g. `CL = scores.mean()`).
for candidate in models:
    for cvs_number in cvs:
        cv_scores = cross_val_score(candidate['instance'],
                                    features,
                                    label,
                                    cv=cvs_number)
        results.append({
            'Model': candidate['model_name'],
            'CVS': f'cvs: {cvs_number}',
            'Scores': cv_scores.mean(),
        })

results_df = pd.DataFrame(results)
print(results_df)

# Best model: highest score averaged over the tested cv settings.
best_model = results_df.groupby('Model')['Scores'].mean().idxmax()

# Best cv for that model: the row where that model scored highest, rather than
# simply the first row that happens to belong to it.
best_model_rows = results_df[results_df['Model'] == best_model]
best_cvs = best_model_rows.loc[best_model_rows['Scores'].idxmax(), 'CVS']

print(f'the best model is: {best_model} with a cvs of: {best_cvs}')

                            Model      CVS    Scores
0              LogisticRegression   cvs: 5  0.973333
1              LogisticRegression  cvs: 10  0.973333
2            KNeighborsClassifier   cvs: 5  0.973333
3            KNeighborsClassifier  cvs: 10  0.966667
4          DecisionTreeClassifier   cvs: 5  0.966667
5          DecisionTreeClassifier  cvs: 10  0.953333
6                             SVC   cvs: 5  0.966667
7                             SVC  cvs: 10  0.973333
8     Bagging(LogisticRegression)   cvs: 5  0.960000
9     Bagging(LogisticRegression)  cvs: 10  0.960000
10  Bagging(KNeighborsClassifier)   cvs: 5  0.966667
11  Bagging(KNeighborsClassifier)  cvs: 10  0.966667
the best model is: LogisticRegression with a cvs of: cvs: 5


In [11]:
# Rich display of the comparison table: one row per (model, cv) pair with its mean score.
results_df

Unnamed: 0,Model,CVS,Scores
0,LogisticRegression,cvs: 5,0.973333
1,LogisticRegression,cvs: 10,0.973333
2,KNeighborsClassifier,cvs: 5,0.973333
3,KNeighborsClassifier,cvs: 10,0.966667
4,DecisionTreeClassifier,cvs: 5,0.966667
5,DecisionTreeClassifier,cvs: 10,0.953333
6,SVC,cvs: 5,0.966667
7,SVC,cvs: 10,0.973333
8,Bagging(LogisticRegression),cvs: 5,0.96
9,Bagging(LogisticRegression),cvs: 10,0.96


## **StratifiedShuffleSplit**

In [12]:
# This method allows the user to set the test_size

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

# Step 1: the estimator that is refit on every split
model = LogisticRegression()

# Benchmark (CL): recomputed here for this exact estimator. Reading the
# notebook-global `scores` instead would make the threshold depend on whichever
# cell assigned that variable last, so it could silently belong to another model.
CL = cross_val_score(model, features, label, cv=5).mean()

# Step 2: StratifiedShuffleSplit. n_splits is the number of train/test
# partitions to generate, test_size the held-out fraction of rows, and the
# fixed random_state makes the partitions reproducible.
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

# Step 3: walk through the splits, numbering them from 1 in `tracker`.
# split() yields row positions; the labels are passed in for stratification.
for tracker, (train, test) in enumerate(kfold.split(features, label), start=1):
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]

    model.fit(X_train, y_train)

    # Score the held-out rows once and reuse the value for both the check and
    # the report; only splits reaching the benchmark are printed.
    test_score = model.score(X_test, y_test)
    if test_score >= CL:
        train_score = model.score(X_train, y_train)
        print("Test Score {} Train Score {} for Sample Split {}".format(test_score, train_score, tracker))



Test Score 1.0 Train Score 0.9666666666666667 for Sample Split 3


In [19]:
# Extract Best Sample

# Step 1: fresh, unfitted estimator for the chosen sample
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Step 2: rebuild the StratifiedShuffleSplit splitter with the same settings
# (n_splits, test_size, random_state) as the search cell so the splits match.
from sklearn.model_selection import StratifiedShuffleSplit
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

# Step 3: keep the train/test arrays of split number 3, the split the search
# printed as reaching the threshold. split() yields row positions.
for tracker, (train, test) in enumerate(kfold.split(features, label), start=1):
    if tracker != 3:
        continue
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]



In [20]:
# Number of rows in the extracted training sample.
X_train.shape[0]

120

In [21]:
# Number of rows in the extracted test sample.
X_test.shape[0]

30

In [None]:
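# Further reading: scikit-learn example that visualises how the different cross-validation splitters behave
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py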