In [1]:
import pandas as pd
import numpy as np

In [2]:
data= pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')

In [3]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values

In [5]:
# Guideline
# Cross-validation is applied on entire dataset (Pre-modelling | EDA)
# 1. Get the score threshold/benchmark
# 2. To understand the optimal score possible
# 3. To extract the best training sample that may provide the optimal score

In [6]:
#List of algos for Classification we learned till date
# 1. LogisticRegression
# 2. KNeighborsClassifier
# 3. DecisionTreeClassifier
# 4. RandomForestClassifier
# 5. BaggingClassifier with LogisticRegression
# 6. BaggingClassifier with KNeighborsClassifier
# 7. BaggingClassifier with SVC
# 8. SVC


#Output in a Pandas DF
# Algorithm | CV value | CL | SL | Optimal Score

#CV=5 and CV=10

In [9]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


model = LogisticRegression()

scores = cross_val_score(model,
                         features,
                         label,
                         cv=5) 
#In Theory, CV value can be any natural number. However most used values are 5 and 10.
print(classification_report(label,model.predict(features)))

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [15]:
scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [None]:
# Benchmark Score
print("Minimum Threshold score (CL) : ", scores.mean())
print("Suggested SL value for the given dataset is ", 1 - scores.mean())

Minimum Threshold score (CL) :  0.9733333333333334
Suggested SL value for the given dataset is  0.026666666666666616


In [None]:
# Benchmark Score
print("Minimum Threshold score (CL) : ", scores.min())
print("Suggested SL value for the given dataset is ", 1 - scores.min())

Minimum Threshold score (CL) :  0.9333333333333333
Suggested SL value for the given dataset is  0.06666666666666665


In [None]:
# Optimal Score
print("Possible Optimal Score is ", scores.max())

Possible Optimal Score is  1.0


## **StratifiedShuffleSplit**

In [24]:
#This method allows user to set the test_size


CL = scores.mean()

#Step1: Initialize Algo
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

#Step2: Initialize StratifiedShuffleSplit Cross Validation
from sklearn.model_selection import StratifiedShuffleSplit
kfold = StratifiedShuffleSplit(n_splits=5, #This value MUST be equal to the cv value of cross_val_score
              test_size=0.2,
              random_state=1) #This random state is to help me reproduce my same output

#Step3: Initialize tracker to track the best sample

tracker=0

#here split function returns the row index location of the dataset
for train,test in kfold.split(features,label):
  tracker +=1

  X_train,X_test = features[train],features[test]
  y_train,y_test = label[train],label[test]

  model.fit(X_train,y_train)

  if model.score(X_test,y_test) >= CL:
    print("Test Score {} Train Score {} for Sample Split {}".format(model.score(X_test,y_test),model.score(X_train,y_train),tracker))



Test Score 1.0 Train Score 0.9666666666666667 for Sample Split 3


In [25]:
#Extract Best Sample

#Step1: Initialize Algo
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

#Step2: Initialize K-Fold Cross Validation
from sklearn.model_selection import StratifiedShuffleSplit
kfold = StratifiedShuffleSplit(n_splits=5, #This value MUST be equal to the cv value of cross_val_score
              test_size=0.2,
              random_state=1) #This random state is to help me reproduce my same output

#Step3: Initialize tracker to track the best sample

tracker=0

#here split function returns the row index location of the dataset
for train,test in kfold.split(features,label):
  tracker +=1
  if tracker == 3:
    X_train,X_test,y_train,y_test=features[train],features[test],label[train],label[test]



In [26]:
len(X_train)

120

In [27]:
len(X_test)

30

In [None]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py