In [None]:
#
#
# Cross Validation
#====================
# We can find the practical thresholds (CL,SL) for the given dataset using cross_val_score
# We can find the best samples that may achieve the identified thresholds using
# KFold - Applicable for Regression and Classification
# StratifiedShuffleSplit - Applicable for Classification (test_size)
# ShuffleSplit - Applicable for Regression (test_size)
#
#

In [None]:
#What is Hyperparameter?
#
# They are function parameters that we set in the MODEL CONSTRUCTOR durig the initialization phase.
#
# The goal of Hyperparameter  tuning is to identify the best hyperparamters's value that can achieve an optimal model

In [1]:
import numpy as np
import pandas as pd

In [2]:
data= pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')

In [3]:
features = data.iloc[:,[0,1,2,3]].values
label = data.iloc[:,[4]].values

**Guideline **

Cross-validation is applied on the entire dataset


1.   Get the minimum score thresold
2.   To understand the optimal score you can achieve from the dataset
3.   To extract the best training sample that can give optimal score



In [4]:

import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

modelAlgo= KNeighborsClassifier()

scores = cross_val_score(modelAlgo,
                         features,
                         label,
                         cv=5) #cv value can be any natural number. However most used values are 5 and 10

scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [5]:
#Threshold
scores.mean()

0.9733333333333334

In [None]:
# Goal : To identify the best Hyperparameter combinations for KNN that can give me score of > 0.97

In [None]:
'''
KNeighborsClassifier(
n_neighbors=5, ------------ Postive Integers
weights='uniform',  ------- ‘uniform’, ‘distance’
algorithm='auto', --------- ‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’
leaf_size=30, ------------- Positive Integers
p=2,  --------------------- 1,2 (When metric is minkowski else this parameter has no usage)
metric='minkowski', ------- 'minkowski'‘cityblock’‘cosine’'euclidean’‘haversine’‘l1’‘l2’‘manhattan’‘nan_euclidean’

)
'''

In [6]:
#Method1: GridSearch


#Step1: Create parameter list and parameter grid

weightParameter = ['uniform','distance']
n_neighborsParameter = np.arange(1,31)
algorithmParameter = ['auto', 'ball_tree', 'kd_tree', 'brute']
metricParameter = ['minkowski','cityblock','cosine','euclidean','haversine','l1','l2','manhattan','nan_euclidean']
pParameter=np.arange(1,3)


#For paramGrid, key=parameterName value=validParameterArguments

paramGrid = dict(n_neighbors=n_neighborsParameter,
                 weights=weightParameter,
                 algorithm=algorithmParameter,
                 p=pParameter,
                 metric=metricParameter)


#Step2: initialize algo

from sklearn.neighbors import KNeighborsClassifier
modelGridSearch = KNeighborsClassifier()


#Step3: Perform Grid Search

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(modelGridSearch,
                    param_grid=paramGrid,
                    cv=5)
                    #verbose=3)

#Step4: Execute grid search

grid.fit(features,label)

In [7]:
grid.best_score_

0.9866666666666667

In [8]:
grid.best_estimator_

In [10]:
finalModel = KNeighborsClassifier(algorithm='auto', metric='minkowski', n_neighbors=10, p=2, weights='distance')
finalModel.fit(features,label)

In [11]:
finalModel.score(features,label)

1.0

In [12]:
grid.best_params_

{'algorithm': 'auto',
 'metric': 'minkowski',
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [13]:
# Randomized Search Method --- A highlevel search would be done.

#Method2: RandomizedSearch


#Step1: Create parameter list and parameter grid

weightParameter = ['uniform','distance']
n_neighborsParameter = np.arange(1,31)
algorithmParameter = ['auto', 'ball_tree', 'kd_tree', 'brute']
metricParameter = ['minkowski','cityblock','cosine','euclidean','haversine','l1','l2','manhattan','nan_euclidean']
pParameter=np.arange(1,3)




paramGrid = dict(n_neighbors=n_neighborsParameter,
                 weights=weightParameter,
                 algorithm=algorithmParameter,
                 p=pParameter,
                 metric=metricParameter)


#Step2: initialize algo

from sklearn.neighbors import KNeighborsClassifier
modelGridSearch = KNeighborsClassifier()


#Step3: Perform randomized Search

from sklearn.model_selection import RandomizedSearchCV

grid = RandomizedSearchCV(modelGridSearch,
                    param_distributions=paramGrid,
                    cv=5)

#Step4: Execute grid search

grid.fit(features,label)

In [14]:
grid.best_score_

0.9733333333333334

In [15]:
grid.best_params_

{'weights': 'uniform',
 'p': 1,
 'n_neighbors': 13,
 'metric': 'euclidean',
 'algorithm': 'ball_tree'}

In [None]:
# 1. Get the thresholds using cross_val_score
# 2. Get the best score using RandomizedSearch Method. If the best score surpasses my threshold, extract best params
#    and create model
# 3. If not, go for Grid Search and get the best score. If score greater than my threshold, extract best params
#    and create model.
# 4. If not, use my random_state technique to search for best score. If score greater than my threshold, create model
# 5. If not, Change algo and repeat all above four steps.
# 6. If not, add more data and repeat all 5 steps
# 7. If not, declare data as UNFIT and get your cheque :)

In [1]:
#  for each algo in default mode:
#        cross val score
#  Get the list of all values and print max


# thresholds=max(value)

# collection of list of ML algos
# collection of each algo parameters
# iterate:
#        gridSearch(each ML)
#        break if thresholds reaches
# else:
#       data is unfit

In [None]:
#