# Hyper Parameter Optimization



1. GridSearchCV
2. RandomizedSearchCV
3. Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)
4. Sequential Model Based Optimization(Tuning a scikit-learn estimator with skopt)
5. Optuna- Automate Hyperparameter Tuning
6. Genetic Algorithms (TPOT Classifier)


In [11]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv('/Users/kushagragahlaut/Desktop/code/diabetes2.csv')
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [13]:
df[df['Glucose']==0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
75,1,0,48,20,0,24.7,0.14,22,0
182,1,0,74,20,23,27.7,0.299,21,0
342,1,0,68,35,0,32.0,0.389,22,0
349,5,0,80,32,0,41.0,0.346,37,1
502,6,0,68,41,0,39.0,0.727,41,1


In [24]:
df['Glucose']= np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']= np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])

## Model creation

In [25]:
X = df.drop('Outcome',axis=1)
y = df['Outcome']

In [30]:
print(X.head())
print(y.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0             72             35     30.5  33.6   
1            1     85.0             66             29     30.5  26.6   
2            8    183.0             64              0     30.5  23.3   
3            1     89.0             66             23     94.0  28.1   
4            0    137.0             40             35    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state= 33)

In [35]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction = rf_classifier.predict(X_test)

In [37]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(confusion_matrix(y_test,prediction))
print('\n',accuracy_score(y_test,prediction))
print('\n',classification_report(y_test,prediction))

[[88 11]
 [26 29]]

 0.7597402597402597

               precision    recall  f1-score   support

           0       0.77      0.89      0.83        99
           1       0.72      0.53      0.61        55

    accuracy                           0.76       154
   macro avg       0.75      0.71      0.72       154
weighted avg       0.76      0.76      0.75       154



### Manual Hyperparameter tuning

In [42]:
#not efficient
model = RandomForestClassifier(n_estimators=300, criterion='entropy', max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
prediction = rf_classifier.predict(X_test)
print(confusion_matrix(y_test,prediction))
print('\n',accuracy_score(y_test,prediction))
print('\n',classification_report(y_test,prediction))

[[88 11]
 [26 29]]

 0.7597402597402597

               precision    recall  f1-score   support

           0       0.77      0.89      0.83        99
           1       0.72      0.53      0.61        55

    accuracy                           0.76       154
   macro avg       0.75      0.71      0.72       154
weighted avg       0.76      0.76      0.75       154



## Randomized Search CV

In [44]:
from sklearn.model_selection import RandomizedSearchCV

#number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]

#number of features to consider
max_features = ['sqrt','auto','log2']

#max level of levels in trees
max_depth = [int(x) for x in np.linspace(10, 1000,10)]

# Minimum number of samples required to split a node
min_samples_split = [1,3,5,7,9,11]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]

#create a random grid
random_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'criterion': ['gini','entropy']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['sqrt', 'auto', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 5, 7, 9, 11], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['gini', 'entropy']}


In [46]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,
                                 verbose=2,random_state=100,n_jobs=-1)
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['sqrt', 'auto',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 3, 5, 7, 9,
                                                              11],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   ra

In [47]:
rf_randomcv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 10,
 'criterion': 'gini'}

In [48]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=8,
                       min_samples_split=9, n_estimators=2000)

In [49]:
best_random_grid = rf_randomcv.best_estimator_

In [51]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('\n',accuracy_score(y_test,y_pred))
print('\n',classification_report(y_test,y_pred))

[[85 14]
 [27 28]]

 0.7337662337662337

               precision    recall  f1-score   support

           0       0.76      0.86      0.81        99
           1       0.67      0.51      0.58        55

    accuracy                           0.73       154
   macro avg       0.71      0.68      0.69       154
weighted avg       0.73      0.73      0.72       154



### Grid Search CV

In [52]:
rf_randomcv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 10,
 'criterion': 'gini'}

In [58]:
from sklearn.model_selection import GridSearchCV

param_grid= {
    'criterion' : [rf_randomcv.best_params_['criterion']],
    'max_depth' : [rf_randomcv.best_params_['max_depth']],
    'max_features':[rf_randomcv.best_params_['max_features']],
    'min_samples_leaf':[rf_randomcv.best_params_['min_samples_leaf'],
                        rf_randomcv.best_params_['min_samples_leaf']+2,
                        rf_randomcv.best_params_['min_samples_leaf']-2],
                       
    'min_samples_split':[rf_randomcv.best_params_['min_samples_split'],
                         rf_randomcv.best_params_['min_samples_split']-2,
                         rf_randomcv.best_params_['min_samples_split']-3,
                         rf_randomcv.best_params_['min_samples_split']+1],
                                               
    'n_estimators' : [rf_randomcv.best_params_['n_estimators'],
                       rf_randomcv.best_params_['n_estimators']-400, rf_randomcv.best_params_['n_estimators']-800,
                        rf_randomcv.best_params_['n_estimators']-1200, rf_randomcv.best_params_['n_estimators']-1600,
                        rf_randomcv.best_params_['n_estimators']-2000]
}
print(param_grid)

{'criterion': ['gini'], 'max_depth': [10], 'max_features': ['log2'], 'min_samples_leaf': [8, 10, 6], 'min_samples_split': [9, 7, 6, 10], 'n_estimators': [2000, 1600, 1200, 800, 400, 0]}


In [60]:
#fit the grid search to the data
rf= RandomForestClassifier()
gscv = GridSearchCV(estimator=rf, param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
gscv.fit(X_train,y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [10],
                         'max_features': ['log2'],
                         'min_samples_leaf': [8, 10, 6],
                         'min_samples_split': [9, 7, 6, 10],
                         'n_estimators': [2000, 1600, 1200, 800, 400, 0]},
             verbose=2)

In [62]:
gscv.best_estimator_

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=8,
                       min_samples_split=7, n_estimators=800)

In [63]:
best_grid=gscv.best_estimator_

In [64]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
y_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('\n',accuracy_score(y_test,y_pred))
print('\n',classification_report(y_test,y_pred))

[[86 13]
 [27 28]]

 0.7402597402597403

               precision    recall  f1-score   support

           0       0.76      0.87      0.81        99
           1       0.68      0.51      0.58        55

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154



### Automated Hyperparameter Tuning

In [66]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [67]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),  #choose one
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),     #choose int values between them
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),   #choose float values between them
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [68]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x1360c4da0>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x1360c4f98>,
 'max_features': <hyperopt.pyll.base.Apply at 0x1360cd0f0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x1360cd320>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x1360cd470>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1360cd5c0>}

In [70]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()     #mean of accuracy value after 5 iterations

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [71]:
#minimizing the function value
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [10:02<00:00,  7.53s/trial, best loss: -0.7752898840463814]


{'criterion': 1,
 'max_depth': 1110.0,
 'max_features': 2,
 'min_samples_leaf': 0.03776166443665809,
 'min_samples_split': 0.07106349777929073,
 'n_estimators': 3}

In [74]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
log2
750


In [75]:
best['min_samples_leaf']

0.03776166443665809

In [76]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[88 11]
 [29 26]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.75      0.89      0.81        99
           1       0.70      0.47      0.57        55

    accuracy                           0.74       154
   macro avg       0.73      0.68      0.69       154
weighted avg       0.73      0.74      0.73       154



### Genetic Algorithms

In [77]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [1]:
from tpot import TPOTClassifier


ImportError: cannot import name 'available_if'

In [2]:
import tpot

print(tpot.__version__)


ImportError: cannot import name 'available_if'