## Hyperparameter Tuning

In [2]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

In [5]:
rf=RandomForestClassifier() #check the parameters

In [6]:
rfr=RandomForestRegressor() ##check the parameters

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
import pandas as pd
df=pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
import numpy as np
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [11]:
X=df.drop('Outcome',axis=1)
y=df['Outcome']

In [12]:
print(X.head())
print(y.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0             72           35.0     30.5  33.6   
1            1     85.0             66           29.0     30.5  26.6   
2            8    183.0             64           23.0     30.5  23.3   
3            1     89.0             66           23.0     94.0  28.1   
4            0    137.0             40           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=100)


In [20]:
rf_classifier=RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction=rf_classifier.predict(X_test)

In [21]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [22]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[83 18]
 [27 26]]
0.7077922077922078
              precision    recall  f1-score   support

           0       0.75      0.82      0.79       101
           1       0.59      0.49      0.54        53

    accuracy                           0.71       154
   macro avg       0.67      0.66      0.66       154
weighted avg       0.70      0.71      0.70       154



In [29]:
model=RandomForestClassifier(n_estimators=250,criterion='gini',max_features='sqrt',
                             min_samples_leaf=10,random_state=100).fit(X_train,y_train)
prediction=model.predict(X_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[84 17]
 [27 26]]
0.7142857142857143
              precision    recall  f1-score   support

           0       0.76      0.83      0.79       101
           1       0.60      0.49      0.54        53

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.71       154



## Randomized SerarchCV

In [34]:
from sklearn.model_selection import RandomizedSearchCV

n_estimater=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]

max_feature=['auto','sqrt','log2'] #Number of feature consider at every split

max_depth=[int(x) for x in np.linspace(10,1000,10)]

min_samples_split=[2,5,6,8,10]

min_sample_leaf=[1,2,3,4,5,6] #Minimum sample required for each split

random_grid={
    'n_estimators':n_estimater,
    'max_features':max_feature,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_sample_leaf,
    'criterion':['entropy','gini'] 
}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 6, 8, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6], 'criterion': ['entropy', 'gini']}


In [36]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                              random_state=20,n_jobs=-1)
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                                        'min_samples_split': [2, 5, 6, 8, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=20, verbose=2)

In [38]:
rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 230,
 'criterion': 'gini'}

In [39]:
best_random_grid=rf_randomcv.best_estimator_

In [40]:
y_predict=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))

[[83 18]
 [25 28]]
0.7207792207792207
              precision    recall  f1-score   support

           0       0.77      0.82      0.79       101
           1       0.61      0.53      0.57        53

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.71      0.72      0.72       154



## Grid SearchCV

In [42]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'criterion':[rf_randomcv.best_params_['criterion']],
    'n_estimators':[rf_randomcv.best_params_['n_estimators'],
                        rf_randomcv.best_params_['n_estimators']+200,
                        rf_randomcv.best_params_['n_estimators']-200,
                        rf_randomcv.best_params_['n_estimators']+400,
                        rf_randomcv.best_params_['n_estimators']-400,
                        rf_randomcv.best_params_['n_estimators']+600],
    'max_features':[rf_randomcv.best_params_['max_features']],
    'max_depth':[rf_randomcv.best_params_['max_depth']],
    'min_samples_split':[rf_randomcv.best_params_['min_samples_split'],
                        rf_randomcv.best_params_['min_samples_split']+2,
                        rf_randomcv.best_params_['min_samples_split']+4,
                        rf_randomcv.best_params_['min_samples_split']+6],
    'min_samples_leaf':[rf_randomcv.best_params_['min_samples_leaf'],
                        rf_randomcv.best_params_['min_samples_leaf']+2,
                        rf_randomcv.best_params_['min_samples_leaf']+4,
                        rf_randomcv.best_params_['min_samples_leaf']+6]
    
}

print(param_grid)

{'criterion': ['gini'], 'n_estimators': [600, 800, 400, 1000, 200, 1200], 'max_features': ['log2'], 'max_depth': [230], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 3, 5, 7]}


In [43]:
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  6.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [230],
                         'max_features': ['log2'],
                         'min_samples_leaf': [1, 3, 5, 7],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [600, 800, 400, 1000, 200, 1200]},
             verbose=2)

In [46]:
best_grid=grid_search.best_estimator_
best_grid

RandomForestClassifier(max_depth=230, max_features='log2', min_samples_leaf=7,
                       n_estimators=200)

In [47]:
y_predict=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))

[[82 19]
 [25 28]]
0.7142857142857143
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       101
           1       0.60      0.53      0.56        53

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154



 ## Automated Hyperparameter Tuning
     - Bayesain Optimization
     - Gredient Descent
     - Evolutionary Algorithams
 

In [48]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials 

In [63]:
space={
    'criterion':hp.choice('criterion',['entropy','gini']),
    'max_depth':hp.quniform('max_depth',10,1200,10) ,
    'max_features':hp.choice('max_features',['auto','sqrt','log2',None]),
    'min_sample_leaf':hp.uniform('min_sample_leaf',0,0.5),
    'min_sample_split':hp.uniform('min_sample_split',0,1),
    'n_estimators':hp.choice('n_estimators',[10,50,300,600,1200,1500])
    
}

In [64]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x7ff2c78ad350>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7ff2b7b48e90>,
 'max_features': <hyperopt.pyll.base.Apply at 0x7ff2b7b48690>,
 'min_sample_leaf': <hyperopt.pyll.base.Apply at 0x7ff2b61f8d10>,
 'min_sample_split': <hyperopt.pyll.base.Apply at 0x7ff2c79c5f10>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7ff2c79c50d0>}

In [65]:
from sklearn.model_selection import cross_val_score

In [72]:
def objective(space):
    model=RandomForestClassifier(criterion=space['criterion'],
                                max_depth=space['max_depth'],
                                max_features=space['max_features'],
                                min_samples_leaf=space['min_sample_leaf'],
                                min_samples_split=space['min_sample_split'],
                                n_estimators=space['n_estimators'],
                                 n_jobs=-1
                                )
    accuracy=cross_val_score(model,X_train,y_train,cv=5).mean()
    
    return {'loss':-accuracy,'status':STATUS_OK}

In [73]:
trails=Trials()
best=fmin(fn=objective,
         space=space,
         algo=tpe.suggest,
         max_evals=80,
         trials=trails)
best

100%|██████████| 80/80 [07:49<00:00,  5.87s/trial, best loss: -0.7801012928162068]


{'criterion': 0,
 'max_depth': 770.0,
 'max_features': 3,
 'min_sample_leaf': 0.009139678574712495,
 'min_sample_split': 0.09547728341668901,
 'n_estimators': 2}

In [74]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 600, 4: 1200,5:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

entropy
None
300


In [79]:
best['min_sample_leaf']

0.009139678574712495

In [82]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_sample_leaf'], 
                                       min_samples_split = best['min_sample_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[82 19]
 [26 27]]
0.7077922077922078
              precision    recall  f1-score   support

           0       0.76      0.81      0.78       101
           1       0.59      0.51      0.55        53

    accuracy                           0.71       154
   macro avg       0.67      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

