### Hyperparameter Tuning

Hyperparameter tuning = choosing the optimal set of hyperparameters for a model. <br>

Hyperparameter = a parameter that controls the learning process; it is external to the model and cannot be estimated from the data, for example, `n_estimators` for random forests. <br>

Param = Model parameter that is estimated from the data, for example, model weights. <br>

#### Grid Search 
Evaluates every possible combination of the candidate hyperparameter values listed in the grid. <br>

#### Randomized Search
We evaluate a few combinations selected at random instead of searching through every possible combination.<br>
Helps to find good hyperparameters quickly, but may miss the best combination because not every combination is evaluated.

In [14]:
# Setup: load the Adult Income CSV, one-hot encode it, split features/target,
# and create the base random forest that the searches below tune.
import pandas as pd
# Lift pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

data = pd.read_csv("../data/04 - decisiontreeAdultIncome.csv")
# One-hot encode the categorical columns; drop_first=True drops one level per
# category so the dummies are not redundant.
data_prep = pd.get_dummies(data,drop_first=True)
# Features = every column except the last; target = the last column.
# NOTE(review): assumes the last encoded column is the income label — confirm against the CSV.
x = data_prep.iloc[:,:-1]
y = data_prep.iloc[:,-1]

from sklearn.ensemble import RandomForestClassifier
# Fixed random_state so the forest is reproducible across runs.
rfc = RandomForestClassifier(random_state=2)

In [4]:
# ---- Grid search: exhaustively evaluate every hyperparameter combination ----

from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 3 * 2 * 5 = 30 combinations in total.
rfc_param = dict(
    n_estimators=[10, 15, 20],
    min_samples_split=[8, 16],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

# Each of the 30 combinations is scored with 10-fold cross-validation,
# i.e. 30 * 10 = 300 model fits (scikit-learn then refits the best
# combination once on the full data, since refit is left at its default).
# Train-fold scores are kept too so they can be compared with the
# validation-fold scores in cv_results_.
rfc_grid = GridSearchCV(
    estimator=rfc,
    param_grid=rfc_param,
    scoring="accuracy",
    cv=10,
    return_train_score=True,
)

In [5]:
# Fit data to grid search object: cross-validates every combination in the grid on (x, y).
# NOTE: per scikit-learn's estimator API, fit() returns the estimator itself,
# so rfc_grid_fit should refer to the same object as rfc_grid.
rfc_grid_fit = rfc_grid.fit(x,y)

In [23]:
# cv_results_ holds, for each combination tried, its parameters, per-fold scores and timings.
rfc_results = rfc_grid_fit.cv_results_

In [24]:
# Tabulate the grid-search results: one row per hyperparameter combination.
# A dedicated name is used instead of reassigning `data`, which holds the raw
# dataset loaded at the top of the notebook; overwriting it would make the
# preprocessing cell fail (or silently encode the wrong frame) when re-run.
# The four timing columns are dropped by name rather than by position, leaving
# only the parameter and score columns.
cv_grid_results = pd.DataFrame(rfc_results).drop(
    columns=["mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time"]
)
cv_grid_results.head()
# std_train_score = standard deviation of the training score across the folds
# rank_test_score = rank of each combination by mean test score under the
#                   scoring parameter specified ("accuracy" in this case); 1 = best

Unnamed: 0,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,1,8,10,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.802425,0.794846,0.816574,0.794341,0.811521,0.810005,0.812532,0.817998,0.804853,0.805865,0.807096,0.007794,30,0.871687,0.873315,0.871181,0.873652,0.871968,0.871013,0.871238,0.870459,0.872873,0.872031,0.871942,0.000994
1,1,8,15,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.798383,0.793835,0.818595,0.79333,0.813542,0.81051,0.8095,0.818504,0.810415,0.804853,0.807147,0.008824,29,0.873372,0.874719,0.873428,0.875393,0.872361,0.873484,0.872866,0.872199,0.872817,0.872649,0.873329,0.000969
2,1,8,20,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.799394,0.791309,0.818595,0.791814,0.814553,0.810005,0.814047,0.819515,0.811931,0.802831,0.807399,0.009908,28,0.873091,0.875955,0.873652,0.876404,0.873596,0.873765,0.873372,0.872873,0.87394,0.874052,0.87407,0.001113
3,1,16,10,"{'min_samples_leaf': 1, 'min_samples_split': 1...",0.805457,0.795856,0.817585,0.797878,0.813037,0.81809,0.815563,0.82002,0.813953,0.811426,0.810887,0.008006,27,0.857592,0.8604,0.856525,0.859558,0.857536,0.857199,0.85675,0.856084,0.85833,0.856982,0.857696,0.001298
4,1,16,15,"{'min_samples_leaf': 1, 'min_samples_split': 1...",0.80192,0.795856,0.823143,0.792319,0.819606,0.81809,0.820616,0.818504,0.812437,0.81547,0.811796,0.010474,26,0.858715,0.861748,0.856357,0.861635,0.859333,0.857255,0.857817,0.856533,0.858779,0.859172,0.858734,0.001777


In [22]:
# Best combination of params found by the grid search.
# The matching row(s) can also be pulled from the results table, e.g.:
# pd.DataFrame(rfc_grid_fit.cv_results_).query("rank_test_score == 1")
rfc_grid_fit.best_params_

{'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 10}

In [26]:
####### Randomized Search ########
from sklearn.model_selection import RandomizedSearchCV

# Search space: 3 * 2 * 5 = 30 possible combinations.
rfc_param = {"n_estimators":[10,15,20],
            "min_samples_split":[8,16],
            "min_samples_leaf":[1,2,3,4,5],
            }

# Select random combinations from the 30 possible combinations.
# n_iter specifies how many combinations we want to try (10 here, each scored
# with 10-fold cross-validation).
# scoring is set explicitly so this search is scored the same way as the grid
# search; accuracy is also what a classifier's default score computes, so the
# results are unchanged.
# random_state fixes which combinations are sampled, making the run reproducible.
rfc_rs = RandomizedSearchCV(estimator = rfc,
                           param_distributions = rfc_param,
                           scoring = "accuracy",
                           cv = 10,
                           n_iter = 10,
                           return_train_score = True,
                           random_state = 1)

In [28]:
# Run the randomized search on (x, y), then tabulate its results:
# one row per sampled hyperparameter combination.
rfc_rs_fit = rfc_rs.fit(x,y)
rfc_rs_results = rfc_rs_fit.cv_results_
cv_rs_results = pd.DataFrame(rfc_rs_results)

In [29]:
# Preview the first five sampled combinations with their timings and scores.
cv_rs_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.111875,0.0013,0.005601,0.00049,20,16,3,"{'n_estimators': 20, 'min_samples_split': 16, ...",0.813542,0.806973,0.819101,0.806468,0.822132,0.826175,0.814047,0.82457,0.817998,0.81092,0.816193,0.006602,6,0.845631,0.847821,0.844845,0.848607,0.84535,0.846417,0.845631,0.84491,0.848335,0.846819,0.846437,0.001331
1,0.057552,0.002518,0.003301,0.000459,10,16,4,"{'n_estimators': 10, 'min_samples_split': 16, ...",0.815563,0.806973,0.815563,0.807479,0.822638,0.82567,0.811521,0.822042,0.823054,0.808898,0.81594,0.006699,8,0.841813,0.841363,0.841027,0.843048,0.840072,0.841981,0.841307,0.840867,0.842215,0.842383,0.841608,0.000814
2,0.088204,0.000397,0.004702,0.000458,15,16,2,"{'n_estimators': 15, 'min_samples_split': 16, ...",0.817079,0.808489,0.824154,0.800404,0.822638,0.813542,0.822132,0.824065,0.822042,0.810415,0.816496,0.00765,5,0.851415,0.851247,0.850348,0.851527,0.848439,0.851247,0.85046,0.850469,0.849683,0.850357,0.850519,0.000893
3,0.083498,0.001118,0.004602,0.000491,15,8,4,"{'n_estimators': 15, 'min_samples_split': 8, '...",0.812532,0.80091,0.821122,0.810005,0.818595,0.828701,0.808994,0.827604,0.81547,0.816481,0.816041,0.008088,7,0.843048,0.84681,0.844396,0.846867,0.845463,0.844901,0.844059,0.844573,0.845022,0.845528,0.845067,0.001116
4,0.114298,0.001005,0.005401,0.00049,20,8,3,"{'n_estimators': 20, 'min_samples_split': 8, '...",0.808994,0.802425,0.820111,0.80192,0.824154,0.824154,0.817585,0.825581,0.816481,0.812437,0.815384,0.008276,9,0.850966,0.854223,0.849337,0.85383,0.852538,0.85046,0.851415,0.850862,0.851592,0.851255,0.851648,0.001422


In [30]:
# Best hyperparameter combination among those sampled by the randomized search.
rfc_rs_fit.best_params_

{'n_estimators': 10, 'min_samples_split': 8, 'min_samples_leaf': 5}