In [3]:
import time

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

In [None]:
# Parameters to tune 
# oob_score (True), n_estimators, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, 
# max_features, max_leaf_nodes,min_impurity_decrease, min_impurity_split, bootstrap

# Instructions: You need X_train and y_train. With oob_score=True, RandomForestRegressor scores itself on
# out-of-bag samples (a built-in form of validation), so there's no need to create a hold-out set. You only
# need one when comparing to other models.

# In my experience, it's beneficial to tune max_depth and n_estimators first, and then tune the rest
# of the hyperparameters. Test increasing n_estimators.

In [None]:
# RFR GridSearchCV
# Exhaustive grid search over four RandomForestRegressor hyperparameters.
# X_train / y_train must already exist in the kernel when this cell runs.
model_rfr = RandomForestRegressor(random_state=42, verbose=1)

param_grid = dict(
    max_depth=[11, 12, 13],
    min_samples_split=[115, 117, 119],
    min_samples_leaf=[47, 48, 49],
    max_leaf_nodes=[4340, 4346, 4350],
)

# 3-fold cross-validation scored with negated mean squared error, on all cores.
gs_cv_rfr = GridSearchCV(
    model_rfr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=10,
)
gs_cv_rfr.fit(X_train, y_train)

print(gs_cv_rfr.best_estimator_)
# Full per-candidate results table as the cell output.
pd.DataFrame(gs_cv_rfr.cv_results_)


In [None]:
#### RFR RANDOMIZEDSEARCH ####
# Randomized search over a narrow window of candidate values for each hyperparameter.
# NOTE(review): `data`, `col_target` and `col_id` are not defined in this cell; they are
# assumed to come from earlier setup -- confirm before a Restart & Run All.
y_train = data.train_df[col_target]
X_train = data.train_df.drop([col_target, col_id], axis=1)

# Seed the forest as well as the search so repeated runs produce the same ranking.
rfr = RandomForestRegressor(random_state=42)

n_estimators = [34, 35, 36]
max_depth = [12, 13, 14]
min_samples_split = [175, 176, 177]
min_samples_leaf = [76, 77, 78]
min_weight_fraction_leaf = [0.000092, 0.000093, 0.000094]
# 'auto' is no longer accepted by current scikit-learn. For a regressor it meant
# "use all features", which is what None means, so None is the portable spelling.
max_features = ['sqrt', None]
max_leaf_nodes = [5117, 5118, 5119]
min_impurity_decrease = [0.0097, 0.0098, 0.0099]

hyperparameters = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
                       min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
                       max_features=max_features, max_leaf_nodes=max_leaf_nodes,
                       min_impurity_decrease=min_impurity_decrease
                      )

clf = RandomizedSearchCV(rfr, hyperparameters, random_state=42, cv=3, verbose=1, n_jobs=-1)
best_model = clf.fit(X_train, y_train)

# best hyper parameters, reported in the same order the search space was declared
for name in hyperparameters:
    print('Best ' + name + ':', best_model.best_params_[name])

In [None]:
##### Manual Tuning Below #####

In [None]:
# rfrbest max_depth=13 COMPLETE
# Sweep max_depth, recording the out-of-bag score and the fit time for each value.
list_values = range(12, 15)
hyperp = 'max_depth'
results = []
for val in list_values:
    start_time = time.time()
    # Fixed seed: without it, differences between candidate values can be
    # run-to-run noise rather than an effect of the hyperparameter.
    rfr = RandomForestRegressor(max_depth=val, random_state=42,
                                verbose=0, n_jobs=-1, oob_score=True)

    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))
results = pd.DataFrame(results).set_index(hyperp).sort_index()
results['oob_score'].plot(title=hyperp + ' oob score')
results

In [None]:
# rfrbest n_estimators=37 COMPLETE
# Sweep n_estimators at max_depth=13, recording the out-of-bag score and fit time.
list_values = range(10, 100, 25)
hyperp = 'n_estimators'
results = []
for val in list_values:
    start_time = time.time()
    # Fixed seed so the comparison across forest sizes is not confounded by
    # a different random forest being grown on every run.
    rfr = RandomForestRegressor(n_estimators=val, max_depth=13, random_state=42,
                                verbose=0, n_jobs=-1, oob_score=True)

    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))
results = pd.DataFrame(results).set_index(hyperp).sort_index()
results['oob_score'].plot(title=hyperp + ' oob score')
results

In [None]:
# rfrbest min_samples_split = [.00005,.0001,.00015] COMPLETE score=0.748418
# Sweep min_samples_split expressed as a float (scikit-learn treats a float here
# as a fraction of the number of samples), with max_depth and n_estimators fixed.
# Earlier sweep, superseded by the list below: [1e-3, 1e-4, 1e-4, 1e-5]
list_values = [0.0001,0.0002,0.0003,0.0004,0.0005,0.0006,0.0007,0.0008,0.0009,0.002]
# list_values = [.00019,.0002,.000211,.00022,.00023,.00024,.00025,.00026,.00027,.00028,.00029,.0003]
# list_values = [.000211,.000212,.000213,.000214,.000215,.000216,.000217,.000218,.000219]
# list_values = [.000219,.000220,.000221,.000222,.000223,.000224,.000225,.000226,.000227]
hyperp = 'min_samples_split'
results = []
for val in list_values:
    # Fixed seed so score differences reflect the hyperparameter, not the random draw.
    rfr = RandomForestRegressor(min_samples_split=val, max_depth=13, n_estimators=37,
                                random_state=42, verbose=False, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
    })
results = pd.DataFrame(results).set_index(hyperp).sort_index()
results.plot(title=hyperp + ' oob score')
results

In [None]:
# rfrbest min_samples_split=[114,115,116] COMPLETE
# Sweep min_samples_split as an absolute sample count, with max_depth and
# n_estimators fixed.
# Earlier, coarser sweep, superseded by the range below: range(2,150,10)
list_values = range(105, 120)
hyperp = 'min_samples_split'
results = []
for val in list_values:
    # Fixed seed so score differences reflect the hyperparameter, not the random draw.
    rfr = RandomForestRegressor(min_samples_split=val, max_depth=13, n_estimators=37,
                                random_state=42, verbose=True, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
    })
results = pd.DataFrame(results).set_index(hyperp).sort_index()
results.plot(title=hyperp + ' oob score')
results

In [None]:
# rfr +maxdepth COMPLETE
# best min_samples_leaf=[47,48,49]                                                    ###
# Sweep min_samples_leaf at max_depth=13, recording the out-of-bag score and fit time.
hyperp = 'min_samples_leaf'                                                 ###
# Earlier, coarser sweeps, superseded by the range below:
#   [50,100,150,200,25,300,350]
#   range(40,80,5)
list_val = range(46, 52)

results = []
for val in list_val:
    start_time = time.time()
    # Fixed seed so score differences reflect the hyperparameter, not the random draw.
    rfr = RandomForestRegressor(min_samples_leaf=val, max_depth=13,     ###
                                random_state=42, verbose=True, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))

results = pd.DataFrame(results).set_index(hyperp).sort_index()
# The plotted series is the out-of-bag score, not an MSE, so label it as such.
results[['oob_score']].plot(title=hyperp + ' oob score');

In [None]:
# rfr +maxdepth 
# best min_weight_fraction_leaf=                                                    ###
# Sweep min_weight_fraction_leaf at max_depth=13, recording the out-of-bag score
# and fit time.
hyperp = 'min_weight_fraction_leaf'                                                 ###
# Earlier, coarser sweep, superseded by the list below:
#   [0,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9]
list_val = [1e-4,2e-4,3e-4,4e-4,5e-4,6e-4,7e-4,8e-4,9e-4]

results = []
for val in list_val:
    start_time = time.time()
    rfr = RandomForestRegressor(min_weight_fraction_leaf=val, max_depth=13,     ###
                                verbose=True, random_state=42, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))

results = pd.DataFrame(results).set_index(hyperp).sort_index()
# The plotted series is the out-of-bag score, not an MSE, so label it as such.
results[['oob_score']].plot(title=hyperp + ' oob score');

In [None]:
# rfr +maxdepth COMPLETE
# best max_features=default                                                    ###
# Compare the max_features strategies at max_depth=13, recording the out-of-bag
# score and fit time.
hyperp = 'max_features'                                                 ###
# 'auto' is no longer accepted by current scikit-learn. For a regressor it meant
# "use all features", the same as None, so it was a duplicate candidate anyway.
list_val = ['sqrt', 'log2', None]

results = []
for val in list_val:
    start_time = time.time()
    rfr = RandomForestRegressor(max_features=val, max_depth=13,     ###
                                verbose=True, random_state=42, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        # Store the label as text so the index is uniformly strings
        # (None would otherwise become a missing index entry).
        hyperp: str(val),
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))

results = pd.DataFrame(results).set_index(hyperp).sort_index()
# The plotted series is the out-of-bag score, not an MSE, so label it as such.
results[['oob_score']].plot(title=hyperp + ' oob score');

In [None]:
# rfr +maxdepth 
# best max_leaf_nodes=[4340,4346,4350]                                                    ###
# Sweep max_leaf_nodes, recording the out-of-bag score and fit time.
hyperp = 'max_leaf_nodes'                                                 ###
list_val = range(1000,6000,1000)
# list_val = range(100,1100,100)
# list_val = range(4300,4350,1)

results = []
for val in list_val:
    start_time = time.time()
    # NOTE(review): this cell uses max_depth=16 while the sibling sweeps use 13 --
    # confirm that the deeper limit is intentional here.
    rfr = RandomForestRegressor(max_leaf_nodes=val, max_depth=16,     ###
                                verbose=True, random_state=42, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
#     print(pd.DataFrame(results))

results = pd.DataFrame(results).set_index(hyperp).sort_index()
# The plotted series is the out-of-bag score, not an MSE, so label it as such.
results[['oob_score']].plot(title=hyperp + ' oob score');
results
# max_depth 10 2002	0.629012

In [None]:
# rfr +maxdepth 
# best min_impurity_decrease=default                                                    ###
# Sweep min_impurity_decrease at max_depth=13, recording the out-of-bag score
# and fit time.
hyperp = 'min_impurity_decrease'                                                 ###
list_val = [0,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6]

results = []
for val in list_val:
    start_time = time.time()
    rfr = RandomForestRegressor(min_impurity_decrease=val, max_depth=13,     ###
                                verbose=True, random_state=42, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    results.append({
        hyperp: val,
        'oob_score': rfr.oob_score_,
        'minutes': (time.time() - start_time) / 60,
    })
    # Running table, printed after every fit as progress feedback.
    print(pd.DataFrame(results))

results = pd.DataFrame(results).set_index(hyperp).sort_index()
# The plotted series is the out-of-bag score, not an MSE, so label it as such.
results[['oob_score']].plot(title=hyperp + ' oob score');