In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Import WoE dataset

In [2]:
df = pd.read_csv("./data/woe_df.csv")

# Split dataset

In [3]:
y = df["TARGET"]
X = df.drop(["TARGET"], axis=1)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, random_state=123, stratify=y, test_size=0.3
)

# Random Forest

In [10]:
max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
n_estimators = [int(x) for x in np.linspace(start=600, stop=700, num=10)]
# Create the random grid
random_grid = {"n_estimators": n_estimators, "max_depth": max_depth}

print(random_grid)

{'n_estimators': [600, 611, 622, 633, 644, 655, 666, 677, 688, 700], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state=123)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    random_state=123,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=123),
                   n_jobs=-1,
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10],
                                        'n_estimators': [600, 611, 622, 633,
                                                         644, 655, 666, 677,
                                                         688, 700]},
                   random_state=123, scoring='accuracy', verbose=2)

In [12]:
rf_random.best_params_

{'n_estimators': 688, 'max_depth': 1}

In [13]:
rf_random.cv_results_

{'mean_fit_time': array([ 94.16546122, 529.54444949, 589.47017217, 257.62261136,
        479.26249321,  91.00634034,  99.46686538, 368.05287552,
        394.96002007,  68.91786822]),
 'std_fit_time': array([ 0.59810739, 10.94101483, 11.1619623 ,  5.8201296 , 17.88378126,
         5.58721241,  1.27473886,  3.10476695, 38.46927719, 11.46439463]),
 'mean_score_time': array([ 3.33639661, 12.98522822, 14.10733398,  7.10561601, 10.15225061,
         3.77348073,  3.77138186,  7.18365097,  7.45250893,  3.37492657]),
 'std_score_time': array([0.08277075, 1.31629274, 0.49137881, 0.5748234 , 1.20791991,
        0.4358303 , 0.14465573, 0.47802711, 1.3186248 , 0.20427277]),
 'param_n_estimators': masked_array(data=[688, 600, 622, 688, 633, 600, 655, 600, 611, 644],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[1, 8, 9, 3, 7, 1, 1, 6, 9, 1],
    