In [1]:
import numpy as np
import pandas as pd

## Hyperparameter to play with

1. **n_estimator** 

sets the number of decision trees to be used in the forest.

**`[100, 120, 300, 500, 800, 1200]`**

2. **max_depth**  

set the max depth of the tree.

If not set then there is no cap. The tree will keep expanding until all leaves are pure.

Limiting the depth is good for pruning trees to prevent over-fitting on noisy data.

**`[5, 8, 15, 25, 30, None]`**

3. **max_features**

set the number of features to consider for the best node split

Default is “auto”, which means that the square root of the number of features is used for every split in the tree.

“None” means that all features are used for each split.

Each decision tree in the random forest will typically use a random subset of features for splitting.

**`[log2, sqrt, auto, None]`**

4. **min_samples_split**

The minimum number of samples needed before a split (differentiation) is made in an internal node

**`[1,2,5,10,15,100]`**

5. **min_samples_leafs**

The minimum number of samples needed to create a leaf (decision) node.

Default is 1. This means that a split point at any depth will only be allowed if there is at least 1 sample for each path.

**`[1,2,5,10]`**

In [2]:
def evaluate_preds(y_true, y_preds):
    r2 = r2_score(y_true, y_preds)
    MAE = mean_absolute_error(y_true, y_preds)
    MSE = mean_squared_error(y_true, y_preds)
    metric_dict = {"r2_score": round(r2, 2),
                   "mean_absolute_error": round(MAE,2),
                   "mean_squared_error": round(MSE,2)}
    print(f"r2_score: {r2 * 100:2f}%")
    print(f"mean_absolute_error: {MAE:.2f}")
    print(f"mean_squared_error: {MSE:.2f}")
    
    return metric_dict

In [3]:
# imports boston dataset
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

boston = load_boston()
bostondf = pd.DataFrame(boston.data)
bostondf.columns = boston.feature_names
bostondf['PRICE'] = boston.target

bostondf.to_csv("boston.csv", index=False,)

In [4]:
# separating X features and y target
bostondf = pd.read_csv('boston.csv')
X = bostondf.drop(columns='PRICE')
y = bostondf['PRICE']

In [5]:
from sklearn.model_selection import train_test_split

# generate seed
np.random.seed(42)

# splitting to training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.ensemble import RandomForestRegressor

# instantiating the model
reg = RandomForestRegressor()

# fitting and scoring the data
reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.873969014117403

In [6]:
reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
y_pred = reg.predict(X_test)
evaluate_preds(y_test, y_pred)

r2_score: 87.396901%
mean_absolute_error: 2.12
mean_squared_error: 9.24


{'r2_score': 0.87, 'mean_absolute_error': 2.12, 'mean_squared_error': 9.24}

In [8]:
np.random.seed(42)
from sklearn.model_selection import RandomizedSearchCV
param_grid = {"n_estimators":[100,500,1000],
              "max_depth":[5,8,15,25,None],
              "max_features":["auto"],
              "min_samples_split":[2,4,5]              
}
# n_jobs=1 ; pertains to the amount of core processor to be used
reg = RandomForestRegressor(n_jobs=1)

# setup RandomizedSearchCV
rs_reg = RandomizedSearchCV(estimator=reg,
                            param_distributions=param_grid,
                            n_iter=10,
                            cv=5,
                            verbose=2)

# fit randomized search to the training data
rs_reg.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None, total=   0.3s
[CV] n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None, total=   0.3s
[CV] n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None 
[CV]  n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None, total=   0.3s
[CV] n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None 
[CV]  n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None, total=   0.3s
[CV] n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None 
[CV]  n_estimators=100, min_samples_split=4, max_features=auto, max_depth=None, total=   0.3s
[CV] n_estimators=500, min_samples_split=5, max_features=auto, max_depth=15 
[CV]  n_estimators=500, min_samples_split=5, max_features=auto, max_depth=15, total=   1.4s
[CV] n_estimators=500, min_samples_split=5, max_features=auto, max_depth=15 
[CV]  n_estimators=500, min_samples_split=5, max_features=auto, max_depth=15, total=   1.3s
[CV] n_estimators=500, min_samples_split=5, max_f

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.2min finished


In [9]:
rs_reg.best_params_

{'n_estimators': 1000,
 'min_samples_split': 4,
 'max_features': 'auto',
 'max_depth': None}

In [10]:
rs_reg.score(X_test,y_test)

0.8812693028654112

In [11]:
rs_y_preds = rs_reg.predict(X_test)
rs_metrics = evaluate_preds(y_test, rs_y_preds)

r2_score: 88.126930%
mean_absolute_error: 2.08
mean_squared_error: 8.71


## Putting it all together with Pipeline

In [12]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# generate seed
np.random.seed(42)

# import dataset
boston_df = pd.read_csv("boston.csv")

# create a pipeline
model = Pipeline(steps=[("model", RandomForestRegressor())])

X = boston_df.drop(columns="PRICE")
y = boston_df["PRICE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.873969014117403

In [13]:
pipe_grid = {"model__n_estimators":[100,500,1000],
             "model__max_depth":[5,8,15,25,None],
             "model__max_features":["auto"],
             "model__min_samples_split":[2,4,5],         
}
gs_model = RandomizedSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None, total=   2.6s
[CV] model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s


[CV]  model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None, total=   2.8s
[CV] model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None, total=   2.9s
[CV] model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None, total=   3.0s
[CV] model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=1000, model__min_samples_split=4, model__max_features=auto, model__max_depth=None, total=   2.8s
[CV] model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=5 
[CV]  model__n_estimators=500, model__min_samples_split=5, model__max_features=a

[CV]  model__n_estimators=500, model__min_samples_split=2, model__max_features=auto, model__max_depth=8, total=   1.3s
[CV] model__n_estimators=500, model__min_samples_split=2, model__max_features=auto, model__max_depth=8 
[CV]  model__n_estimators=500, model__min_samples_split=2, model__max_features=auto, model__max_depth=8, total=   1.2s
[CV] model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=None, total=   1.5s
[CV] model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=None, total=   1.5s
[CV] model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__max_depth=None 
[CV]  model__n_estimators=500, model__min_samples_split=5, model__max_features=auto, model__m

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('model',
                                              RandomForestRegressor(bootstrap=True,
                                                                    ccp_alpha=0.0,
                                                                    criterion='mse',
                                                                    max_depth=None,
                                                                    max_features='auto',
                                                                    max_leaf_nodes=None,
                                                                    max_samples=None,
                                                                    min_impurity_decrease=0.0,
                                                                    min_impurity_split=None,
                                                                

In [14]:
gs_model.score(X_test,y_test)

0.8801036057929125

In [15]:
evaluate_preds(y_test, y_pred)

r2_score: 87.396901%
mean_absolute_error: 2.12
mean_squared_error: 9.24


{'r2_score': 0.87, 'mean_absolute_error': 2.12, 'mean_squared_error': 9.24}

## Saving and loading model

In [16]:
from joblib import dump, load

# save model to file
dump(gs_model, filename="bostondf_rgs_regression.joblib")

['bostondf_rgs_regression.joblib']

In [17]:
# import joblib model
loaded_job_model = load(filename="bostondf_rgs_regression.joblib")

In [18]:
joblib_y_preds = loaded_job_model.predict(X_test)
evaluate_preds(y_test,joblib_y_preds)

r2_score: 88.010361%
mean_absolute_error: 2.10
mean_squared_error: 8.79


{'r2_score': 0.88, 'mean_absolute_error': 2.1, 'mean_squared_error': 8.79}

### Sources and References:

https://towardsdatascience.com/hyper-parameter-tuning-and-model-selection-like-a-movie-star-a884b8ee8d68

https://www.udemy.com/course/complete-machine-learning-and-data-science-zero-to-mastery/

https://www.kaggle.com/ronitf/heart-disease-uci