In [75]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [76]:
trips_df = pd.read_pickle('../00_data/trips_hourly_selected.pkl')

In [77]:
X = trips_df.drop(['starting_trips'], axis=1)
y = trips_df['starting_trips']

In [78]:
# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

#### Hyperparameter Grid Search
##### `max_features`
`max_features` is the number of features to consider when looking for the best
split during a tree's growth.  

##### `min_samples_leaf`
`min_samples_leaf` is the minimum number of samples required to be at a leaf
node. If a leaf has less samples than this, the corresponding split will be
removed.  

##### `min_samples_split`
`min_samples_split` is the minimum number of samples required to split a leaf
node. If a node has less samples than this, it is automatically a leaf node.  

##### `max_depth`
`max_depth` is the maximum depth of a tree.  

##### `max_leaf_nodes`
`max_leaf_nodes` is the maximum number of leaf nodes. As soon as a tree reaches
this number, it is not expanded further.  

We will not use `n_estimators` in the grid search, because we expect that model
performance simply improves with more trees. Including that parameter in a grid
search would simply yield the highest value. Therefore we will fix it to 100.  

In [79]:
estimator = RandomForestRegressor(n_estimators=100, bootstrap=True, random_state=4711)
param_grid = {
	'max_features': ['auto', 'sqrt', 'log2'],
	'min_samples_leaf': [1, 2, 4, 8],
	'min_samples_split': [2, 4, 8],
	'max_depth': [None, 10, 50, 100],
	'max_leaf_nodes': [None, 10, 50, 100],
}

In [80]:
model = GridSearchCV(
    estimator, param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1 , verbose=1
)
model.fit(X_train, y_train)


Fitting 3 folds for each of 576 candidates, totalling 1728 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=4711),
             n_jobs=-1,
             param_grid={'max_depth': [None, 10, 50, 100],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [None, 10, 50, 100],
                         'min_samples_leaf': [1, 2, 4, 8],
                         'min_samples_split': [2, 4, 8]},
             scoring='neg_mean_squared_error', verbose=1)

In [81]:
model.best_params_

{'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 2,
 'min_samples_split': 8}

As we can see when looking at `max_depth`, `max_leaf_nodes` and
`min_samples_leaf`, the hyperparameter search yielded the parameters that are
the least "constraining". We think that these parameters mainly have two purposes:
increase fitting speed and decrease overfitting. As we already used a random
forest to reduce overfitting, the grid search "set" these parameter to be the
least constraining.  
The only exception here is `min_samples_leaf`, which is set to 2, although the
minimum value for it is 1.  
`max_features` is set to the square root of the number of features, which is
the default and most common value for it.


In [82]:
best_model = model.best_estimator_

In [83]:
# evaluate the model
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")

MSE: 76.18
MAE: 5.71
