The following code finds a random tree with optimized parameters using the random forest method. The code was greatly influenced by this article: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74 

In [1]:
import pandas as pd
import sklearn.model_selection as ms
import sklearn
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv("workable_df.csv")

In [3]:
#Only use trips that last less than 1 hour.
df = df[df['trip_duration']<3600]
#Cast categorical variables to strings.
df.gender = df['gender'].apply(str)
df.user_type = df['user_type'].apply(str)
df.start_station_id = df['start_station_id'].apply(str)
#Median sale price has lots of N/A values. Remove this column.
df = df.drop('median_sale_price', axis = 1)
#Remove the few rows with N/A values. 
df = df.dropna(axis=0)
df.dtypes

trip_duration                      int64
start_month                        int64
start_hour                         int64
start_station_id                  object
user_type                         object
birth_year                         int64
gender                            object
total_precipitation_inches       float64
average_temperature_farenheit    float64
total_snowfall_inches            float64
median_rental_price              float64
week_day                          object
start_hour_sq                      int64
start_month_sq                     int64
dtype: object

In [4]:
#One hot encode dataset. 
df = pd.get_dummies(df,drop_first=True)

In [5]:
#Split data into test and training sets.
y = df['trip_duration']
X = df.drop('trip_duration', axis = 1)
feature_names = X.columns
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=74)

At first, a random forest with a limited number of n_estimators, and max_depth was ran to determine if bootstrap should be true or false and if max_features should be 'sqrt' or 'auto.' Once it was determined that it was likely that bootstrap = true and max_features = 'sqrt' is more optimal, the following forest was ran:


In [15]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 22)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [16]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)

In [17]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 94.5min remaining: 94.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 270.9min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=2, n_jobs=-1,
          param_distributions={'n_estimators': [200, 218, 236, 254, 272, 290, 309, 327, 345, 363, 381, 400, 418, 436, 454, 472, 490, 509, 527, 545, 563, 581, 600, 618, 636, 654, 672, 690, 709, 727, 745, 763, 781, 800, 818, 836, 854, 872, 890, 909, 927, 945, 963, 981, 1000, 1018, 1036, 1054, 1072, 1090, 1109, ...05, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scorin

In [18]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 1927}

In [20]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 1927}

In [22]:
newY = list(rf_random.predict(X_test))

The following is the mean absolute error of the model in minutes:

In [24]:
sum(abs(newY-y_test)) / len(y_test) / 60

6.5423852417092112