In [86]:
import pandas as pd
import sklearn.model_selection as ms
import sklearn
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold
import pydot
import os
from sklearn.tree import export_graphviz
import six
from sklearn import tree

In [87]:
df = pd.read_csv("workable_df.csv")

In [88]:
#Only use trips that last less than 1 hour.
df = df[df['trip_duration']<3600]
#Cast categorical variables to strings.
df.gender = df['gender'].apply(str)
df.user_type = df['user_type'].apply(str)
df.start_station_id = df['start_station_id'].apply(str)
#Median sale price has lots of N/A values. Remove this column.
df = df.drop('median_sale_price', axis = 1)
#Remove the few rows that still have N/A values. 
df = df.dropna(axis=0)
# Confirm types are as desired.
df.dtypes

trip_duration                      int64
start_month                        int64
start_hour                         int64
start_station_id                  object
user_type                         object
birth_year                         int64
gender                            object
total_precipitation_inches       float64
average_temperature_farenheit    float64
total_snowfall_inches            float64
median_rental_price              float64
week_day                          object
start_hour_sq                      int64
start_month_sq                     int64
dtype: object

In [90]:
#One hot encode dataset. 
df = pd.get_dummies(df,drop_first=True)

In [91]:
#Split data into test and training sets.
y = df['trip_duration']
X = df.drop('trip_duration', axis = 1)
feature_names = X.columns
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=74)

In [92]:
#Fit initial random forest model
rf = RandomForestRegressor(n_estimators = 3, random_state = 32)
#Train the model on training data
rf.fit(X_train, y_train);
#Find predicted trip durations in test set.
y_pred = list(rf.predict(X_test))
#MAE
sum(abs(y_pred-y_test)) / len(y_test)

444.01133646475245

In [17]:
#Show feature importances for manual feature selection.
fi_df = pd.DataFrame(list(zip(feature_names, rf.feature_importances_)))

In [18]:
fi_df.sort_values(by = 1, ascending = False)

Unnamed: 0,0,1
2,birth_year,1.114142e-01
4,average_temperature_farenheit,1.100218e-01
747,"user_type_""Subscriber""",8.574517e-02
6,median_rental_price,5.123329e-02
1,start_hour,4.706160e-02
7,start_hour_sq,4.683126e-02
3,total_precipitation_inches,4.600186e-02
0,start_month,2.185305e-02
8,start_month_sq,2.163896e-02
754,week_day_Tuesday,1.054175e-02


In [19]:
#Try model only with features with importance scores of > .01
X2 = df[['birth_year', 'average_temperature_farenheit', 'user_type_"Subscriber"', 'median_rental_price', 'start_hour', 'total_precipitation_inches', 'start_month']]
feature_names = X2.columns
X_train, X_test, y_train, y_test = ms.train_test_split(X2, y, test_size=0.2, random_state=74)

In [20]:
#Fit random forest model. Increase number of estimaters. 
rf2 = RandomForestRegressor(n_estimators = 50, random_state = 32)
#Train the model on training data
rf2.fit(X_train, y_train);
#Find predicted trip durations in test set.
y_pred = list(rf2.predict(X_test))
#MAE
sum(abs(y_pred-y_test)) / len(y_test)

428.81771548262105

In [96]:
#Fit random forest model. Optimized parameters and feature selection.
rf3 = RandomForestRegressor(n_estimators = 100, random_state = 22, oob_score = True, bootstrap = True, max_depth = 90, max_features = 'sqrt', min_samples_leaf = 3, min_samples_split = 5)
#Train the model on training data
rf3.fit(X_train, y_train);
#Find predicted trip durations in test set.
y_pred = list(rf3.predict(X_test))
train_pred = list(rf3.predict(X_train))
#MAE Test 
sum(abs(y_pred-y_test)) / len(y_test)
# MAE Train. 
#Train set error very close to test set error. We are not overfit.
sum(abs(train_pred-y_train)) / len(y_train)


372.57408978342266

In [47]:
# Final feature importance. 
fi_df = pd.DataFrame(list(zip(feature_names, rf3.feature_importances_)))
fi_df.sort_values(by = 1, ascending = False)

Unnamed: 0,0,1
1,average_temperature_farenheit,0.230666
0,birth_year,0.206453
3,median_rental_price,0.167736
2,"user_type_""Subscriber""",0.141789
4,start_hour,0.125079
5,total_precipitation_inches,0.079813
6,start_month,0.048464


In [59]:
from sklearn.tree import export_graphviz

In [95]:
# Create dot file of a tree from final forest. (Too big to visualize in final report.)
export_graphviz(rf3.estimators_[0],
                feature_names=feature_names,
                filled=True,
                rounded=True)

ValueError: Length of feature_names, 756 does not match number of features, 7