# Build Predictive Model

In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, roc_curve, auc, f1_score, confusion_matrix, classification_report
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor



In [2]:
# Bring in all trip data
full_df = pd.read_csv('data/trips/full.csv')

In [3]:
# Convert start and end dates to datetimes
full_df['start_date'] = pd.to_datetime(full_df['start_date'])
full_df['end_date'] = pd.to_datetime(full_df['end_date'])

In [4]:
# Set the start date as the index
full_df = full_df.set_index('start_date')

In [5]:
# Create a "Month" column
full_df['month'] = full_df.index.month

In [6]:
# Create a function to turn day of week into a binary category with weekday or weekend as options
def weekday_or_weekend(row):
    if row['day_of_week'] in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']:
        return 'Weekday'
    else:
        return 'Weekend'

In [7]:
# Apply this function to the dataframe
full_df['weekday_weekend'] = full_df.apply(weekday_or_weekend, axis=1)

In [8]:
# Convert hour into a circular time feature
full_df['hour_x']=np.sin(2.*np.pi*full_df.start_hour/24.)
full_df['hour_y']=np.cos(2.*np.pi*full_df.start_hour/24.)

In [9]:
# Convert month into a circular time feature
full_df['month_x']=np.sin(2.*np.pi*full_df.month/12.)
full_df['month_y']=np.cos(2.*np.pi*full_df.month/12.)

In [10]:
# Select columns for model dataframe
full_df = full_df[['HourlyDryBulbTemperature', 'precip_cat', 'hour_x', 'hour_y', 'month_x', 'month_y', 'weekday_weekend']]

In [11]:
# Convert rain and weekday into dummy variables
full_df = pd.get_dummies(full_df, drop_first=True, columns=['precip_cat', 'weekday_weekend'])

In [12]:
# Resample to get mean values for each hour
full_hourly_df = full_df.resample('H').mean()

In [13]:
# Resample to get count of rides each hour
ride_counts_df = full_df.resample('H').count()

In [14]:
# Add ride count to the dataframe containing averages
full_hourly_df['ride_count'] = ride_counts_df['hour_x']

In [15]:
# Replace NaNs with zeroes
full_hourly_df = full_hourly_df.fillna(0)

In [16]:
# We will train on 2018 and test on 2019
df_2018 = full_hourly_df.loc['2018-01-01':'2018-12-31']
df_2019 = full_hourly_df.loc['2019-01-01':'2019-12-31']

In [17]:
# Split into train and test
y_train = np.ravel(df_2018['ride_count'])
x_train = df_2018.drop(columns=['ride_count'])
y_test = np.ravel(df_2019['ride_count'])
x_test = df_2019.drop(columns=['ride_count'])

In [18]:
# Scale x (feature) values
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [19]:
# Set random forest parameters and fit to our data
forest = RandomForestRegressor(n_estimators=100, max_depth=20)
forest.fit(x_train_scaled, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [20]:
# View training score
training_score = forest.score(x_train_scaled, y_train)
test_score = forest.score(x_test_scaled, y_test)
print("This model's training score is {} and it's test score is {}.".format(training_score, test_score))

This model's training score is 0.9865767669369423 and it's test score is 0.8922353032618627.


In [21]:
# Set gridsearch grid to tune hyperparameters
param_grid = {
    "max_depth": [5, 10, 15, 20, 25, 30],
    "min_samples_split": [2, 5, 10],
    "n_estimators": [10, 50, 100, 200]
}

In [22]:
# Run gridsearch
rf_grid_search = GridSearchCV(forest, param_grid, cv=3, return_train_score=True)
rf_grid_search.fit(x_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=20, max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [5, 10, 15, 20, 25, 30],
                         'min_samples_split

In [23]:
# Return the best parameters
rf_grid_search.best_params_

{'max_depth': 25, 'min_samples_split': 10, 'n_estimators': 100}

In [24]:
# Re-run with the best parameters
forest = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=10)
forest.fit(x_train_scaled, y_train)
training_score = forest.score(x_train_scaled, y_train)
test_score = forest.score(x_test_scaled, y_test)
print("This model's training score is {} and it's test score is {}.".format(training_score, test_score))

This model's training score is 0.9634059249324912 and it's test score is 0.8974493622727631.
