# Predicting demand of Uber Service
## Project 2: Modeling
## Jose Oros, Annamali Kathir

In [1]:
#import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error, mean_squared_error


%matplotlib inline

In [2]:
trip_15_all = pd.read_csv('trip_15_all.csv')

In [3]:
trip_14_sub = pd.read_csv('trip_14_sub.csv')

We are going to predict the number of trips for a given time, day of the week and nta_code. Our baseline model will be the actual average of trips.

In [4]:
trips_all = pd.concat([trip_15_all, trip_14_sub])

In [5]:
trips_all.shape

(14305848, 11)

In [6]:
trips_all.to_csv('uber_trip.csv')

In [6]:
trips_all.columns

Index(['Unnamed: 0', 'pickup_datetime', 'nta_code', 'time', 'hour', 'day_week',
       'day_month', 'day_year', 'month', 'date_only', 'trip'],
      dtype='object')

In [7]:
#all data
trips_all['pickup_datetime'] = pd.to_datetime(trips_all['pickup_datetime'])
trips_all['time'] = trips_all.pickup_datetime.dt.time
trips_all['hour'] = trips_all.pickup_datetime.dt.hour
trips_all['day_week'] = trips_all.pickup_datetime.dt.dayofweek
trips_all['day_month'] = trips_all.pickup_datetime.dt.day
trips_all['day_year'] = trips_all.pickup_datetime.dt.dayofyear
trips_all['month'] = trips_all.pickup_datetime.dt.month
trips_all['date_only'] = trips_all.pickup_datetime.dt.date
trips_all['week_year'] = trips_all.pickup_datetime.dt.weekofyear

In [9]:
trips_aggregated = trips_all.groupby(['nta_code','week_year', 'day_week','hour']).agg({'trip': np.sum})

In [10]:
flatten = trips_aggregated.reset_index()

In [11]:
flatten.shape

(598510, 5)

In [12]:
## Baseline Model

In [13]:
base_model = flatten.groupby(['nta_code','day_week', 'hour']).agg({'trip':np.average}).reset_index()

In [14]:
base_model.shape

(30718, 4)

We are going to test the baseline model:

In [15]:
base_model.head()

Unnamed: 0,nta_code,day_week,hour,trip
0,BK09,0,0,12.461538
1,BK09,0,1,7.0
2,BK09,0,2,3.958333
3,BK09,0,3,2.130435
4,BK09,0,4,5.777778


In [16]:
#create data set for prediction

In [17]:
flatten.columns

Index(['nta_code', 'week_year', 'day_week', 'hour', 'trip'], dtype='object')

In [18]:
all_trip2 = flatten.drop(['week_year'], axis=1)

In [19]:
all_trip2.shape

(598510, 4)

Now we want to create a baseline model that we can use to evaluate the models

In [20]:
#merge the data sets for predictions
base_df = pd.merge(all_trip2, base_model, how='inner', on=['nta_code','day_week','hour'], suffixes=('_real','_avg'))

In [21]:
#calculate squared error for each prediction of the baseline model
base_df['mse'] = ((base_df['trip_avg'] - base_df['trip_real']) ** 2)

## Regression Models

In [51]:
#create dummy variables
#all_trip2 = flatten.drop(['week_year'], axis=1)
nta_dummies = pd.get_dummies(base_df['nta_code'])
all_trip3 = pd.concat([base_df,nta_dummies], axis=1)
all_trip3 = all_trip3.drop(['nta_code'], axis=1)

In [52]:
all_trip3.head()

Unnamed: 0,day_week,hour,trip_real,trip_avg,mse,BK09,BK17,BK19,BK21,BK23,...,SI25,SI28,SI32,SI35,SI36,SI37,SI45,SI48,SI54,SI99
0,3,0,50,14.153846,1284.946746,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0,11,14.153846,9.946746,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,12,14.153846,4.639053,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,8,14.153846,37.869822,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0,11,14.153846,9.946746,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
#split into feature and outcome data set
x = all_trip3.drop('trip_real', axis=1)
y = all_trip3.trip_real
x.shape, y.shape

((598510, 197), (598510,))

In [54]:
#split in train and test data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20)

In [55]:
x_train.shape

(478808, 197)

In [56]:
y_test.shape

(119702,)

### Linear regression

Now, we are going to run a regression model over the trips

In [57]:
#create regressor
x_train1 = x_train.drop(['trip_avg','mse'], axis=1)
x_test1 = x_test.drop(['trip_avg','mse'], axis=1)
regr = LinearRegression()
regr.fit(x_train1, y_train)
predicted = regr.predict(x_test1)

Evaluate the model:

In [58]:
# The coefficients
#print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error:")
print(np.mean((predicted - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('R2:')
print(regr.score(x_test1, y_test))

Mean squared error:
2669.3841931557376
R2:
0.488268304067


## Random Forest Regression

In [59]:
#Create regressor
rf_regr = RandomForestRegressor()
rf_regr.fit(x_train1, y_train)
predicted = rf_regr.predict(x_test1)

Evaluate the model

In [60]:
#Evaluate the model
#print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error:")
print(np.mean((predicted - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('R2:')
print(rf_regr.score(x_test1, y_test))

Mean squared error:
1817.253584188504
R2:
0.651625172217


In [61]:
#Baseline model comparison

In [62]:
np.mean(x_test['mse'])

1702.1726203550013