In [1]:
import pandas as pd
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
import rtree
import pickle

from sklearn.model_selection import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import *
from sklearn.metrics import mean_squared_error
from catboost import Pool, CatBoostRegressor
import statsmodels.formula.api as sm
from sklearn.metrics import *
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from run_model import *
import gmaps
gmaps.configure(api_key=secret_key.google_api_key)

In [2]:
def clean_citibike(df):
    df['Start Time'] = pd.to_datetime(df['Start Time'])
    df['Stop Time'] = pd.to_datetime(df['Stop Time'])
    
    df['DATE'] = df['Start Time'].dt.date
    
    df['day'] = df['Start Time'].dt.day
    df['month'] = df['Start Time'].dt.month
    df['year'] = df['Start Time'].dt.year
    
    df['day_of_week_name'] = df['Start Time'].dt.weekday_name
    df['day_of_week'] = df['Start Time'].dt.weekday
    
    df['time_of_day'] = df['Start Time'].dt.hour
    df["bin_time_of_day"] = df["Start Time"].apply(bin_time)
    
    grp = df.groupby(['Start Station Name', 'DATE'])

    df = grp['Bike ID'].count().reset_index()
    df = df.rename(index=str, columns={'Bike ID': "target"})
    
    df['DATE'] = pd.to_datetime(df['DATE'])
    
    df['day'] = df['DATE'].dt.day
    df['month'] = df['DATE'].dt.month
    df['year'] = df['DATE'].dt.year
    
    df['day_of_week_name'] = df['DATE'].dt.weekday_name
    df['day_of_week'] = df['DATE'].dt.weekday
    
    df['log_target'] = df['target'].apply(lambda x: np.log(x))
    
    return df

In [3]:
bike_df0 = pd.read_csv("data/citibike/201701-citibike-tripdata.csv.zip")
bike_df1 = pd.read_csv("data/citibike/201702-citibike-tripdata.csv.zip")
bike_df2 = pd.read_csv("data/citibike/201703-citibike-tripdata.csv.zip")
bike_df3 = pd.read_csv("data/citibike/201704-citibike-tripdata.csv.zip")
bike_df3 = bike_df3.rename({'tripduration':'Trip Duration', 'starttime':"Start Time", 'stoptime':"Stop Time", 'start station id': "Start Station ID",
       'start station name': "Start Station Name", 'start station latitude':" Start Station Latitude",
       'start station longitude':"Start Station Longitude", 'end station id' : "End Station ID", 'end station name':"End Station Name",
       'end station latitude':"End Station Latitude", 'end station longitude':"End Station Longitude", 'bikeid':"Bike ID", 'usertype':"User Type",
       'birth year':"Birt Year", 'gender':"Gender"}, axis=1)

bike_df4 = pd.read_csv("data/citibike/201705-citibike-tripdata.csv.zip")
bike_df4 = bike_df4.rename({'tripduration':'Trip Duration', 'starttime':"Start Time", 'stoptime':"Stop Time", 'start station id': "Start Station ID",
       'start station name': "Start Station Name", 'start station latitude':" Start Station Latitude",
       'start station longitude':"Start Station Longitude", 'end station id' : "End Station ID", 'end station name':"End Station Name",
       'end station latitude':"End Station Latitude", 'end station longitude':"End Station Longitude", 'bikeid':"Bike ID", 'usertype':"User Type",
       'birth year':"Birt Year", 'gender':"Gender"}, axis=1)

bike_df5 = pd.read_csv("data/citibike/201706-citibike-tripdata.csv.zip")
bike_df5 = bike_df5.rename({'tripduration':'Trip Duration', 'starttime':"Start Time", 'stoptime':"Stop Time", 'start station id': "Start Station ID",
       'start station name': "Start Station Name", 'start station latitude':" Start Station Latitude",
       'start station longitude':"Start Station Longitude", 'end station id' : "End Station ID", 'end station name':"End Station Name",
       'end station latitude':"End Station Latitude", 'end station longitude':"End Station Longitude", 'bikeid':"Bike ID", 'usertype':"User Type",
       'birth year':"Birt Year", 'gender':"Gender"}, axis=1)

# bike_df6 = pd.read_csv("data/citibike/201707-citibike-tripdata.csv.zip")
# bike_df7 = pd.read_csv("data/citibike/201708-citibike-tripdata.csv.zip")
# bike_df8 = pd.read_csv("data/citibike/201709-citibike-tripdata.csv.zip")
# bike_df9 = pd.read_csv("data/citibike/201710-citibike-tripdata.csv.zip")
# bike_df10 = pd.read_csv("data/citibike/201711-citibike-tripdata.csv.zip")
# bike_df11 = pd.read_csv("data/citibike/201712-citibike-tripdata.csv.zip")

In [4]:
bike_df = pd.concat([bike_df0, bike_df1, bike_df2, bike_df3, bike_df4, bike_df5])
#, bike_df6])#, bike_df7, bike_df8, bike_df9, bike_df10, bike_df11])

# df = data.sort_values('DATE')
# rows = df.shape[0]
del bike_df0
del bike_df1
del bike_df2
del bike_df3
del bike_df4
del bike_df5

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
bike_df.head()

Unnamed: 0,Start Station Latitude,Bike ID,Birt Year,Birth Year,End Station ID,End Station Latitude,End Station Longitude,End Station Name,Gender,Start Station ID,Start Station Latitude.1,Start Station Longitude,Start Station Name,Start Time,Stop Time,Trip Duration,User Type
0,,25542,,1965.0,3165,40.775794,-73.976206,Central Park West & W 72 St,2,3226,40.78275,-73.97137,W 82 St & Central Park West,2017-01-01 00:00:21,2017-01-01 00:11:41,680,Subscriber
1,,21136,,1987.0,498,40.748549,-73.988084,Broadway & W 32 St,2,3263,40.729236,-73.990868,Cooper Square & E 7 St,2017-01-01 00:00:45,2017-01-01 00:22:08,1282,Subscriber
2,,18147,,,3152,40.768737,-73.961199,3 Ave & E 71 St,0,3143,40.776829,-73.963888,5 Ave & E 78 St,2017-01-01 00:00:57,2017-01-01 00:11:46,648,Customer
3,,21211,,,3152,40.768737,-73.961199,3 Ave & E 71 St,0,3143,40.776829,-73.963888,5 Ave & E 78 St,2017-01-01 00:01:10,2017-01-01 00:11:42,631,Customer
4,,26819,,,3152,40.768737,-73.961199,3 Ave & E 71 St,0,3143,40.776829,-73.963888,5 Ave & E 78 St,2017-01-01 00:01:25,2017-01-01 00:11:47,621,Customer


In [6]:
# bike_df = bike_df.sort_values('DATE')
rows = bike_df.shape[0]
train = bike_df.head(int(rows*0.8))
test = bike_df.tail(int(rows*0.2))
del bike_df

In [7]:
train = clean_citibike(train)
test = clean_citibike(test)

In [8]:
train.head()

Unnamed: 0,Start Station Name,DATE,target,day,month,year,day_of_week_name,day_of_week,log_target
0,1 Ave & E 16 St,2017-01-01,66,1,1,2017,Sunday,6,4.189655
1,1 Ave & E 16 St,2017-01-02,54,2,1,2017,Monday,0,3.988984
2,1 Ave & E 16 St,2017-01-03,77,3,1,2017,Tuesday,1,4.343805
3,1 Ave & E 16 St,2017-01-04,136,4,1,2017,Wednesday,2,4.912655
4,1 Ave & E 16 St,2017-01-05,116,5,1,2017,Thursday,3,4.75359


## RandomForest

In [9]:
features = ['time_of_day', 'day', 'month', 'year', 'day_of_week']

In [10]:
def randomforestModel(col, label):
    rfrmodel = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    reg = rfrmodel.fit(train[col], train[label])

    training_accuracy = reg.score(train[col], train[label])
    test_accuracy = reg.score(test[col], test[label])

    log_error = mean_squared_log_error(test[label], rfrmodel.predict(test[col]))

    print("############# based on standard predict ################")
    print("R^2 on training data: %0.4f" % (training_accuracy))
    print("R^2 on test data:     %0.4f" % (test_accuracy))
    print("Log on test data:     %0.4f" % (log_error))

    y_test = test[label]
    y_pred = rfrmodel.predict(test[col])
    
#     test['log_pred'] = y_pred
    
#     test['pred'] = np.exp(y_pred)
    test['pred'] = y_pred
    
    percent_error = (abs(y_pred - y_test + 1))/abs(y_test + 1)
    print("Percent Error:" , np.mean(percent_error))

    avg_error = (abs(y_pred - y_test + 1))
    print("Avg Error:", np.mean(avg_error))

    test['percent_error'] = percent_error
    test['avg_error'] = avg_error
    

In [11]:
randomforestModel(features, 'target')

############# based on standard predict ################
R^2 on training data: 0.1760
R^2 on test data:     0.0908
Log on test data:     0.5275
Percent Error: 0.940113280538015
Avg Error: 4.344965843718392


In [12]:
test[['target', 'pred']]

Unnamed: 0,target,pred
0,14,5.882199
1,19,5.923665
2,13,6.895396
3,13,3.885961
4,12,3.303776
5,4,2.669535
6,4,2.971891
7,3,1.742978
8,4,7.879766
9,20,4.912039


## CatBoost

In [11]:
features = ['Start Station Name', 'day', 'month', 'year', 'day_of_week_name']
cat = [0]

cat_train = train[features]
cat_test = test[features]
train_pool = Pool(cat_train, train['target'], cat_features=[0, 4])
test_pool = Pool(cat_test, cat_features=[0, 4])
catmodel = CatBoostRegressor(iterations=500, depth=15)
catmodel.fit(train_pool)

0:	learn: 85.8992501	total: 439ms	remaining: 3m 38s
1:	learn: 83.8339775	total: 769ms	remaining: 3m 11s
2:	learn: 81.8302302	total: 1.09s	remaining: 3m
3:	learn: 79.8934936	total: 1.46s	remaining: 3m
4:	learn: 78.0408636	total: 1.8s	remaining: 2m 58s
5:	learn: 76.2418783	total: 2.12s	remaining: 2m 54s
6:	learn: 74.5103028	total: 2.42s	remaining: 2m 50s
7:	learn: 72.8367941	total: 2.71s	remaining: 2m 46s
8:	learn: 71.2272283	total: 3.01s	remaining: 2m 44s
9:	learn: 69.6685189	total: 3.32s	remaining: 2m 42s
10:	learn: 68.1663879	total: 3.63s	remaining: 2m 41s
11:	learn: 66.7085396	total: 3.93s	remaining: 2m 39s
12:	learn: 65.3032305	total: 4.22s	remaining: 2m 38s
13:	learn: 63.9488569	total: 4.55s	remaining: 2m 37s
14:	learn: 62.6604252	total: 4.84s	remaining: 2m 36s
15:	learn: 61.4074240	total: 5.13s	remaining: 2m 35s
16:	learn: 60.1974216	total: 5.41s	remaining: 2m 33s
17:	learn: 59.0314072	total: 5.69s	remaining: 2m 32s
18:	learn: 57.9171725	total: 6s	remaining: 2m 31s
19:	learn: 56.8

156:	learn: 31.5887157	total: 51s	remaining: 1m 51s
157:	learn: 31.5708610	total: 51.3s	remaining: 1m 51s
158:	learn: 31.5350539	total: 51.7s	remaining: 1m 50s
159:	learn: 31.5127777	total: 52s	remaining: 1m 50s
160:	learn: 31.4917476	total: 52.3s	remaining: 1m 50s
161:	learn: 31.4714901	total: 52.6s	remaining: 1m 49s
162:	learn: 31.4494623	total: 52.9s	remaining: 1m 49s
163:	learn: 31.4303338	total: 53.3s	remaining: 1m 49s
164:	learn: 31.4074738	total: 53.6s	remaining: 1m 48s
165:	learn: 31.3891375	total: 53.9s	remaining: 1m 48s
166:	learn: 31.3689936	total: 54.2s	remaining: 1m 48s
167:	learn: 31.3416101	total: 54.5s	remaining: 1m 47s
168:	learn: 31.3042822	total: 54.8s	remaining: 1m 47s
169:	learn: 31.2941576	total: 55.2s	remaining: 1m 47s
170:	learn: 31.2845136	total: 55.5s	remaining: 1m 46s
171:	learn: 31.2482941	total: 55.9s	remaining: 1m 46s
172:	learn: 31.2180556	total: 56.2s	remaining: 1m 46s
173:	learn: 31.2029800	total: 56.5s	remaining: 1m 45s
174:	learn: 31.1876411	total: 56

307:	learn: 29.1842769	total: 1m 41s	remaining: 1m 3s
308:	learn: 29.1774711	total: 1m 42s	remaining: 1m 3s
309:	learn: 29.1668467	total: 1m 42s	remaining: 1m 2s
310:	learn: 29.1474848	total: 1m 42s	remaining: 1m 2s
311:	learn: 29.1416594	total: 1m 43s	remaining: 1m 2s
312:	learn: 29.1280463	total: 1m 43s	remaining: 1m 1s
313:	learn: 29.1078670	total: 1m 44s	remaining: 1m 1s
314:	learn: 29.0936592	total: 1m 44s	remaining: 1m 1s
315:	learn: 29.0708983	total: 1m 44s	remaining: 1m 1s
316:	learn: 29.0533427	total: 1m 45s	remaining: 1m
317:	learn: 29.0432322	total: 1m 45s	remaining: 1m
318:	learn: 29.0166992	total: 1m 45s	remaining: 1m
319:	learn: 29.0071720	total: 1m 46s	remaining: 59.8s
320:	learn: 28.9995597	total: 1m 46s	remaining: 59.4s
321:	learn: 28.9872994	total: 1m 46s	remaining: 59.1s
322:	learn: 28.9730533	total: 1m 47s	remaining: 58.8s
323:	learn: 28.9613458	total: 1m 47s	remaining: 58.5s
324:	learn: 28.9521252	total: 1m 48s	remaining: 58.2s
325:	learn: 28.9288850	total: 1m 48s	

460:	learn: 27.1638461	total: 2m 34s	remaining: 13.1s
461:	learn: 27.1514829	total: 2m 34s	remaining: 12.7s
462:	learn: 27.1427273	total: 2m 35s	remaining: 12.4s
463:	learn: 27.1361059	total: 2m 35s	remaining: 12.1s
464:	learn: 27.1174778	total: 2m 35s	remaining: 11.7s
465:	learn: 27.1132196	total: 2m 36s	remaining: 11.4s
466:	learn: 27.1016993	total: 2m 36s	remaining: 11.1s
467:	learn: 27.0889817	total: 2m 36s	remaining: 10.7s
468:	learn: 27.0695004	total: 2m 37s	remaining: 10.4s
469:	learn: 27.0578959	total: 2m 37s	remaining: 10.1s
470:	learn: 27.0468265	total: 2m 37s	remaining: 9.73s
471:	learn: 27.0401028	total: 2m 38s	remaining: 9.39s
472:	learn: 27.0198854	total: 2m 38s	remaining: 9.06s
473:	learn: 27.0077462	total: 2m 38s	remaining: 8.72s
474:	learn: 26.9928484	total: 2m 39s	remaining: 8.39s
475:	learn: 26.9727209	total: 2m 39s	remaining: 8.05s
476:	learn: 26.9628405	total: 2m 40s	remaining: 7.72s
477:	learn: 26.9540370	total: 2m 40s	remaining: 7.38s
478:	learn: 26.9384390	total

<catboost.core.CatBoostRegressor at 0x7f278d23c518>

In [None]:
# n_estimators_range = range(1,100)
# min_samples_split_range = range(2,50)
# min_samples_leaf_range = range(2,50)
# max_depth_range = range(1,20)

# params = {'depth':[3,1,2,6,4,5,7,8,9,10],
#           'iterations':[250,100,500,1000],
#           'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
#           'l2_leaf_reg':[3,1,5,10,100],
#           'border_count':[32,5,10,20,50,100,200],
#           'ctr_border_count':[50,5,10,20,100,200],
#           'thread_count':4}

# search = RandomizedSearchCV(catmodel, params)
# search.fit(cat_train, train['target'], cat_features=[0, 5])
# # clf.fit(train, np.ravel(labels), cat_features=cat_dims)

In [14]:
y_pred = catmodel.predict(test_pool)
y_test = test['target']
test['pred']= y_pred
# test['log_pred'] = y_pred

# test['pred'] = np.exp(y_pred)
# y_pred = test['pred']a



# log_error = mean_squared_log_error(y_test, y_pred)

print("############# based on standard predict ################")

# print("Log on test data:     %0.4f" % (log_error))

percent_error = (abs(y_pred - y_test + 1))/abs(y_test + 1)
print("Percent Error:" , np.mean(percent_error))

avg_error = (abs(y_pred - y_test + 1))
print("Avg Error:", np.mean(avg_error))

test['percent_error'] = percent_error
test['avg_error'] = avg_error
# catmodel.score(test['target'], test['pred'])

############# based on standard predict ################
Percent Error: 0.40747672327891127
Avg Error: 35.08084478329946


In [15]:
test[['target', 'pred']]

Unnamed: 0,target,pred
0,79,141.872739
1,188,156.580952
2,244,164.755496
3,231,53.286941
4,212,110.292234
5,181,226.004621
6,235,226.438310
7,239,199.531690
8,201,209.400191
9,232,168.347403
