In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
import collections
%matplotlib inline

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot

In [2]:
# read in saved csv files - departure
pd.set_option('display.max_columns', 30)
filename = 'departure_joindf.csv'
departure_joindf = pd.read_csv(filename)
departure_joindf.head()

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [3]:
train = departure_joindf[departure_joindf['year'].isin([2014,2015,2016])]

In [4]:
test = departure_joindf[departure_joindf['year']==2017]

In [5]:
trainlist = train['from_station_name'].unique().tolist()
len(trainlist)

418

In [6]:
testlist = test['from_station_name'].unique().tolist()
len(testlist)

396

In [7]:
joinedlist = set(trainlist) & set(testlist)
len(joinedlist)

379

In [8]:
departure_joindf_inclusive = departure_joindf[departure_joindf['from_station_name'].isin(joinedlist)]

In [9]:
# read in saved csv files - arrival
pd.set_option('display.max_columns', 30)
filename = 'arrival_joindf.csv'
arrival_joindf = pd.read_csv(filename)
arrival_joindf.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [10]:
train = arrival_joindf[arrival_joindf['year'].isin([2014,2015,2016])]

In [11]:
test = arrival_joindf[arrival_joindf['year']==2017]

In [12]:
trainlist = train['to_station_name'].unique().tolist()
len(trainlist)

418

In [13]:
testlist = test['to_station_name'].unique().tolist()
len(testlist)

396

In [14]:
joinedlist = set(trainlist) & set(testlist)
len(joinedlist)

379

In [15]:
arrival_joindf_inclusive = arrival_joindf[arrival_joindf['to_station_name'].isin(joinedlist)]

## Linear Model - departure

In [16]:
# create key to search the joindf
depstations = departure_joindf_inclusive['from_station_name'].unique() 

In [17]:
departure_joindf_inclusive = departure_joindf_inclusive.set_index('from_station_name')

In [18]:
departure_joindf_inclusive.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [19]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in depstations:
    
    X_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [20]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

-5.502529496046628e+21

In [21]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

131156027.94311333

In [22]:
depdf = pd.DataFrame({'station_name': depstations,
                      'r-squared': lmdeparture_rsquared,
                      'mean absolute error': lmdeparture_mae
                     })

In [23]:
depdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 3 columns):
station_name           379 non-null object
r-squared              379 non-null float64
mean absolute error    379 non-null float64
dtypes: float64(2), object(1)
memory usage: 9.0+ KB


In [24]:
depdf[(depdf['r-squared']>= -1) & (depdf['r-squared']<= 1)].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
0,2112 W Peterson Ave,-0.035604,0.1
1,63rd St Beach,0.024191,0.4
2,900 W Harrison St,0.032414,0.39
3,Adler Planetarium,0.030881,0.9
4,Albany (Kedzie) Ave & Montrose Ave,-0.082843,0.25
5,Albany Ave & 26th St,-0.042459,0.14
6,Albany Ave & Bloomingdale Ave,0.020231,0.35
7,Artesian Ave & Hubbard St,0.12531,0.98
8,Ashland Ave & 13th St,0.023375,0.3
9,Ashland Ave & 21st St,-0.105971,0.35


In [25]:
bad_stations = depdf[(depdf['r-squared']> 1) | (depdf['r-squared']< -1)]

In [26]:
bad_stations

Unnamed: 0,station_name,r-squared,mean absolute error
10,Ashland Ave & 50th St,-5.901295,0.42
22,Austin Blvd & Lake St,-16.33255,0.49
29,Broadway & Berwyn Ave,-3823195000000.0,44651.73
41,California Ave & Byron St,-5.795437e+22,7033744000.0
50,Calumet Ave & 71st St,-2.097299,0.3
64,Central Park Blvd & 5th Ave,-1.892614,0.18
71,Cicero Ave & Lake St,-1.464733,0.53
96,Cottage Grove Ave & 67th St,-1.241256e+24,17113990000.0
98,Cottage Grove Ave & 78th St,-157.179,1.25
99,Cottage Grove Ave & 83rd St,-27.23926,0.45


In [27]:
bad_station = bad_stations['station_name'].tolist()

In [28]:
yearlist = []

for k in bad_station:
    yearlist.append(departure_joindf_inclusive.loc[k]['year'].value_counts())
    
    

In [29]:
yearlist

[2017    44
 2016    19
 Name: year, dtype: int64, 2017    193
 2016    138
 Name: year, dtype: int64, 2016    2631
 2017    2623
 2015    1950
 2014     970
 Name: year, dtype: int64, 2016    592
 2017    583
 2015    331
 Name: year, dtype: int64, 2015    44
 2017    19
 2016    17
 Name: year, dtype: int64, 2017    87
 2016    40
 Name: year, dtype: int64, 2017    46
 2016    31
 Name: year, dtype: int64, 2015    148
 2017    143
 2016    140
 Name: year, dtype: int64, 2017    150
 2016     13
 Name: year, dtype: int64, 2016    120
 2017     81
 Name: year, dtype: int64, 2017    1600
 2016    1501
 2015    1223
 2014     836
 Name: year, dtype: int64, 2016    264
 2017    235
 2015    171
 Name: year, dtype: int64, 2017    2319
 2016    1914
 2015    1275
 Name: year, dtype: int64, 2017    72
 2016    14
 Name: year, dtype: int64, 2017    22
 2015    17
 2016     7
 Name: year, dtype: int64, 2017    131
 2015     37
 2016      7
 Name: year, dtype: int64, 2017    111
 2016     20
 N

In [30]:
departure_joindf_inclusive = departure_joindf_inclusive.reset_index()

In [31]:
departure_joindf_final = departure_joindf_inclusive[~departure_joindf_inclusive['from_station_name'].isin(bad_station)]


In [32]:
departure_joindf_final.head()

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [33]:
# create key to search the joindf
depstations = departure_joindf_final['from_station_name'].unique() 

In [34]:
departure_joindf_final = departure_joindf_final.set_index('from_station_name')

In [35]:
departure_joindf_final.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [36]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in depstations:
    
    X_train = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [37]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

-0.043829911581220383

In [38]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

0.3398850574712642

In [39]:
departure_joindf_final.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [40]:
#departure_joindf_final = departure_joindf_final.reset_index()

In [41]:
#departure_joindf_final.to_csv('departure_joindf_model.csv')

## Linear Model - arrival

In [42]:
# create key to search the joindf
arrstations = arrival_joindf_inclusive['to_station_name'].unique() 

In [43]:
arrival_joindf_inclusive = arrival_joindf_inclusive.set_index('to_station_name')

In [44]:
arrival_joindf_inclusive.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [45]:
lmarrival_rsquared = []
lmarrival_mae = []

for k in arrstations:
    
    X_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))

In [46]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

-6.655748470061178e+22

In [47]:
sum(lmarrival_mae)/len(lmarrival_mae)

645551274.8942214

In [48]:
arrdf = pd.DataFrame({'station_name': arrstations,
                      'r-squared': lmarrival_rsquared,
                      'mean absolute error': lmarrival_mae
                     })

In [49]:
arrdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 3 columns):
station_name           379 non-null object
r-squared              379 non-null float64
mean absolute error    379 non-null float64
dtypes: float64(2), object(1)
memory usage: 9.0+ KB


In [50]:
arrdf[(arrdf['r-squared']>= -1) & (arrdf['r-squared']<= 1)].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
0,2112 W Peterson Ave,-0.012289,0.11
1,63rd St Beach,0.022198,0.41
2,900 W Harrison St,-0.036259,0.41
3,Adler Planetarium,0.015699,0.86
4,Albany (Kedzie) Ave & Montrose Ave,0.055847,0.29
5,Albany Ave & 26th St,-0.089767,0.17
6,Albany Ave & Bloomingdale Ave,-0.07429,0.27
7,Artesian Ave & Hubbard St,0.019227,0.46
8,Ashland Ave & 13th St,0.015909,0.32
9,Ashland Ave & 21st St,-0.065223,0.27


In [51]:
bad_stations = arrdf[(arrdf['r-squared']> 1) | (arrdf['r-squared']< -1)]

In [52]:
bad_station = bad_stations['station_name'].tolist()

In [53]:
arrival_joindf_inclusive = arrival_joindf_inclusive.reset_index()

In [54]:
arrival_joindf_final = arrival_joindf_inclusive[~arrival_joindf_inclusive['to_station_name'].isin(bad_station)]


In [55]:
arrival_joindf_final.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [56]:
# create key to search the joindf
arrstations = arrival_joindf_final['to_station_name'].unique() 


In [57]:
arrival_joindf_final = arrival_joindf_final.set_index('to_station_name')


In [58]:
arrival_joindf_final.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [59]:
lmarrival_rsquared = []
lmarrival_mae = []

for k in arrstations:
    
    X_train = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))

In [60]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

-0.031089122501327355

In [61]:
sum(lmarrival_mae)/len(lmarrival_mae)

0.32894586894586864

In [62]:
arrival_joindf_final.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [63]:
arrival_joindf_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 794298 entries, 2112 W Peterson Ave to Woodlawn Ave & Lake Park Ave
Data columns (total 18 columns):
year                       794298 non-null int64
month                      794298 non-null int64
day                        794298 non-null int64
hour                       794298 non-null int64
num_day_of_week            794298 non-null int64
tripduration               794298 non-null float64
temperature                794298 non-null float64
events_cloudy_pct          794298 non-null float64
events_clear_pct           794298 non-null float64
events_not clear_pct       794298 non-null float64
events_rain or snow_pct    794298 non-null float64
events_tstorms_pct         794298 non-null float64
usertype_Customer_pct      794298 non-null float64
usertype_Dependent_pct     794298 non-null float64
usertype_Subscriber_pct    794298 non-null float64
gender_Female_pct          794298 non-null float64
gender_Male_pct            794298 non-null float

## RF Model - departure

In [64]:
departure_joindf_final.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [65]:
rfdeparture_rsquared = []
rfdeparture_mae = []
for k in depstations:
    
    X_train = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = departure_joindf_final.loc[k][departure_joindf_final.loc[k]['year']==2017]['trip_count'].values

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfdeparture_rsquared.append(rf.score(X_test, y_test))
    
    rfdeparture_mae.append(round(np.mean(errors), 2))

In [66]:
sum(rfdeparture_rsquared)/len(rfdeparture_rsquared)

0.3414229835663846

In [67]:
sum(rfdeparture_mae)/len(rfdeparture_mae)

0.19017241379310343

## RF Model - arrival

In [68]:
rfarrival_rsquared = []
rfarrival_mae = []

for k in arrstations:
    
    X_train = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = arrival_joindf_final.loc[k][arrival_joindf_final.loc[k]['year']==2017]['trip_count'].values

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfarrival_rsquared.append(rf.score(X_test, y_test))
    
    rfarrival_mae.append(round(np.mean(errors), 2))

In [69]:
sum(rfarrival_rsquared)/len(rfarrival_rsquared)

0.3425081131468862

In [70]:
sum(rfarrival_mae)/len(rfarrival_mae)

0.18404558404558402

# SGD

In [71]:
Xtraindata = departure_joindf_final[departure_joindf_final['year'].isin([2014,2015,2016])].drop(['trip_count'],axis=1)
X_train = pd.get_dummies(Xtraindata, 'from_station_name').values
X_test = departure_joindf_final[departure_joindf_final['year']==2017].drop(['trip_count'], axis=1).values

ydata = departure_joindf_final['trip_count'].values
y_train = departure_joindf_final[departure_joindf_final['year'].isin([2014,2015,2016])]['trip_count'].values
y_test = departure_joindf_final[departure_joindf_final['year']==2017]['trip_count'].values

sgd = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
sgd.fit(X_train, y_train)



SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=0.001, verbose=0,
       warm_start=False)

In [72]:
y_pred = sgd.predict(X_test)

In [73]:
mse = mean_squared_error(y_test, y_pred)

In [74]:
import math

In [75]:
import math
print("RMSE: ", math.sqrt(mse))

RMSE:  124531488648190.06


In [76]:
sgd.score(X_test,y_test)

-1.8032871541382652e+28

In [77]:
errors = abs(y_pred - y_test)