In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
import collections
%matplotlib inline

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot

# Raw DataFrame

In [2]:
# read in the Chicagov Divvy Bicycle Sharing Data csv files from Kaggle
pd.set_option('display.max_columns', 30)
filename = 'data.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,trip_id,year,month,week,day,hour,usertype,gender,starttime,stoptime,tripduration,temperature,events,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end
0,2355134,2014,6,27,0,23,Subscriber,Male,2014-06-30 23:57:00,2014-07-01 00:07:00,10.066667,68.0,tstorms,131,Lincoln Ave & Belmont Ave,41.939365,-87.668385,15.0,303,Broadway & Cornelia Ave,41.945512,-87.64598,15.0
1,2355133,2014,6,27,0,23,Subscriber,Male,2014-06-30 23:56:00,2014-07-01 00:00:00,4.383333,68.0,tstorms,282,Halsted St & Maxwell St,41.86458,-87.64693,15.0,22,May St & Taylor St,41.869482,-87.655486,15.0
2,2355130,2014,6,27,0,23,Subscriber,Male,2014-06-30 23:33:00,2014-06-30 23:35:00,2.1,68.0,tstorms,327,Sheffield Ave & Webster Ave,41.921687,-87.653714,19.0,225,Halsted St & Dickens Ave,41.919936,-87.64883,15.0
3,2355129,2014,6,27,0,23,Subscriber,Female,2014-06-30 23:26:00,2014-07-01 00:24:00,58.016667,68.0,tstorms,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0,194,State St & Wacker Dr,41.887155,-87.62775,11.0
4,2355128,2014,6,27,0,23,Subscriber,Female,2014-06-30 23:16:00,2014-06-30 23:26:00,10.633333,68.0,tstorms,320,Loomis St & Lexington St,41.872187,-87.661501,15.0,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0


# Data Cleaning and Wrangling

### Trip ID Column

In [3]:
# drop rows with duplicate data 
data = df.copy()
data = data.drop_duplicates().sort_values('trip_id')

### General cleaning steps for easier use

In [4]:
# convert start and stop times to datetimes
data['starttime'] = pd.to_datetime(data['starttime'])
data['stoptime'] = pd.to_datetime(data['stoptime'])

In [5]:
# rename day column to numerical day of week
data = data.rename(columns={'day':'num_day_of_week'})

In [6]:
# create new column with the name of the week 
data['day_of_week'] = data['starttime'].dt.weekday_name

In [7]:
# create new column with numerical day of the month
data['day'] = data['starttime'].dt.day

In [8]:
# sort columns
pd.set_option('display.max_columns', 30)
data = data[['trip_id','year','month','week','day','hour','num_day_of_week','day_of_week',
             'starttime','stoptime','tripduration','temperature','events','usertype','gender',
             'from_station_id','from_station_name','latitude_start','longitude_start','dpcapacity_start',
             'to_station_id','to_station_name','latitude_end','longitude_end','dpcapacity_end']]
data.head()

Unnamed: 0,trip_id,year,month,week,day,hour,num_day_of_week,day_of_week,starttime,stoptime,tripduration,temperature,events,usertype,gender,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end
576426,1109427,2014,1,1,1,1,2,Wednesday,2014-01-01 01:12:00,2014-01-01 01:18:00,5.766667,10.9,not clear,Subscriber,Male,240,Sheridan Rd & Irving Park Rd,41.954245,-87.654406,23.0,245,Clarendon Ave & Junior Ter,41.961004,-87.649603,15.0
576425,1109431,2014,1,1,1,1,2,Wednesday,2014-01-01 01:43:00,2014-01-01 01:53:00,10.833333,10.9,not clear,Subscriber,Male,113,Bissell St & Armitage Ave,41.91844,-87.65222,15.0,94,Clark St & Armitage Ave,41.918306,-87.636282,19.0
576424,1109432,2014,1,1,1,1,2,Wednesday,2014-01-01 01:43:00,2014-01-01 01:53:00,10.866667,10.9,not clear,Subscriber,Male,113,Bissell St & Armitage Ave,41.91844,-87.65222,15.0,94,Clark St & Armitage Ave,41.918306,-87.636282,19.0
576423,1109434,2014,1,1,1,1,2,Wednesday,2014-01-01 01:44:00,2014-01-01 01:59:00,15.266667,10.9,not clear,Subscriber,Male,50,Clark St & Congress Pkwy,41.875933,-87.630585,27.0,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0
576422,1109435,2014,1,1,1,1,2,Wednesday,2014-01-01 01:45:00,2014-01-01 02:00:00,15.066667,10.9,not clear,Subscriber,Female,50,Clark St & Congress Pkwy,41.875933,-87.630585,27.0,134,Peoria St & Jackson Blvd,41.877749,-87.649633,19.0


# Remove Negative

In [9]:
departures = data[['starttime','from_station_name','trip_id']].sort_values(['from_station_name','starttime'])
departures = departures.rename(columns={'starttime':'datetime','from_station_name':'station_name'})

In [10]:
# each departure is given a 1 value
departures['trip_counts'] = 1

In [11]:
arrivals = data[['starttime','to_station_name','trip_id']].sort_values(['to_station_name','starttime'])
arrivals = arrivals.rename(columns={'starttime':'datetime','to_station_name':'station_name'})

In [12]:
# each arrivals is given a -1 value
arrivals['trip_counts'] = -1

In [13]:
departures_and_arrivals = pd.concat([departures, arrivals])
departures_and_arrivals = departures_and_arrivals.sort_values(['station_name','datetime'])
departures_and_arrivals['date'] = departures_and_arrivals['datetime'].dt.date
departures_and_arrivals = departures_and_arrivals[['datetime','date','station_name','trip_id','trip_counts']]
departures_and_arrivals.head()

Unnamed: 0,datetime,date,station_name,trip_id,trip_counts
2146845,2015-05-17 12:41:00,2015-05-17,2112 W Peterson Ave,5197801,-1
2146213,2015-05-17 14:06:00,2015-05-17,2112 W Peterson Ave,5199604,1
2140550,2015-05-18 10:36:00,2015-05-18,2112 W Peterson Ave,5212990,-1
2140518,2015-05-18 10:47:00,2015-05-18,2112 W Peterson Ave,5213095,1
2107385,2015-05-22 16:41:00,2015-05-22,2112 W Peterson Ave,5274966,-1


In [14]:
# get running sum of departures-arrivals
departures_and_arrivals['cumulative_sum'] = departures_and_arrivals.groupby(['station_name','date'])['trip_counts'].cumsum()
departures_and_arrivals.head()


Unnamed: 0,datetime,date,station_name,trip_id,trip_counts,cumulative_sum
2146845,2015-05-17 12:41:00,2015-05-17,2112 W Peterson Ave,5197801,-1,-1
2146213,2015-05-17 14:06:00,2015-05-17,2112 W Peterson Ave,5199604,1,0
2140550,2015-05-18 10:36:00,2015-05-18,2112 W Peterson Ave,5212990,-1,-1
2140518,2015-05-18 10:47:00,2015-05-18,2112 W Peterson Ave,5213095,1,0
2107385,2015-05-22 16:41:00,2015-05-22,2112 W Peterson Ave,5274966,-1,-1


In [15]:
# maximum departures-arrivals for each station 
max_sum = departures_and_arrivals.groupby('station_name')['cumulative_sum'].max().reset_index()
max_sum.head()

Unnamed: 0,station_name,cumulative_sum
0,2112 W Peterson Ave,5
1,63rd St Beach,4
2,900 W Harrison,16
3,900 W Harrison St,15
4,Aberdeen St & Jackson Blvd,16


In [16]:
# maximum capacity at each station
max_cap = data[['from_station_name','dpcapacity_start']].drop_duplicates().groupby('from_station_name').max()
max_cap.head()

Unnamed: 0_level_0,dpcapacity_start
from_station_name,Unnamed: 1_level_1
2112 W Peterson Ave,15.0
63rd St Beach,23.0
900 W Harrison,19.0
900 W Harrison St,19.0
Aberdeen St & Jackson Blvd,15.0


In [17]:
df_merge_all = pd.merge(departures_and_arrivals, max_cap, how='left', left_on='station_name',right_on='from_station_name')


In [18]:
df_merge_all = df_merge_all.groupby(['station_name','date'])[['cumulative_sum','dpcapacity_start']].max().reset_index()
df_merge_all.head()

Unnamed: 0,station_name,date,cumulative_sum,dpcapacity_start
0,2112 W Peterson Ave,2015-05-17,0,15.0
1,2112 W Peterson Ave,2015-05-18,0,15.0
2,2112 W Peterson Ave,2015-05-22,0,15.0
3,2112 W Peterson Ave,2015-05-23,-1,15.0
4,2112 W Peterson Ave,2015-05-25,-1,15.0


In [19]:
df_merge_all['difference'] = df_merge_all['dpcapacity_start'] - df_merge_all['cumulative_sum']

In [20]:
# show all stations that exceed capacity
df_negative = df_merge_all[df_merge_all['difference']<0]
df_negative.head()

Unnamed: 0,station_name,date,cumulative_sum,dpcapacity_start,difference
3982,Aberdeen St & Jackson Blvd,2017-09-25,16,15.0,-1.0
4254,Aberdeen St & Madison St,2014-07-02,20,19.0,-1.0
4320,Aberdeen St & Madison St,2014-09-06,32,19.0,-13.0
4321,Aberdeen St & Madison St,2014-09-07,23,19.0,-4.0
4322,Aberdeen St & Madison St,2014-09-08,24,19.0,-5.0


In [21]:
data['date'] = data['starttime'].dt.date

In [22]:
badstation = df_negative['station_name'].unique()

In [23]:
data = data[~data['from_station_name'].isin(badstation)]

In [24]:
data = data[~data['to_station_name'].isin(badstation)]

In [25]:
data.head()

Unnamed: 0,trip_id,year,month,week,day,hour,num_day_of_week,day_of_week,starttime,stoptime,tripduration,temperature,events,usertype,gender,from_station_id,from_station_name,latitude_start,longitude_start,dpcapacity_start,to_station_id,to_station_name,latitude_end,longitude_end,dpcapacity_end,date
576384,1109623,2014,1,1,1,11,2,Wednesday,2014-01-01 11:50:00,2014-01-01 11:59:00,9.0,21.0,rain or snow,Subscriber,Male,60,Dayton St & North Ave,41.910578,-87.649422,19.0,307,Southport Ave & Clybourn Ave,41.921004,-87.663257,15.0,2014-01-01
576375,1109752,2014,1,1,1,14,2,Wednesday,2014-01-01 14:48:00,2014-01-01 14:52:00,3.766667,21.0,rain or snow,Subscriber,Female,14,Morgan St & 18th St,41.858086,-87.651073,15.0,14,Morgan St & 18th St,41.858086,-87.651073,15.0,2014-01-01
576362,1109825,2014,1,1,1,16,2,Wednesday,2014-01-01 16:47:00,2014-01-01 16:53:00,6.433333,21.0,rain or snow,Subscriber,Male,306,Sheridan Rd & Buena Ave,41.9584,-87.65423,15.0,114,Sheffield Ave & Addison St,41.94688,-87.65445,27.0,2014-01-01
576355,1109882,2014,1,1,1,18,2,Wednesday,2014-01-01 18:14:00,2014-01-01 18:27:00,12.716667,21.9,rain or snow,Subscriber,Male,219,Damen Ave & Cortland St,41.916027,-87.677411,11.0,307,Southport Ave & Clybourn Ave,41.921004,-87.663257,15.0,2014-01-01
576354,1109884,2014,1,1,1,18,2,Wednesday,2014-01-01 18:15:00,2014-01-01 18:27:00,12.4,21.9,rain or snow,Subscriber,Female,219,Damen Ave & Cortland St,41.916027,-87.677411,11.0,307,Southport Ave & Clybourn Ave,41.921004,-87.663257,15.0,2014-01-01


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167971 entries, 576384 to 8911710
Data columns (total 26 columns):
trip_id              1167971 non-null int64
year                 1167971 non-null int64
month                1167971 non-null int64
week                 1167971 non-null int64
day                  1167971 non-null int64
hour                 1167971 non-null int64
num_day_of_week      1167971 non-null int64
day_of_week          1167971 non-null object
starttime            1167971 non-null datetime64[ns]
stoptime             1167971 non-null datetime64[ns]
tripduration         1167971 non-null float64
temperature          1167971 non-null float64
events               1167971 non-null object
usertype             1167971 non-null object
gender               1167971 non-null object
from_station_id      1167971 non-null int64
from_station_name    1167971 non-null object
latitude_start       1167971 non-null float64
longitude_start      1167971 non-null float64
dpcapacity_star

## Departure Joined Dataframe

In [27]:
dfmodel = data.copy()

In [28]:
dfmodel = pd.get_dummies(dfmodel, columns = ['events','usertype','gender'])


In [29]:
dfmodel.head()

Unnamed: 0,trip_id,year,month,week,day,hour,num_day_of_week,day_of_week,starttime,stoptime,tripduration,temperature,from_station_id,from_station_name,latitude_start,...,latitude_end,longitude_end,dpcapacity_end,date,events_clear,events_cloudy,events_not clear,events_rain or snow,events_tstorms,events_unknown,usertype_Customer,usertype_Dependent,usertype_Subscriber,gender_Female,gender_Male
576384,1109623,2014,1,1,1,11,2,Wednesday,2014-01-01 11:50:00,2014-01-01 11:59:00,9.0,21.0,60,Dayton St & North Ave,41.910578,...,41.921004,-87.663257,15.0,2014-01-01,0,0,0,1,0,0,0,0,1,0,1
576375,1109752,2014,1,1,1,14,2,Wednesday,2014-01-01 14:48:00,2014-01-01 14:52:00,3.766667,21.0,14,Morgan St & 18th St,41.858086,...,41.858086,-87.651073,15.0,2014-01-01,0,0,0,1,0,0,0,0,1,1,0
576362,1109825,2014,1,1,1,16,2,Wednesday,2014-01-01 16:47:00,2014-01-01 16:53:00,6.433333,21.0,306,Sheridan Rd & Buena Ave,41.9584,...,41.94688,-87.65445,27.0,2014-01-01,0,0,0,1,0,0,0,0,1,0,1
576355,1109882,2014,1,1,1,18,2,Wednesday,2014-01-01 18:14:00,2014-01-01 18:27:00,12.716667,21.9,219,Damen Ave & Cortland St,41.916027,...,41.921004,-87.663257,15.0,2014-01-01,0,0,0,1,0,0,0,0,1,0,1
576354,1109884,2014,1,1,1,18,2,Wednesday,2014-01-01 18:15:00,2014-01-01 18:27:00,12.4,21.9,219,Damen Ave & Cortland St,41.916027,...,41.921004,-87.663257,15.0,2014-01-01,0,0,0,1,0,0,0,0,1,1,0


In [30]:
dfdmodel = dfmodel.groupby(['from_station_name','year','month','day','hour','num_day_of_week']).agg({'tripduration':'mean',
                                                                        'temperature':'mean'}).reset_index()
dfdmodel.head()

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature
0,2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0
1,2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0
2,2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1
3,2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1
4,2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0


#### event percentages

In [31]:
dfdevents = dfmodel.copy()
dfdevents = dfdevents[['from_station_name','year','month','day','hour','num_day_of_week','events_cloudy','events_clear','events_not clear','events_rain or snow','events_tstorms']]
dfdevents.head()

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,events_cloudy,events_clear,events_not clear,events_rain or snow,events_tstorms
576384,Dayton St & North Ave,2014,1,1,11,2,0,0,0,1,0
576375,Morgan St & 18th St,2014,1,1,14,2,0,0,0,1,0
576362,Sheridan Rd & Buena Ave,2014,1,1,16,2,0,0,0,1,0
576355,Damen Ave & Cortland St,2014,1,1,18,2,0,0,0,1,0
576354,Damen Ave & Cortland St,2014,1,1,18,2,0,0,0,1,0


In [32]:
dfdevents['sum'] = (dfdevents['events_cloudy'] +
                   dfdevents['events_clear'] +
                   dfdevents['events_not clear'] +
                   dfdevents['events_rain or snow'] + 
                   dfdevents['events_tstorms'])

In [33]:
dfdevents = dfdevents.sort_values(['from_station_name','year','month','day','hour','num_day_of_week'])

In [34]:
dfdevents = dfdevents.groupby(['from_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfdevents.head();

In [35]:
dfdevents['events_cloudy_pct'] = (dfdevents['events_cloudy'] / dfdevents['sum'])*100
dfdevents['events_clear_pct'] = (dfdevents['events_clear'] / dfdevents['sum'])*100
dfdevents['events_not clear_pct'] = (dfdevents['events_not clear'] / dfdevents['sum'])*100
dfdevents['events_rain or snow_pct'] = (dfdevents['events_rain or snow'] / dfdevents['sum'])*100
dfdevents['events_tstorms_pct'] = (dfdevents['events_tstorms'] / dfdevents['sum'])*100

In [36]:
dfdevents_pct = dfdevents[['from_station_name','year','month','day','hour','num_day_of_week','events_cloudy_pct','events_clear_pct','events_not clear_pct','events_rain or snow_pct','events_tstorms_pct']]
dfdevents_pct.head();

#### usertype percentage

In [37]:
dfdusertype = dfmodel.copy()
dfdusertype = dfdusertype[['from_station_name','year','month','day','hour','num_day_of_week','usertype_Customer','usertype_Dependent','usertype_Subscriber']]
dfdusertype.head();

In [38]:
dfdusertype['sum'] = (dfdusertype['usertype_Customer'] +
                   dfdusertype['usertype_Dependent'] +
                   dfdusertype['usertype_Subscriber'])

In [39]:
dfdusertype = dfdusertype.sort_values(['from_station_name','year','month','day','hour','num_day_of_week'])

In [40]:
dfdusertype = dfdusertype.groupby(['from_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfdusertype.head();

In [41]:
dfdusertype['usertype_Customer_pct'] = (dfdusertype['usertype_Customer'] / dfdusertype['sum'])*100
dfdusertype['usertype_Dependent_pct'] = (dfdusertype['usertype_Dependent'] / dfdusertype['sum'])*100
dfdusertype['usertype_Subscriber_pct'] = (dfdusertype['usertype_Subscriber'] / dfdusertype['sum'])*100

In [42]:
dfdusertype_pct = dfdusertype[['from_station_name','year','month','day','hour','num_day_of_week','usertype_Customer_pct','usertype_Dependent_pct','usertype_Subscriber_pct']]
dfdusertype_pct.head();

#### gender percentages

In [43]:
dfdgender = dfmodel.copy()
dfdgender = dfdgender[['from_station_name','year','month','day','hour','num_day_of_week','gender_Female','gender_Male']]
dfdgender.head();

In [44]:
dfdgender['sum'] = (dfdgender['gender_Female'] + dfdgender['gender_Male'])

In [45]:
dfdgender = dfdgender.sort_values(['from_station_name','year','month','day','hour','num_day_of_week'])

In [46]:
dfdgender = dfdgender.groupby(['from_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfdgender.head();

In [47]:
dfdgender['gender_Female_pct'] = (dfdgender['gender_Female'] / dfdgender['sum'])*100
dfdgender['gender_Male_pct'] = (dfdgender['gender_Male'] / dfdgender['sum'])*100

In [48]:
dfdgender_pct = dfdgender[['from_station_name','year','month','day','hour','num_day_of_week','gender_Female_pct','gender_Male_pct','sum']]
dfdgender_pct.head();

### Join

In [49]:
joindf = pd.merge(dfdmodel, dfdevents_pct, how='left',on=['from_station_name','year','month','day','hour','num_day_of_week'])

In [50]:
joindf = pd.merge(joindf, dfdusertype_pct, how='left', on=['from_station_name','year','month','day','hour','num_day_of_week'])

In [51]:
departure_joindf = pd.merge(joindf, dfdgender_pct, how='left', on=['from_station_name','year','month','day','hour','num_day_of_week'])

In [52]:
departure_joindf = departure_joindf.rename(columns={'sum':'trip_count'})

In [53]:
departure_joindf['year'] = departure_joindf['year'].astype(str)
departure_joindf['month'] = departure_joindf['month'].astype(str)
departure_joindf['day'] = departure_joindf['day'].astype(str)

In [54]:
departure_joindf = departure_joindf.set_index('from_station_name')

In [55]:
departure_joindf = departure_joindf.fillna(0)

In [56]:
departure_joindf.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [57]:
departure_joindf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 863687 entries, 2112 W Peterson Ave to Yates Blvd & 75th St
Data columns (total 18 columns):
year                       863687 non-null object
month                      863687 non-null object
day                        863687 non-null object
hour                       863687 non-null int64
num_day_of_week            863687 non-null int64
tripduration               863687 non-null float64
temperature                863687 non-null float64
events_cloudy_pct          863687 non-null float64
events_clear_pct           863687 non-null float64
events_not clear_pct       863687 non-null float64
events_rain or snow_pct    863687 non-null float64
events_tstorms_pct         863687 non-null float64
usertype_Customer_pct      863687 non-null float64
usertype_Dependent_pct     863687 non-null float64
usertype_Subscriber_pct    863687 non-null float64
gender_Female_pct          863687 non-null float64
gender_Male_pct            863687 non-null float64
tr

## Arrival Joined Dataframe

In [58]:
dfmodel_end = dfmodel.drop(columns=['year','month','day','hour'])

In [59]:
dfmodel_end['year'] = dfmodel_end.stoptime.dt.year
dfmodel_end['month'] = dfmodel_end.stoptime.dt.month
dfmodel_end['day'] = dfmodel_end.stoptime.dt.day
dfmodel_end['hour'] = dfmodel_end.stoptime.dt.hour
dfmodel_end['num_day_of_week'] = dfmodel_end.stoptime.dt.weekday

In [60]:
dfamodel = dfmodel_end.groupby(['to_station_name','year','month','day','hour','num_day_of_week']).agg({'tripduration':'mean',
                                                                                  'temperature':'mean'}).reset_index()
dfamodel.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature
0,2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0
1,2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0
2,2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1
3,2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9
4,2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1


In [61]:
dfamodel.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861309 entries, 0 to 861308
Data columns (total 8 columns):
to_station_name    861309 non-null object
year               861309 non-null int64
month              861309 non-null int64
day                861309 non-null int64
hour               861309 non-null int64
num_day_of_week    861309 non-null int64
tripduration       861309 non-null float64
temperature        861309 non-null float64
dtypes: float64(2), int64(5), object(1)
memory usage: 52.6+ MB


#### event percentages

In [62]:
dfaevents = dfmodel_end.copy()
dfaevents = dfaevents[['to_station_name','year','month','day','hour','num_day_of_week','events_cloudy','events_clear','events_not clear','events_rain or snow','events_tstorms']]
dfaevents.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,events_cloudy,events_clear,events_not clear,events_rain or snow,events_tstorms
576384,Southport Ave & Clybourn Ave,2014,1,1,11,2,0,0,0,1,0
576375,Morgan St & 18th St,2014,1,1,14,2,0,0,0,1,0
576362,Sheffield Ave & Addison St,2014,1,1,16,2,0,0,0,1,0
576355,Southport Ave & Clybourn Ave,2014,1,1,18,2,0,0,0,1,0
576354,Southport Ave & Clybourn Ave,2014,1,1,18,2,0,0,0,1,0


In [63]:
dfaevents['sum'] = (dfaevents['events_cloudy'] +
                   dfaevents['events_clear'] +
                   dfaevents['events_not clear'] +
                   dfaevents['events_rain or snow'] + 
                   dfaevents['events_tstorms'])

In [64]:
dfaevents = dfaevents.sort_values(['to_station_name','year','month','day','hour','num_day_of_week'])

In [65]:
dfaevents = dfaevents.groupby(['to_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfaevents.head();

In [66]:
dfaevents['events_cloudy_pct'] = (dfaevents['events_cloudy'] / dfaevents['sum'])*100
dfaevents['events_clear_pct'] = (dfaevents['events_clear'] / dfaevents['sum'])*100
dfaevents['events_not clear_pct'] = (dfaevents['events_not clear'] / dfaevents['sum'])*100
dfaevents['events_rain or snow_pct'] = (dfaevents['events_rain or snow'] / dfaevents['sum'])*100
dfaevents['events_tstorms_pct'] = (dfaevents['events_tstorms'] / dfaevents['sum'])*100

In [67]:
dfaevents_pct = dfaevents[['to_station_name','year','month','day','hour','num_day_of_week','events_cloudy_pct','events_clear_pct','events_not clear_pct','events_rain or snow_pct','events_tstorms_pct']]
dfaevents_pct.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct
0,2112 W Peterson Ave,2015,5,17,13,6,100.0,0.0,0.0,0.0,0.0
1,2112 W Peterson Ave,2015,5,18,10,0,100.0,0.0,0.0,0.0,0.0
2,2112 W Peterson Ave,2015,5,22,16,4,100.0,0.0,0.0,0.0,0.0
3,2112 W Peterson Ave,2015,5,23,12,5,100.0,0.0,0.0,0.0,0.0
4,2112 W Peterson Ave,2015,5,25,9,0,0.0,0.0,0.0,100.0,0.0


#### usertype percentage

In [68]:
dfausertype = dfmodel_end.copy()
dfausertype = dfausertype[['to_station_name','year','month','day','hour','num_day_of_week','usertype_Customer','usertype_Dependent','usertype_Subscriber']]
dfausertype.head();

In [69]:
dfausertype['sum'] = (dfausertype['usertype_Customer'] +
                   dfausertype['usertype_Dependent'] +
                   dfausertype['usertype_Subscriber'])

In [70]:
dfausertype = dfausertype.sort_values(['to_station_name','year','month','day','hour','num_day_of_week'])

In [71]:
dfausertype = dfausertype.groupby(['to_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfausertype.head();

In [72]:
dfausertype['usertype_Customer_pct'] = (dfausertype['usertype_Customer'] / dfausertype['sum'])*100
dfausertype['usertype_Dependent_pct'] = (dfausertype['usertype_Dependent'] / dfausertype['sum'])*100
dfausertype['usertype_Subscriber_pct'] = (dfausertype['usertype_Subscriber'] / dfausertype['sum'])*100

In [73]:
dfausertype_pct = dfausertype[['to_station_name','year','month','day','hour','num_day_of_week','usertype_Customer_pct','usertype_Dependent_pct','usertype_Subscriber_pct']]
dfausertype_pct.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct
0,2112 W Peterson Ave,2015,5,17,13,6,0.0,0.0,100.0
1,2112 W Peterson Ave,2015,5,18,10,0,0.0,0.0,100.0
2,2112 W Peterson Ave,2015,5,22,16,4,0.0,0.0,100.0
3,2112 W Peterson Ave,2015,5,23,12,5,0.0,0.0,100.0
4,2112 W Peterson Ave,2015,5,25,9,0,0.0,0.0,100.0


#### gender percentages

In [74]:
dfagender = dfmodel_end.copy()
dfagender = dfagender[['to_station_name','year','month','day','hour','num_day_of_week','gender_Female','gender_Male']]
dfagender.head();

In [75]:
dfagender['sum'] = (dfagender['gender_Female'] +dfagender['gender_Male'])

In [76]:
dfagender = dfagender.sort_values(['to_station_name','year','month','day','hour','num_day_of_week'])

In [77]:
dfagender = dfagender.groupby(['to_station_name','year','month','day','hour','num_day_of_week']).sum().reset_index()
dfagender.head();

In [78]:
dfagender['gender_Female_pct'] = (dfagender['gender_Female'] / dfagender['sum'])*100
dfagender['gender_Male_pct'] = (dfagender['gender_Male'] / dfagender['sum'])*100

In [79]:
dfagender_pct = dfagender[['to_station_name','year','month','day','hour','num_day_of_week','gender_Female_pct','gender_Male_pct','sum']]
dfagender_pct.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,gender_Female_pct,gender_Male_pct,sum
0,2112 W Peterson Ave,2015,5,17,13,6,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,16,4,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,23,12,5,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,25,9,0,100.0,0.0,1


### Join

In [80]:
joindf = pd.merge(dfamodel, dfaevents_pct, how='left',on=['to_station_name','year','month','day','hour','num_day_of_week'])

In [81]:
joindf = pd.merge(joindf, dfausertype_pct, how='left', on=['to_station_name','year','month','day','hour','num_day_of_week'])

In [82]:
arrival_joindf = pd.merge(joindf, dfagender_pct, how='left', on=['to_station_name','year','month','day','hour','num_day_of_week'])

In [83]:
arrival_joindf = arrival_joindf.rename(columns={'sum':'trip_count'})

In [84]:
arrival_joindf['year'] = arrival_joindf['year'].astype(str)
arrival_joindf['month'] = arrival_joindf['month'].astype(str)
arrival_joindf['day'] = arrival_joindf['day'].astype(str)

In [85]:
arrival_joindf = arrival_joindf.set_index('to_station_name')

In [86]:
arrival_joindf = arrival_joindf.fillna(0)

In [87]:
arrival_joindf.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [88]:
arrival_joindf.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 861309 entries, 2112 W Peterson Ave to Yates Blvd & 75th St
Data columns (total 18 columns):
year                       861309 non-null object
month                      861309 non-null object
day                        861309 non-null object
hour                       861309 non-null int64
num_day_of_week            861309 non-null int64
tripduration               861309 non-null float64
temperature                861309 non-null float64
events_cloudy_pct          861309 non-null float64
events_clear_pct           861309 non-null float64
events_not clear_pct       861309 non-null float64
events_rain or snow_pct    861309 non-null float64
events_tstorms_pct         861309 non-null float64
usertype_Customer_pct      861309 non-null float64
usertype_Dependent_pct     861309 non-null float64
usertype_Subscriber_pct    861309 non-null float64
gender_Female_pct          861309 non-null float64
gender_Male_pct            861309 non-null float64
tr

# Prepare for Modeling

In [89]:
dfdmodel = departure_joindf.copy()
dfdmodel.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [90]:
dfdmodel = dfdmodel.reset_index()

In [91]:
yr2014 = dfdmodel[dfdmodel['year']=='2014']
yr2015 = dfdmodel[dfdmodel['year']=='2015']
yr2016 = dfdmodel[dfdmodel['year']=='2016']
yr2017 = dfdmodel[dfdmodel['year']=='2017']

In [92]:
station2014 = yr2014['from_station_name'].unique().tolist()
station2015 = yr2015['from_station_name'].unique().tolist()
station2016 = yr2016['from_station_name'].unique().tolist()
station2017 = yr2017['from_station_name'].unique().tolist()

In [93]:
dfdmodel = dfdmodel[dfdmodel['from_station_name'].isin(station2017)]
dfdmodel = dfdmodel[dfdmodel['from_station_name'].isin(station2016)]
dfdmodel = dfdmodel[dfdmodel['from_station_name'].isin(station2015)]
dfdmodel = dfdmodel[dfdmodel['from_station_name'].isin(station2014)]


In [94]:
# create key to search the joindf
dfmodelstations = dfdmodel['from_station_name'].unique() 

In [95]:
dfdmodel = dfdmodel.set_index('from_station_name')

In [96]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in dfmodelstations:
    
    X_train = dfdmodel.loc[k][dfdmodel.loc[k]['year'].isin(['2014','2015','2016'])].drop(['trip_count'], axis=1).values
    X_test = dfdmodel.loc[k][dfdmodel.loc[k]['year']=='2017'].drop(['trip_count'], axis=1).values
    y_train = dfdmodel.loc[k][dfdmodel.loc[k]['year'].isin(['2014','2015','2016'])]['trip_count'].values
    y_test = dfdmodel.loc[k][dfdmodel.loc[k]['year']=='2017']['trip_count'].values
      
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [97]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

-6052770611863.721

In [98]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

4744.157129629633

In [99]:
depdf = pd.DataFrame({'station_name': dfmodelstations,
                      'r-squared': lmdeparture_rsquared,
                      'mean absolute error': lmdeparture_mae
                     })

In [100]:
depdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 3 columns):
station_name           108 non-null object
r-squared              108 non-null float64
mean absolute error    108 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.6+ KB


In [101]:
depdf[(depdf['r-squared']>= -1) & (depdf['r-squared']<= 1)].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
0,Adler Planetarium,0.030881,0.9
1,Ashland Ave & 13th St,0.01726,0.31
2,Ashland Ave & 21st St,-0.105971,0.35
3,Ashland Ave & Lake St,0.029425,0.58
4,Ashland Ave & Wellington Ave,0.031505,0.25
5,Blackstone Ave & Hyde Park Blvd,0.01491,0.44
6,Broadway & Argyle St,0.012098,0.55
8,Broadway & Wilson Ave,0.030094,0.57
9,California Ave & 21st St,-0.014015,0.23
10,California Ave & Division St,0.0364,0.26


In [102]:
bad_stations = depdf[(depdf['r-squared']> 1) | (depdf['r-squared']< -1)]

In [103]:
bad_stations

Unnamed: 0,station_name,r-squared,mean absolute error
7,Broadway & Berwyn Ave,-29587660000.0,3928.86
28,Damen Ave & Cortland St,-653669600000000.0,508393.79


In [104]:
dfdmodel.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Adler Planetarium,2014,1,13,13,0,18.05,43.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
Adler Planetarium,2014,1,26,14,6,17.083333,30.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
Adler Planetarium,2014,2,8,14,5,25.033333,15.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,0.0,100.0,1
Adler Planetarium,2014,2,10,15,0,26.5,9.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
Adler Planetarium,2014,2,15,15,5,13.975,21.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,50.0,50.0,2


In [105]:
dfdmodel = dfdmodel.reset_index()

In [106]:
dfdmodel = dfdmodel[~dfdmodel['from_station_name'].isin(['Broadway & Berwyn Ave','Damen Ave & Cortland St'])]


In [107]:
dfdmodel.head()

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,Adler Planetarium,2014,1,13,13,0,18.05,43.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
1,Adler Planetarium,2014,1,26,14,6,17.083333,30.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2,Adler Planetarium,2014,2,8,14,5,25.033333,15.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,Adler Planetarium,2014,2,10,15,0,26.5,9.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,Adler Planetarium,2014,2,15,15,5,13.975,21.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,50.0,50.0,2


In [108]:
# create key to search the joindf
dfmodelstations = dfdmodel['from_station_name'].unique() 

In [109]:
dfdmodel = dfdmodel.set_index('from_station_name')

In [110]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in dfmodelstations:
    
    X_train = dfdmodel.loc[k][dfdmodel.loc[k]['year'].isin(['2014','2015','2016'])].drop(['trip_count'], axis=1).values
    X_test = dfdmodel.loc[k][dfdmodel.loc[k]['year']=='2017'].drop(['trip_count'], axis=1).values
    y_train = dfdmodel.loc[k][dfdmodel.loc[k]['year'].isin(['2014','2015','2016'])]['trip_count'].values
    y_test = dfdmodel.loc[k][dfdmodel.loc[k]['year']=='2017']['trip_count'].values
      
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [111]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

0.011908044072531586

In [112]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

0.43698113207547185

In [113]:
dfamodel = arrival_joindf.copy()
dfamodel.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [114]:
dfamodel.info()

<class 'pandas.core.frame.DataFrame'>
Index: 861309 entries, 2112 W Peterson Ave to Yates Blvd & 75th St
Data columns (total 18 columns):
year                       861309 non-null object
month                      861309 non-null object
day                        861309 non-null object
hour                       861309 non-null int64
num_day_of_week            861309 non-null int64
tripduration               861309 non-null float64
temperature                861309 non-null float64
events_cloudy_pct          861309 non-null float64
events_clear_pct           861309 non-null float64
events_not clear_pct       861309 non-null float64
events_rain or snow_pct    861309 non-null float64
events_tstorms_pct         861309 non-null float64
usertype_Customer_pct      861309 non-null float64
usertype_Dependent_pct     861309 non-null float64
usertype_Subscriber_pct    861309 non-null float64
gender_Female_pct          861309 non-null float64
gender_Male_pct            861309 non-null float64
tr

In [115]:
dfamodel = dfamodel.reset_index()

In [116]:
yr2014 = dfamodel[dfamodel['year']=='2014']
yr2015 = dfamodel[dfamodel['year']=='2015']
yr2016 = dfamodel[dfamodel['year']=='2016']
yr2017 = dfamodel[dfamodel['year']=='2017']

In [117]:
station2014 = yr2014['to_station_name'].unique().tolist()
station2015 = yr2015['to_station_name'].unique().tolist()
station2016 = yr2016['to_station_name'].unique().tolist()
station2017 = yr2017['to_station_name'].unique().tolist()

In [118]:
dfamodel = dfamodel[dfamodel['to_station_name'].isin(station2017)]
dfamodel = dfamodel[dfamodel['to_station_name'].isin(station2016)]
dfamodel = dfamodel[dfamodel['to_station_name'].isin(station2015)]
dfamodel = dfamodel[dfamodel['to_station_name'].isin(station2014)]


In [119]:
# create key to search the joindf
dfmodelstations = dfamodel['to_station_name'].unique() 

In [120]:
dfamodel = dfamodel.set_index('to_station_name')

In [121]:
lmarrival_rsquared = []
lmarrival_mae = []

for k in dfmodelstations:
    
    X_train = dfamodel.loc[k][dfamodel.loc[k]['year'].isin(['2014','2015','2016'])].drop(['trip_count'], axis=1).values
    X_test = dfamodel.loc[k][dfamodel.loc[k]['year']=='2017'].drop(['trip_count'], axis=1).values
    y_train = dfamodel.loc[k][dfamodel.loc[k]['year'].isin(['2014','2015','2016'])]['trip_count'].values
    y_test = dfamodel.loc[k][dfamodel.loc[k]['year']=='2017']['trip_count'].values
      
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))
    

In [122]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

-2.185652874728572e+21

In [123]:
sum(lmarrival_mae)/len(lmarrival_mae)

121542826.56037037

In [124]:
arrdf = pd.DataFrame({'station_name': dfmodelstations,
                      'r-squared': lmarrival_rsquared,
                      'mean absolute error': lmarrival_mae
                     })

In [125]:
arrdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 3 columns):
station_name           108 non-null object
r-squared              108 non-null float64
mean absolute error    108 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.6+ KB


In [126]:
arrdf[(arrdf['r-squared']>= -1) & (arrdf['r-squared']<= 1)].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
0,Adler Planetarium,0.010176,0.87
1,Ashland Ave & 13th St,0.015909,0.32
2,Ashland Ave & 21st St,-0.064206,0.27
3,Ashland Ave & Lake St,-0.254343,0.64
4,Ashland Ave & Wellington Ave,0.026468,0.25
5,Blackstone Ave & Hyde Park Blvd,0.05605,0.43
6,Broadway & Argyle St,-0.011237,0.63
7,Broadway & Berwyn Ave,0.035809,0.73
8,Broadway & Wilson Ave,-0.080638,0.54
9,California Ave & 21st St,-0.037896,0.21


In [127]:
bad_stations = arrdf[(arrdf['r-squared']> 1) | (arrdf['r-squared']< -1)]


In [128]:
bad_stations

Unnamed: 0,station_name,r-squared,mean absolute error
22,Clinton St & Roosevelt Rd,-8.764176e+19,175477000.0
41,Green St & Randolph St,-11388670000000.0,97375.25
51,Halsted St & Willow St,-10234090000.0,1731.31
66,McCormick Place,-2.359629e+23,12951050000.0


In [129]:
bad_station = bad_stations['station_name'].tolist()

In [130]:
dfamodel.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
to_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Adler Planetarium,2014,1,11,14,5,18.116667,34.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
Adler Planetarium,2014,1,13,13,0,12.0,43.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,2
Adler Planetarium,2014,3,1,18,5,37.475,14.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,50.0,50.0,2
Adler Planetarium,2014,3,2,9,6,8.966667,9.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
Adler Planetarium,2014,3,10,14,0,19.9,53.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [131]:
dfamodel = dfamodel.reset_index()

In [132]:
dfamodel = dfamodel[~dfamodel['to_station_name'].isin(bad_station)]


In [133]:
dfamodel.head()

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,Adler Planetarium,2014,1,11,14,5,18.116667,34.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
1,Adler Planetarium,2014,1,13,13,0,12.0,43.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,2
2,Adler Planetarium,2014,3,1,18,5,37.475,14.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,50.0,50.0,2
3,Adler Planetarium,2014,3,2,9,6,8.966667,9.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,Adler Planetarium,2014,3,10,14,0,19.9,53.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [134]:
# create key to search the joindf
dfmodelstations = dfamodel['to_station_name'].unique() 

In [135]:
dfamodel = dfamodel.set_index('to_station_name')

In [136]:
lmarrival_rsquared = []
lmarrival_mae = []

for k in dfmodelstations:
    
    X_train = dfamodel.loc[k][dfamodel.loc[k]['year'].isin(['2014','2015','2016'])].drop(['trip_count'], axis=1).values
    X_test = dfamodel.loc[k][dfamodel.loc[k]['year']=='2017'].drop(['trip_count'], axis=1).values
    y_train = dfamodel.loc[k][dfamodel.loc[k]['year'].isin(['2014','2015','2016'])]['trip_count'].values
    y_test = dfamodel.loc[k][dfamodel.loc[k]['year']=='2017']['trip_count'].values
      
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))
    

In [137]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

0.01307169272508543

In [138]:
sum(lmarrival_mae)/len(lmarrival_mae)

0.44192307692307686

# Save as CSV

In [139]:
dfdmodel.to_csv('dfdmodel.csv')

In [140]:
dfamodel.to_csv('dfamodel.csv')