In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
import collections
%matplotlib inline

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot

# Year-based train-test split (train: 2014-2016, test: 2017)

In [2]:
# Load the saved departure table from CSV and preview its first rows.
pd.options.display.max_columns = 30  # let wide previews show up to 30 columns
filename = 'departure_joindf.csv'
departure_joindf = pd.read_csv(filepath_or_buffer=filename)
departure_joindf.head(n=5)

Unnamed: 0,from_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,events_unknown_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [3]:
# Training partition: departure rows whose year is 2014, 2015 or 2016.
dep_train_mask = departure_joindf['year'].isin([2014, 2015, 2016])
train = departure_joindf[dep_train_mask]

In [4]:
# Held-out partition: departure rows from 2017 only.
test = departure_joindf[departure_joindf['year'].eq(2017)]

In [5]:
# Distinct departure stations seen in the training partition, as a plain list.
trainlist = pd.unique(train['from_station_name']).tolist()

In [6]:
# Distinct departure stations seen in the held-out partition, as a plain list.
testlist = pd.unique(test['from_station_name']).tolist()

In [7]:
# Stations present in both the training and the held-out partition.
joinedlist = set(trainlist).intersection(testlist)

In [8]:
# Keep only the rows of stations that appear in both partitions, so every
# remaining station has training rows as well as held-out rows.
dep_in_both_splits = departure_joindf['from_station_name'].isin(joinedlist)
departure_joindf_inclusive = departure_joindf[dep_in_both_splits]

In [9]:
# Load the saved arrival table from CSV and preview its first rows.
pd.options.display.max_columns = 30  # let wide previews show up to 30 columns
filename = 'arrival_joindf.csv'
arrival_joindf = pd.read_csv(filepath_or_buffer=filename)
arrival_joindf.head(n=5)

Unnamed: 0,to_station_name,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,events_unknown_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,13,6,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,0,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,16,4,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,23,12,5,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,25,9,0,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [10]:
# Training partition: arrival rows whose year is 2014, 2015 or 2016.
arr_train_mask = arrival_joindf['year'].isin([2014, 2015, 2016])
train = arrival_joindf[arr_train_mask]

In [11]:
# Held-out partition: arrival rows from 2017 only.
test = arrival_joindf[arrival_joindf['year'].eq(2017)]

In [12]:
# Distinct arrival stations seen in the training partition, and their count.
trainlist = pd.unique(train['to_station_name']).tolist()
len(trainlist)

635

In [13]:
# Distinct arrival stations seen in the held-out partition, and their count.
testlist = pd.unique(test['to_station_name']).tolist()
len(testlist)

592

In [14]:
# Arrival stations present in both partitions, and how many there are.
joinedlist = set(trainlist).intersection(testlist)
len(joinedlist)

571

In [15]:
# Keep only the rows of arrival stations that appear in both partitions.
arr_in_both_splits = arrival_joindf['to_station_name'].isin(joinedlist)
arrival_joindf_inclusive = arrival_joindf[arr_in_both_splits]

# Models evaluated on the year-based train-test split

## Linear Model - departure

In [16]:
# Distinct station names, used as the lookup keys when searching the
# departure frame station by station.
depstations = pd.unique(departure_joindf_inclusive['from_station_name'])

In [17]:
# Index the frame by station so one station's rows can be pulled with
# .loc[name].  The guard keeps the cell safe to re-run: once the column has
# been moved into the index, a second set_index call would raise KeyError.
if 'from_station_name' in departure_joindf_inclusive.columns:
    departure_joindf_inclusive = departure_joindf_inclusive.set_index('from_station_name')

In [18]:
# Preview the first rows of the departure frame to check its index and columns.
departure_joindf_inclusive.head()

Unnamed: 0_level_0,year,month,day,hour,num_day_of_week,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,events_unknown_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2112 W Peterson Ave,2015,5,17,14,6,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,0,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,4,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,2,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,2,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [19]:
# Fit one ordinary least-squares model per departure station: train on that
# station's 2014-2016 rows, evaluate on its 2017 rows, and record the test
# R^2 and the mean absolute error (rounded to 2 decimals).
# NOTE(review): 'year' stays among the features, so every 2017 test row lies
# outside the range that column had during training - confirm this is intended.
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in depstations:

    # Look the station up once and build each year mask once, then reuse
    # the two partitions for all four arrays below.
    station_rows = departure_joindf_inclusive.loc[k]
    train_rows = station_rows[station_rows['year'].isin([2014,2015,2016])]
    test_rows = station_rows[station_rows['year']==2017]

    # Features are every column except the target, 'trip_count'.
    X_train = train_rows.drop(['trip_count'], axis=1).values
    X_test = test_rows.drop(['trip_count'], axis=1).values
    y_train = train_rows['trip_count'].values
    y_test = test_rows['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    # Absolute prediction error for each held-out row.
    errors = abs(y_pred - y_test)

    # LinearRegression.score returns R^2 on the data it is given.
    lmdeparture_rsquared.append(lm.score(X_test, y_test))

    lmdeparture_mae.append(round(np.mean(errors), 2))

In [20]:
# Gather the per-station results into one table: station name, test R^2 and
# mean absolute error, in that column order.
depdf_columns = {
    'station_name': depstations,
    'r-squared': lmdeparture_rsquared,
    'mean absolute error': lmdeparture_mae,
}
depdf = pd.DataFrame(data=depdf_columns)

In [21]:
# Summarise the results table: row count, column dtypes and non-null counts.
depdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 3 columns):
station_name           571 non-null object
r-squared              571 non-null float64
mean absolute error    571 non-null float64
dtypes: float64(2), object(1)
memory usage: 13.5+ KB


In [22]:
# Stations whose test R^2 lies within [-1, 1] (both ends inclusive); first 10.
depdf[depdf['r-squared'].between(-1, 1)].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
0,2112 W Peterson Ave,-0.030151,0.11
1,63rd St Beach,0.021783,0.4
2,900 W Harrison St,0.092323,0.99
3,Aberdeen St & Jackson Blvd,0.04272,1.16
4,Aberdeen St & Monroe St,0.073684,1.39
5,Ada St & Washington Blvd,0.050916,1.18
6,Adler Planetarium,0.061049,1.29
7,Albany (Kedzie) Ave & Montrose Ave,-0.019352,0.31
8,Albany Ave & 26th St,-0.027434,0.14
9,Albany Ave & Bloomingdale Ave,0.027592,0.71


In [23]:
# Stations whose test R^2 falls outside [-1, 1], i.e. |R^2| > 1; first 10.
depdf[depdf['r-squared'].abs() > 1].head(10)

Unnamed: 0,station_name,r-squared,mean absolute error
13,Ashland Ave & 50th St,-5.972346,0.42
33,Austin Blvd & Lake St,-14.740363,0.47
72,Calumet Ave & 71st St,-2.097299,0.3
91,Central Park Blvd & 5th Ave,-1.781696,0.16
98,Cicero Ave & Lake St,-1.464733,0.53
132,Clinton St & Jackson Blvd,-1.806284,10.42
150,Cottage Grove Ave & 78th St,-157.178995,1.25
151,Cottage Grove Ave & 83rd St,-27.239258,0.45
207,Fairfield Ave & Roosevelt Rd,-1.424826,0.43
229,Greenwood Ave & 79th St,-40.198997,1.75


In [24]:
# Fit and evaluate the linear model for a single station so its score and
# error can be inspected on their own: train on 2014-2016, test on 2017.
station_name = 'Ada St & Washington Blvd'

# Look the station up once and build each year mask once, then reuse the two
# partitions for all four arrays.
station_rows = departure_joindf_inclusive.loc[station_name]
train_rows = station_rows[station_rows['year'].isin([2014,2015,2016])]
test_rows = station_rows[station_rows['year']==2017]

# Features are every column except the target, 'trip_count'.
X_train = train_rows.drop(['trip_count'], axis=1).values
X_test = test_rows.drop(['trip_count'], axis=1).values
y_train = train_rows['trip_count'].values
y_test = test_rows['trip_count'].values


lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
# Absolute prediction error for each held-out row.
errors = abs(y_pred - y_test)

In [25]:
# R^2 of the fitted single-station model on its held-out rows.
lm.score(X_test, y_test)

0.050916327708307896

In [26]:
# Mean absolute error of the single-station model, rounded to 2 decimals.
round(np.mean(errors), 2)

1.18

In [27]:
# Unweighted mean of the per-station test R^2 values.
rsquared_total = sum(lmdeparture_rsquared)
rsquared_total / len(lmdeparture_rsquared)

-4.562831215110783e+20

In [28]:
# Unweighted mean of the per-station mean absolute errors.
mae_total = sum(lmdeparture_mae)
mae_total / len(lmdeparture_mae)

88783276.14042035