In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
import collections
%matplotlib inline

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot

# SGD

In [2]:
# read in saved csv files 
pd.set_option('display.max_columns', 30)
filename = 'departure_joindf.csv'
departure_joindf = pd.read_csv(filename)
departure_joindf.head()

Unnamed: 0,from_station_name,year,month,day,hour,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,14,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,17,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,27,15,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,27,18,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [3]:
data1 = pd.get_dummies(departure_joindf, 'from_station_name')

In [None]:
X = data1.drop(['trip_count'],axis=1).values
y = data1['trip_count'].values
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# SGD regressor
clf = linear_model.SGDRegressor(loss = "squared_loss", average=True)
clf.fit(X, y)

# Train-test split function 

## Linear Model - departure

In [2]:
# read in saved csv files 
pd.set_option('display.max_columns', 30)
filename = 'departure_joindf.csv'
departure_joindf = pd.read_csv(filename)
departure_joindf.head()

Unnamed: 0,from_station_name,year,month,day,hour,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,14,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,17,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,27,15,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,27,18,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [3]:
# create key to search the joindf
depstations = departure_joindf['from_station_name'].unique() 

In [4]:
departure_joindf = departure_joindf.set_index('from_station_name')

In [5]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in depstations:
    
    X = departure_joindf.loc[k].drop(['trip_count'],axis=1).values
    y = departure_joindf.loc[k]['trip_count'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [6]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

-4.6641096612474247e+21

In [7]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

689354442.6444508

## Linear Model - arrival

In [8]:
# read in saved csv files 
pd.set_option('display.max_columns', 30)
filename = 'arrival_joindf.csv'
arrival_joindf = pd.read_csv(filename)
arrival_joindf.head()

Unnamed: 0,to_station_name,year,month,day,hour,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
0,2112 W Peterson Ave,2015,5,17,13,25.916667,77.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
1,2112 W Peterson Ave,2015,5,18,10,9.6,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2,2112 W Peterson Ave,2015,5,22,16,11.85,62.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
3,2112 W Peterson Ave,2015,5,23,12,13.333333,75.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
4,2112 W Peterson Ave,2015,5,25,9,18.3,69.1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,100.0,0.0,1


In [9]:
# create key to search the joindf
arrstations = arrival_joindf['to_station_name'].unique() 

In [10]:
arrival_joindf = arrival_joindf.set_index('to_station_name')

In [11]:
lmarrival_rsquared = []
lmarrival_mae = []
for k in arrstations:
    
    X = arrival_joindf.loc[k].drop(['trip_count'],axis=1).values
    y = arrival_joindf.loc[k]['trip_count'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))

In [12]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

-4.152371678247943e+22

In [13]:
sum(lmarrival_mae)/len(lmarrival_mae)

808320177.8772569

## RF Model - departure

In [14]:
departure_joindf.head()

Unnamed: 0_level_0,year,month,day,hour,tripduration,temperature,events_cloudy_pct,events_clear_pct,events_not clear_pct,events_rain or snow_pct,events_tstorms_pct,usertype_Customer_pct,usertype_Dependent_pct,usertype_Subscriber_pct,gender_Female_pct,gender_Male_pct,trip_count
from_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2112 W Peterson Ave,2015,5,17,14,16.55,81.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,1
2112 W Peterson Ave,2015,5,18,10,4.166667,75.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,22,17,13.616667,60.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,15,14.3,71.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1
2112 W Peterson Ave,2015,5,27,18,15.45,72.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,1


In [15]:
rfdeparture_rsquared = []
rfdeparture_mae = []
for k in depstations:
    
    X = departure_joindf.loc[k].drop(['trip_count'],axis=1).values
    y = departure_joindf.loc[k]['trip_count'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfdeparture_rsquared.append(rf.score(X_test, y_test))
    
    rfdeparture_mae.append(round(np.mean(errors), 2))

KeyboardInterrupt: 

In [None]:
sum(rfdeparture_rsquared)/len(rfdeparture_rsquared)

In [None]:
sum(rfdeparture_mae)/len(rfdeparture_mae)

## RF Model - arrival

In [None]:
rfarrival_rsquared = []
rfarrival_mae = []

for k in arrstations:
    
    X = arrival_joindf.loc[k].drop(['trip_count'],axis=1).values
    y = arrival_joindf.loc[k]['trip_count'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfarrival_rsquared.append(rf.score(X_test, y_test))
    
    rfarrival_mae.append(round(np.mean(errors), 2))

In [None]:
sum(rfarrival_rsquared)/len(rfarrival_rsquared)

In [None]:
sum(rfarrival_mae)/len(rfarrival_mae)

# Year train-test split

In [None]:
# read in saved csv files - departure
pd.set_option('display.max_columns', 30)
filename = 'departure_joindf.csv'
departure_joindf = pd.read_csv(filename)
departure_joindf.head()

In [None]:
train = departure_joindf[departure_joindf['year'].isin([2014,2015,2016])]

In [None]:
test = departure_joindf[departure_joindf['year']==2017]

In [None]:
trainlist = train['from_station_name'].unique().tolist()

In [None]:
testlist = test['from_station_name'].unique().tolist()

In [None]:
joinedlist = set(trainlist) & set(testlist)

In [None]:
departure_joindf_inclusive = departure_joindf[departure_joindf['from_station_name'].isin(joinedlist)]

In [None]:
# read in saved csv files - arrival
pd.set_option('display.max_columns', 30)
filename = 'arrival_joindf.csv'
arrival_joindf = pd.read_csv(filename)
arrival_joindf.head()

In [None]:
train = arrival_joindf[arrival_joindf['year'].isin([2014,2015,2016])]

In [None]:
test = arrival_joindf[arrival_joindf['year']==2017]

In [None]:
trainlist = train['to_station_name'].unique().tolist()
len(trainlist)

In [None]:
testlist = test['to_station_name'].unique().tolist()
len(testlist)

In [None]:
joinedlist = set(trainlist) & set(testlist)
len(joinedlist)

In [None]:
arrival_joindf_inclusive = arrival_joindf[arrival_joindf['to_station_name'].isin(joinedlist)]

# Train-test split years

## Linear Model - departure

In [None]:
# create key to search the joindf
depstations = departure_joindf_inclusive['from_station_name'].unique() 

In [None]:
departure_joindf_inclusive = departure_joindf_inclusive.set_index('from_station_name')

In [None]:
departure_joindf_inclusive.head()

In [None]:
lmdeparture_rsquared = []
lmdeparture_mae = []

for k in depstations:
    
    X_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmdeparture_rsquared.append(lm.score(X_test, y_test))
    
    lmdeparture_mae.append(round(np.mean(errors), 2))

In [None]:
sum(lmdeparture_rsquared)/len(lmdeparture_rsquared)

In [None]:
sum(lmdeparture_mae)/len(lmdeparture_mae)

## Linear Model - arrival

In [None]:
# create key to search the joindf
arrstations = arrival_joindf_inclusive['to_station_name'].unique() 

In [None]:
arrival_joindf_inclusive = arrival_joindf_inclusive.set_index('to_station_name')

In [None]:
lmarrival_rsquared = []
lmarrival_mae = []
for k in arrstations:
    
    X_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    
    y_pred = lm.predict(X_test)
    
    errors = abs(y_pred - y_test)

    lmarrival_rsquared.append(lm.score(X_test, y_test))
    
    lmarrival_mae.append(round(np.mean(errors), 2))

In [None]:
sum(lmarrival_rsquared)/len(lmarrival_rsquared)

In [None]:
sum(lmarrival_mae)/len(lmarrival_mae)

## RF Model - departure

In [None]:
rfdeparture_rsquared = []
rfdeparture_mae = []
for k in depstations:
    
    X_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = departure_joindf_inclusive.loc[k][departure_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfdeparture_rsquared.append(rf.score(X_test, y_test))
    
    rfdeparture_mae.append(round(np.mean(errors), 2))

In [None]:
sum(rfdeparture_rsquared)/len(rfdeparture_rsquared)

In [None]:
sum(rfdeparture_mae)/len(rfdeparture_mae)

## RF Model - arrival

In [None]:
rfarrival_rsquared = []
rfarrival_mae = []

for k in arrstations:
    
    X_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])].drop(['trip_count'], axis=1).values
    X_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017].drop(['trip_count'], axis=1).values
    y_train = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year'].isin([2014,2015,2016])]['trip_count'].values
    y_test = arrival_joindf_inclusive.loc[k][arrival_joindf_inclusive.loc[k]['year']==2017]['trip_count'].values

    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    errors = abs(y_pred - y_test)

    rfarrival_rsquared.append(rf.score(X_test, y_test))
    
    rfarrival_mae.append(round(np.mean(errors), 2))

In [None]:
sum(rfarrival_rsquared)/len(rfarrival_rsquared)

In [None]:
sum(rfarrival_mae)/len(rfarrival_mae)