In [1]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import numpy as np

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from config import config

In [2]:
config=config()

In [3]:
print(config)

{'host': 'localhost', 'database': 'postgres', 'user': 'postgres', 'password': 'super_pw'}


In [2]:
engine=create_engine("postgresql://postgres:super_pw@localhost/postgres")

In [3]:
sql="SELECT lt.daystamp, lt.trip_id, lt.stoppoint_id,lt.departure_time_p,lt.departure_time_a,trips.departure_time_p,trips.departure_time_a,line_id,route_id,weather_main,temp FROM leavetimes AS lt, trips, weather WHERE trips.line_id='46A' AND trips.direction=1 AND lt.daystamp = trips.daystamp AND lt.trip_id = trips.trip_id AND lt.weather_id = weather.daytime"

In [4]:
df = pd.read_sql(sql,engine)

In [5]:
df.shape

(846260, 11)

In [6]:
df.head()

Unnamed: 0,daystamp,trip_id,stoppoint_id,departure_time_p,departure_time_a,departure_time_p.1,departure_time_a.1,line_id,route_id,weather_main,temp
0,1517961600,6253924,807,60960,60926,60960,60926.0,46A,74,Rain,3.08
1,1517961600,6253924,808,61053,61036,60960,60926.0,46A,74,Rain,3.08
2,1517961600,6253924,809,61107,61114,60960,60926.0,46A,74,Rain,3.08
3,1517961600,6253924,810,61193,61215,60960,60926.0,46A,74,Rain,3.08
4,1517961600,6253924,811,61246,61254,60960,60926.0,46A,74,Rain,3.08


In [7]:
df.columns=["daystamp","trip_id","stop_id","dep_p","dep_a","start_p","start_a","line_id","route_id","weather","temp"]

In [8]:
df.head()

Unnamed: 0,daystamp,trip_id,stop_id,dep_p,dep_a,start_p,start_a,line_id,route_id,weather,temp
0,1517961600,6253924,807,60960,60926,60960,60926.0,46A,74,Rain,3.08
1,1517961600,6253924,808,61053,61036,60960,60926.0,46A,74,Rain,3.08
2,1517961600,6253924,809,61107,61114,60960,60926.0,46A,74,Rain,3.08
3,1517961600,6253924,810,61193,61215,60960,60926.0,46A,74,Rain,3.08
4,1517961600,6253924,811,61246,61254,60960,60926.0,46A,74,Rain,3.08


In [9]:
def daystamp_converter(time):
    date=datetime.fromtimestamp(time)
    return (date.weekday(),date.month,date.hour)

In [10]:
df["datetime"]=df.daystamp.values+df.dep_p.values

In [11]:
df["weekday"],df["month"],df["hour"]=zip(*df['datetime'].apply(daystamp_converter))

In [12]:
df.head()

Unnamed: 0,daystamp,trip_id,stop_id,dep_p,dep_a,start_p,start_a,line_id,route_id,weather,temp,datetime,weekday,month,hour
0,1517961600,6253924,807,60960,60926,60960,60926.0,46A,74,Rain,3.08,1518022560,2,2,16
1,1517961600,6253924,808,61053,61036,60960,60926.0,46A,74,Rain,3.08,1518022653,2,2,16
2,1517961600,6253924,809,61107,61114,60960,60926.0,46A,74,Rain,3.08,1518022707,2,2,16
3,1517961600,6253924,810,61193,61215,60960,60926.0,46A,74,Rain,3.08,1518022793,2,2,16
4,1517961600,6253924,811,61246,61254,60960,60926.0,46A,74,Rain,3.08,1518022846,2,2,17


In [13]:
df["dur_s"]=df.dep_p.values-df.start_p.values
df["dur_a"]=df.dep_a.values-df.start_a.values

In [14]:
df_ml=df.drop(["daystamp","trip_id","dep_p","dep_a","start_p","start_a","datetime"],axis=1)

Filter out unusual routes:

In [15]:
route_counts=df_ml.route_id.value_counts()
indices=route_counts.index
values=route_counts.values
cum_value=0
size=df_ml.shape[0]
index=1
for value in values:
    cum_value+=value
    ratio=cum_value/size
    print(f"{index} most common route(s) cover {ratio*100:.2f}% of the routes.")
    if(ratio>0.8):
        break
    index+=1

1 most common route(s) cover 97.22% of the routes.


In [16]:
routes=indices[:index]
routes

Int64Index([74], dtype='int64')

In [17]:
df_ml=df_ml[df_ml.route_id.isin(routes)]

In [18]:
df_ml.shape

(822694, 10)

Check for null values

In [19]:
df_ml.isnull().sum()

stop_id         0
line_id         0
route_id        0
weather         0
temp            0
weekday         0
month           0
hour            0
dur_s           0
dur_a       18204
dtype: int64

In [20]:
df_ml.isnull().sum().sum()/df_ml.shape[0]

0.022127303711951225

In [21]:
df_ml = df_ml.dropna(axis = 0, how ='any') 

See how resulting stops line up:

In [22]:
stop_counts=df_ml.stop_id.value_counts()

In [23]:
stop_counts

807     14066
81      13857
813     13846
810     13846
811     13845
812     13842
809     13841
814     13839
818     13813
817     13803
819     13801
264     13785
758     13784
757     13775
808     13773
756     13767
847     13763
759     13759
848     13759
334     13758
406     13752
2795    13744
846     13729
6059    13720
747     13705
842     13685
845     13678
2007    13636
2009    13635
435     13621
2010    13619
2008    13619
7353    13578
760     13571
4571    13531
4636    13521
2013    13520
2016    13514
2020    13512
2014    13511
2017    13508
2018    13506
2039    13502
2022    13500
4565    13498
4567    13497
2015    13496
4566    13496
2019    13495
2021    13492
2032    13491
2031    13483
2034    13479
2035    13469
2033    13468
763     13446
762     13421
2036    13355
761     13165
Name: stop_id, dtype: int64

Finally, drop line_id and route_id (both have cardinality 1 at this point)

In [24]:
df_ml=df_ml.drop(["line_id","route_id"],axis=1)

In [25]:
df_ml.head()

Unnamed: 0,stop_id,weather,temp,weekday,month,hour,dur_s,dur_a
0,807,Rain,3.08,2,2,16,0,0.0
1,808,Rain,3.08,2,2,16,93,110.0
2,809,Rain,3.08,2,2,16,147,188.0
3,810,Rain,3.08,2,2,16,233,289.0
4,811,Rain,3.08,2,2,17,286,328.0


In [26]:
df_ml_810=df[df.stop_id==810]

In [31]:
df_ml_810

Unnamed: 0,daystamp,trip_id,stop_id,dep_p,dep_a,start_p,start_a,line_id,route_id,weather,temp,datetime,weekday,month,hour,dur_s,dur_a
3,1517961600,6253924,810,61193,61215,60960,60926.0,46A,74,Rain,3.08,1518022793,2,2,16,233,289.0
62,1517961600,6262164,810,24694,24930,24480,24697.0,46A,74,Clouds,-1.60,1517986294,2,2,6,214,233.0
121,1520726400,6397655,810,83817,83837,83700,83653.0,46A,74,Rain,6.99,1520810217,6,3,23,117,184.0
180,1520726400,6392156,810,77517,77506,77400,77400.0,46A,74,Rain,6.99,1520803917,6,3,21,117,106.0
239,1520726400,6392515,810,75717,75740,75600,75633.0,46A,74,Clouds,6.95,1520802117,6,3,21,117,107.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845995,1525215600,6661770,810,73973,73900,73800,73806.0,46A,74,Clouds,8.18,1525289573,2,5,20,173,94.0
846010,1527375600,6838615,810,82019,81997,81900,81884.0,46A,74,Clouds,13.30,1527457619,6,5,22,119,113.0
846019,1527375600,6834430,810,69454,69648,69300,69420.0,46A,74,Clouds,16.16,1527445054,6,5,19,154,228.0
846111,1527375600,6841639,810,71254,71220,71100,71104.0,46A,74,Clouds,16.12,1527446854,6,5,19,154,116.0


In [33]:
df_weekday=pd.get_dummies(df_ml.weekday)
df_month=pd.get_dummies(df_ml.month)
df_hour=pd.get_dummies(df_ml.hour)
df_weather=pd.get_dummies(df_ml.weather)
df_stops=pd.get_dummies(df_ml.stop_id)

In [35]:
#trying to predict dur_s
y_prior=df_ml["dur_s"]
df_test_prior=pd.concat([df_weekday,df_month,df_hour,df_stops],axis=1)
X=df_test_prior
X_train,X_test,y_train,y_test=train_test_split(X,y_prior,random_state=1)
regr_relu=MLPRegressor(activation="relu",random_state=1, max_iter=100, verbose=1).fit(X_train, y_train)

Iteration 1, loss = 1473474.71818916
Iteration 2, loss = 71537.61385757
Iteration 3, loss = 30301.03353516
Iteration 4, loss = 30074.95613245
Iteration 5, loss = 29754.17051757
Iteration 6, loss = 29017.42552239
Iteration 7, loss = 27655.04655788
Iteration 8, loss = 25951.16837803
Iteration 9, loss = 24328.44739158
Iteration 10, loss = 22920.27918488
Iteration 11, loss = 21781.92184747
Iteration 12, loss = 20870.79341905
Iteration 13, loss = 19741.30282899
Iteration 14, loss = 17775.62654139
Iteration 15, loss = 15940.95709843
Iteration 16, loss = 14762.55091107
Iteration 17, loss = 13975.95417837
Iteration 18, loss = 13386.53698139
Iteration 19, loss = 12916.72066950
Iteration 20, loss = 12537.54644560
Iteration 21, loss = 12239.41250171
Iteration 22, loss = 12024.51198073
Iteration 23, loss = 11861.87884484
Iteration 24, loss = 11742.29621104
Iteration 25, loss = 11649.07811976
Iteration 26, loss = 11560.61593871
Iteration 27, loss = 11478.33028966
Iteration 28, loss = 11397.36019312



In [36]:
regr_relu.score(X_test, y_test)

0.9881124114587736

Above's neural network took about 10 mins to train. If schedule info isn't available elsewhere, we could use one predictor for the scheduled time and a second for the actual after the scheduled time was predicted.

In [40]:
reg = LinearRegression().fit(X_train, y_train)

In [41]:
reg.score(X_test, y_test)

0.9653054039628538

For comparison, a linear regressor takes about 5 secs to train and make predictions

In [42]:
y=df_ml["dur_a"]
df_test=df_ml.drop(["stop_id","weather","dur_a"],axis=1)

In [43]:
df_test.head()

Unnamed: 0,temp,weekday,month,hour,dur_s
0,3.08,2,2,16,0
1,3.08,2,2,16,93
2,3.08,2,2,16,147
3,3.08,2,2,16,233
4,3.08,2,2,17,286


In [45]:
df_test=df_test.drop(["weekday","month","hour"],axis=1)

In [46]:
df_test=pd.concat([df_test,df_weather,df_weekday,df_month,df_hour],axis=1)

In [47]:
df_test.shape

(804490, 42)

In [48]:
X=df_test

In [49]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1)

In [50]:
regr_relu=MLPRegressor(activation="relu",random_state=1, max_iter=100, verbose=1).fit(X_train, y_train)

Iteration 1, loss = 119204.66950077
Iteration 2, loss = 71929.71894800
Iteration 3, loss = 70293.85671083
Iteration 4, loss = 69369.41826543
Iteration 5, loss = 68973.24066100
Iteration 6, loss = 68594.58577374
Iteration 7, loss = 68641.13589531
Iteration 8, loss = 68392.02902936
Iteration 9, loss = 68335.92583998
Iteration 10, loss = 68103.48492222
Iteration 11, loss = 68105.74877054
Iteration 12, loss = 67933.58056800
Iteration 13, loss = 67843.77519515
Iteration 14, loss = 67728.06374080
Iteration 15, loss = 67692.75523335
Iteration 16, loss = 67499.14183942
Iteration 17, loss = 67448.09497662
Iteration 18, loss = 67436.78215532
Iteration 19, loss = 67222.37520219
Iteration 20, loss = 67137.80627570
Iteration 21, loss = 67196.58358438
Iteration 22, loss = 67033.63211109
Iteration 23, loss = 66910.67694706
Iteration 24, loss = 66881.95982112
Iteration 25, loss = 66881.48469560
Iteration 26, loss = 66880.34302119
Iteration 27, loss = 66716.85660354
Iteration 28, loss = 66619.22348658




In [51]:
regr_relu.score(X_test, y_test)

0.9376675098304893

Model above took about 12 mins to train

First model try took almost 2 hours to run, good starting value, but below 64k after around 100 iterations and ended up at 61k, converging after 430 iterations. accuracy of 0.937145. That model included stop_id dummy encoded and day instead of workday. none of the dates were dummy encoded.