In [39]:
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt

## Read cleaned csv

In [40]:
# Load the cleaned journey CSV into the working frame.
journeys_csv = 'combinedDF_MODELING_1.csv'
df = pd.read_csv(journeys_csv)

### Make sure the features are the correct datatypes

In [41]:
# Replace the index with a fresh 0..n-1 range, discarding the previous one.
df = df.reset_index(drop=True)

In [42]:
# Preview the loaded frame (pandas truncates the display of large frames).
df

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME
0,2018-01-01,5955344,25B,25B_271,2,33,1478,37184,37186,37210,1479.0,54.0
1,2018-01-01,5955346,25B,25B_271,2,33,1478,44934,44653,44653,1479.0,21.0
2,2018-01-01,5955349,25A,25A_270,2,33,1478,71051,70983,70983,1479.0,29.0
3,2018-01-01,5955351,25A,25A_270,2,33,1478,77408,77399,77399,1479.0,29.0
4,2018-01-01,5955353,25A,25A_270,2,33,1478,84608,84562,84562,1479.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7228363,2018-12-31,8591706,27,27_17,2,63,672,60037,60317,60328,4382.0,45.0
7228364,2018-12-31,8591908,15,15_17,2,52,672,32013,32487,32487,4382.0,16.0
7228365,2018-12-31,8591910,15,15_17,2,52,672,46010,45188,45210,4382.0,44.0
7228366,2018-12-31,8591912,15,15_17,2,52,672,58897,58718,58718,4382.0,21.0


In [43]:
# Convert DAYOFSERVICE to datetimes so the .dt accessors can be used on it.
service_dates = pd.to_datetime(df['DAYOFSERVICE'])
df['DAYOFSERVICE'] = service_dates

In [44]:
# Calendar month (1-12) of the service date
service_day = df['DAYOFSERVICE']
df['MONTH'] = service_day.dt.month

In [45]:
# Day of the week of the service date (Monday=0 ... Sunday=6)
df['WEEKDAY'] = df['DAYOFSERVICE'].dt.dayofweek

In [46]:
# Hour bucket: floor-divide ARR/DEP_PLAN by 3600 (60 * 60).
# NOTE(review): presumably ARR/DEP_PLAN is in seconds since midnight - confirm.
df['HOUR'] = df['ARR/DEP_PLAN'] // 3600

### Sort dataframe

In [47]:
# Order rows by service day, trip, line and stop pair, renumbering the index from 0.
sort_keys = ['DAYOFSERVICE', 'TRIPID', 'LINEID', 'startStop', 'endStop']
df = df.sort_values(by=sort_keys, ignore_index=True)

## Check for negative journey times

In [49]:
# Show every row whose journey time is below zero.
df.loc[df['JOURNEYTIME'].lt(0)]

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR
198706,2018-01-10,6103556,27A,27A_5,2,33,616,78469,78615,78627,617.0,-52.0,1,2,21
198708,2018-01-10,6103556,27A,27A_5,2,35,618,78539,81892,81892,619.0,-3179.0,1,2,21
262604,2018-01-13,6103433,37,37_15,2,50,1478,77931,78233,78233,1479.0,-21.0,1,5,21
416278,2018-01-21,6221375,83A,83A_26,2,20,1070,78035,79833,79833,1071.0,-1842.0,1,6,21
425012,2018-01-22,6214297,39,39_21,2,64,1478,41840,42439,42439,1479.0,-23.0,1,0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7032692,2018-12-19,8468292,145,145_105,2,70,1444,30510,38253,38253,1445.0,-7672.0,12,2,8
7078582,2018-12-21,8469032,15B,15B_60,1,13,1018,56124,69271,69271,1019.0,-12367.0,12,4,15
7123536,2018-12-24,8588235,31,31_18,2,47,614,59384,59725,59737,615.0,-398.0,12,0,16
7192053,2018-12-29,8587352,130,130_11,2,30,615,55246,55429,55429,616.0,-124.0,12,5,15


## If there are negative journey times, remove them

In [50]:
# Collect the index labels of the rows with a negative journey time
neg = df.index[df['JOURNEYTIME'] < 0].tolist()

In [51]:
# Remove the rows whose index labels were collected in `neg`.
df = df.drop(index=neg)

### Final check for negative journey times

In [52]:
# Sanity check: this selection is empty once no negative journey times remain.
df.loc[df['JOURNEYTIME'].lt(0)]

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR


In [53]:
# Renumber the index 0..n-1 so it has no gaps left by the dropped rows.
df = df.reset_index(drop=True)

### Check for null JOURNEYTIMEs

In [54]:
# Show every row whose journey time is missing.
df.loc[df['JOURNEYTIME'].isnull()]

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR


## Remove outliers in terms of journey time

In [17]:
# Preview the frame before the outlier filtering below.
df

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR
0,2018-01-01,5955277,16,16_20,1,26,44,31277,31102,31102,45.0,50.0,1,0,8
1,2018-01-01,5955277,16,16_20,1,28,45,31343,31152,31163,46.0,55.0,1,0,8
2,2018-01-01,5955277,16,16_20,1,29,46,31407,31207,31239,47.0,74.0,1,0,8
3,2018-01-01,5955277,16,16_20,1,30,47,31484,31281,31288,48.0,34.0,1,0,8
4,2018-01-01,5955277,16,16_20,1,31,48,31551,31315,31327,49.0,47.0,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11510228,2018-12-31,8592110,11,11_42,2,35,907,76743,77123,77123,908.0,14.0,12,0,21
11510229,2018-12-31,8592114,7A,7A_87,2,59,493,47228,47217,47236,494.0,59.0,12,0,13
11510230,2018-12-31,8592114,7A,7A_87,2,60,494,47316,47276,47296,495.0,74.0,12,0,13
11510231,2018-12-31,8592116,7A,7A_87,2,59,493,57610,58257,58273,494.0,67.0,12,0,16


### Remove outliers: journey times more than 3 standard deviations from the mean of their startStop/endStop pair

In [55]:
# Flag journey times lying more than 3 standard deviations away from the mean
# of their own (startStop, endStop) pair.
#
# The per-pair statistics are computed in one grouped pass rather than by
# re-filtering the whole frame once per stop and once per pair, and each
# mean/std is evaluated only once. A pair with a single row has an undefined
# (NaN) standard deviation, so both comparisons are False and nothing is
# flagged for it.
segmentTimes = df.groupby(['startStop', 'endStop'], sort=False)['JOURNEYTIME']
segmentMean = segmentTimes.transform('mean')
segmentSpread = 3 * segmentTimes.transform('std')

isOutlier = (
    (df['JOURNEYTIME'] > segmentMean + segmentSpread)
    | (df['JOURNEYTIME'] < segmentMean - segmentSpread)
)
# Rows with a missing stop id belong to no pair and are never flagged.
isOutlier &= df[['startStop', 'endStop']].notna().all(axis=1)

# Kept as a list of index objects so the flattening step that follows can
# consume it unchanged.
outliersIndexList = [df.index[isOutlier]]

In [56]:
# Flatten the collected index objects into one flat list of row labels.
OutliersIndexList = [label for segment in outliersIndexList for label in segment]

In [57]:
# OutliersIndexList holds index *labels*, so drop by label directly.
# Going through df.index[...] would treat the labels as positions, which only
# selects the right rows while the index is a fresh 0..n-1 range.
df.drop(index=OutliersIndexList, inplace=True)

In [58]:
# Renumber the index 0..n-1 after the outlier rows were removed.
df = df.reset_index(drop=True)

In [59]:
# Preview the frame after outlier removal and index reset.
df

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR
0,2018-01-01,5955277,16,16_20,1,26,44,31277,31102,31102,7603.0,16.0,1,0,8
1,2018-01-01,5955277,16,16_20,1,28,45,31343,31152,31163,46.0,55.0,1,0,8
2,2018-01-01,5955277,16,16_20,1,29,46,31407,31207,31239,47.0,74.0,1,0,8
3,2018-01-01,5955277,16,16_20,1,30,47,31484,31281,31288,48.0,34.0,1,0,8
4,2018-01-01,5955277,16,16_20,1,31,48,31551,31315,31327,49.0,47.0,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7158332,2018-12-31,8592110,11,11_42,2,49,17,77964,78738,78748,18.0,46.0,12,0,21
7158333,2018-12-31,8592110,11,11_42,2,50,18,78022,78784,78784,19.0,42.0,12,0,21
7158334,2018-12-31,8592110,11,11_42,2,51,19,78046,78826,78865,21.0,75.0,12,0,21
7158335,2018-12-31,8592114,7A,7A_87,2,61,495,47426,47350,47368,400.0,54.0,12,0,13


## Merge with weather dataframe
- create DAYHOUR to combine two dataframes on

In [60]:
# DAYHOUR = service day plus HOUR hours; it is the key the weather data is
# merged on. Lowercase 'h' is the supported unit alias; uppercase 'H' is
# deprecated in recent pandas releases.
df['DAYHOUR'] = df['DAYOFSERVICE'] + pd.to_timedelta(df['HOUR'], unit='h')

In [61]:
# Load the cleaned weather readings and parse DAYHOUR as datetimes so it can
# be matched against the DAYHOUR column of the journey frame.
weather_csv = 'weatherData_CLEANED.csv'
weather = pd.read_csv(weather_csv)
weather = weather.assign(DAYHOUR=pd.to_datetime(weather['DAYHOUR']))

- drop descriptive weather features

In [62]:
# Replace the weather index with a fresh 0..n-1 range.
weather = weather.reset_index(drop=True)

In [63]:
# Remove the descriptive weather columns; they are not used further on.
descriptive_columns = ['weather_main', 'weather_description']
weather = weather.drop(columns=descriptive_columns)

- add precipitation feature

In [64]:
# Replace missing rain and snow readings with 0
for column in ('rain_1h', 'snow_1h'):
    weather[column] = weather[column].fillna(0)

In [65]:
# Total hourly precipitation is the sum of the rain and snow readings.
weather['precipitation_1h'] = weather['rain_1h'].add(weather['snow_1h'])

In [66]:
# The separate rain and snow columns are now covered by precipitation_1h.
rain_snow_columns = ['rain_1h', 'snow_1h']
weather = weather.drop(columns=rain_snow_columns)

### Merge the bus data with the weather data on DAYHOUR (NaN rain and snow values were already changed to 0 above)

In [67]:
# Attach the weather readings to every journey row via the shared DAYHOUR key.
# The join type defaults to inner, so journey rows with no matching weather
# hour are dropped. validate='many_to_one' makes pandas raise a MergeError if
# the weather table holds more than one row for the same DAYHOUR, which would
# otherwise silently duplicate journey rows.
df = pd.merge(df, weather, on=['DAYHOUR'], validate='many_to_one')

# DAYHOUR was only needed as the join key. Assign the result so the column is
# actually removed: DataFrame.drop returns a new frame unless inplace=True.
df = df.drop(columns=['DAYHOUR'])
df

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PROGRNUMBER,startStop,ARR/DEP_PLAN,ARR_ACT,DEP_ACT,endStop,JOURNEYTIME,MONTH,WEEKDAY,HOUR,temp,humidity,wind_speed,precipitation_1h
0,2018-01-01,5955277,16,16_20,1,26,44,31277,31102,31102,7603.0,16.0,1,0,8,4.05,87,6.69,0.0
1,2018-01-01,5955277,16,16_20,1,28,45,31343,31152,31163,46.0,55.0,1,0,8,4.05,87,6.69,0.0
2,2018-01-01,5955277,16,16_20,1,29,46,31407,31207,31239,47.0,74.0,1,0,8,4.05,87,6.69,0.0
3,2018-01-01,5955277,16,16_20,1,30,47,31484,31281,31288,48.0,34.0,1,0,8,4.05,87,6.69,0.0
4,2018-01-01,5955277,16,16_20,1,31,48,31551,31315,31327,49.0,47.0,1,0,8,4.05,87,6.69,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7158332,2018-12-31,8590615,65B,65B_65,1,8,1017,21510,21435,21435,1018.0,16.0,12,0,5,8.86,81,3.10,0.0
7158333,2018-12-31,8590615,65B,65B_65,1,9,1018,21540,21451,21451,1019.0,10.0,12,0,5,8.86,81,3.10,0.0
7158334,2018-12-31,8590615,65B,65B_65,1,10,1019,21551,21461,21461,1020.0,12.0,12,0,5,8.86,81,3.10,0.0
7158335,2018-12-31,8590615,65B,65B_65,1,11,1020,21564,21473,21473,1076.0,18.0,12,0,5,8.86,81,3.10,0.0


In [68]:
# Keep only the columns written to the modelling file, in this order.
model_columns = [
    'LINEID', 'DIRECTION', 'MONTH', 'WEEKDAY', 'HOUR',
    'startStop', 'endStop', 'ARR_ACT', 'DEP_ACT', 'JOURNEYTIME',
    'temp', 'humidity', 'wind_speed', 'precipitation_1h',
]
df = df.loc[:, model_columns]

In [69]:
# Preview the final frame restricted to the selected columns.
df

Unnamed: 0,LINEID,DIRECTION,MONTH,WEEKDAY,HOUR,startStop,endStop,ARR_ACT,DEP_ACT,JOURNEYTIME,temp,humidity,wind_speed,precipitation_1h
0,16,1,1,0,8,44,7603.0,31102,31102,16.0,4.05,87,6.69,0.0
1,16,1,1,0,8,45,46.0,31152,31163,55.0,4.05,87,6.69,0.0
2,16,1,1,0,8,46,47.0,31207,31239,74.0,4.05,87,6.69,0.0
3,16,1,1,0,8,47,48.0,31281,31288,34.0,4.05,87,6.69,0.0
4,16,1,1,0,8,48,49.0,31315,31327,47.0,4.05,87,6.69,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7158332,65B,1,12,0,5,1017,1018.0,21435,21435,16.0,8.86,81,3.10,0.0
7158333,65B,1,12,0,5,1018,1019.0,21451,21451,10.0,8.86,81,3.10,0.0
7158334,65B,1,12,0,5,1019,1020.0,21461,21461,12.0,8.86,81,3.10,0.0
7158335,65B,1,12,0,5,1020,1076.0,21473,21473,18.0,8.86,81,3.10,0.0


## Save df as csv file

In [70]:
# Write the modelling frame to disk. index_label=False keeps the index values
# in each row but writes no header field for the index column.
output_csv = 'STOPS_MODELING_1.csv'
df.to_csv(output_csv, index_label=False)

In [71]:
# List the distinct startStop values in ascending order.
start_stops = df['startStop'].unique()
sorted(start_stops)

[14,
 15,
 17,
 18,
 19,
 44,
 45,
 46,
 47,
 48,
 49,
 51,
 495,
 496,
 515,
 516,
 519,
 521,
 522,
 614,
 615,
 616,
 617,
 618,
 619,
 664,
 665,
 666,
 667,
 668,
 672,
 1016,
 1017,
 1018,
 1019,
 1020,
 1069,
 1070,
 1071,
 1285,
 1443,
 1444,
 1476,
 1478,
 1479,
 4384,
 4528,
 7453,
 7603,
 7659]