# Merging delays with the schedules
With the daily schedule formatted into two tables in the first notebook, and delay information summarised into a single object in the second notebook, it's now time to merge the information so that we can easily get insights into how the trains ran that day.

## Full schedule with delays
We want today's timetable, but with the delays on a per stop basis.
Import the dataset formatted from the first notebook.

In [1]:
import pandas as pd
# Keep rendered tables short: show at most 10 rows of any frame.
pd.set_option('display.max_rows', 10)

# Load the per-stop timetable that the first notebook formatted and pickled.
df_stop_times = pd.read_pickle('stop_times.pickle')
df_stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id
10,1--A.1260.122.48.M.8.55188157,03:52:00,03:52:00,2144243
11,1--A.1260.122.48.M.8.55188157,03:54:12,03:55:00,2141313
12,1--A.1260.122.48.M.8.55188157,03:57:30,03:57:30,214063
13,1--A.1260.122.48.M.8.55188157,03:58:42,03:58:42,214074
14,1--A.1260.122.48.M.8.55188157,04:01:24,04:01:24,2135234
...,...,...,...,...
1032152,WN18.1260.122.56.N.2.55187511,22:23:00,22:23:00,279536
1032153,WN18.1260.122.56.N.2.55187511,22:49:00,22:49:00,27874
1032154,WN18.1260.122.56.N.2.55187511,23:04:30,23:04:30,2790154
1032155,WN18.1260.122.56.N.2.55187511,23:12:00,23:12:00,284515


Add columns for the arrival and departure delays, the actual arrival and departure times, and the schedule relationship. Mark them all as N/A to start with, so we can tell which stops never received a real-time update.

In [2]:
# (position, column name) pairs, applied in order so that each position
# accounts for the columns inserted before it.
placeholder_columns = [
    (2, 'arrival_delay'),
    (3, 'actual_arrival_time'),
    (5, 'departure_delay'),
    (6, 'actual_departure_time'),
    (7, 'schedule_relationship'),
]

# Every new column starts out as 'N/A'.
for position, column_name in placeholder_columns:
    df_stop_times.insert(position, column_name, 'N/A')

df_stop_times

Unnamed: 0,trip_id,arrival_time,arrival_delay,actual_arrival_time,departure_time,departure_delay,actual_departure_time,schedule_relationship,stop_id
10,1--A.1260.122.48.M.8.55188157,03:52:00,,,03:52:00,,,,2144243
11,1--A.1260.122.48.M.8.55188157,03:54:12,,,03:55:00,,,,2141313
12,1--A.1260.122.48.M.8.55188157,03:57:30,,,03:57:30,,,,214063
13,1--A.1260.122.48.M.8.55188157,03:58:42,,,03:58:42,,,,214074
14,1--A.1260.122.48.M.8.55188157,04:01:24,,,04:01:24,,,,2135234
...,...,...,...,...,...,...,...,...,...
1032152,WN18.1260.122.56.N.2.55187511,22:23:00,,,22:23:00,,,,279536
1032153,WN18.1260.122.56.N.2.55187511,22:49:00,,,22:49:00,,,,27874
1032154,WN18.1260.122.56.N.2.55187511,23:04:30,,,23:04:30,,,,2790154
1032155,WN18.1260.122.56.N.2.55187511,23:12:00,,,23:12:00,,,,284515


Then import the single delays object, and merge the data. Be warned, this is a very time consuming operation.

In [3]:
import pickle
import sys
sys.path.append('../')
from src.features.trip_objects import *
from src.features.trip_helper import *

# Load the single delays object summarised in the second notebook.
# NOTE: unpickling can execute arbitrary code, so only load pickles this project produced.
with open('merged_delays.pickle', 'rb') as delays_file:
    merged_delays = pickle.load(delays_file)

# Look up each trip's timetable rows once, instead of scanning the whole
# timetable again for every trip in the delays object.
timetable_rows_by_trip = df_stop_times.groupby('trip_id').groups

# iterate through all the trips we have delay information for
for trip in merged_delays.values():
    trip_rows = timetable_rows_by_trip.get(trip.trip_id)
    if trip_rows is None:
        # the trip has no rows in the timetable, so there is nothing to update
        continue
    df_stop_times_this_trip = df_stop_times.loc[trip_rows]

    for stop_time_update in trip.stop_time_updates.values():
        # some of these values might be 24:00, 25:00 etc to signify next day

        idx = df_stop_times_this_trip[(df_stop_times_this_trip['stop_id'] == stop_time_update.stop_id)].index
        if idx.empty:
            # it shouldn't be
            continue

        # .item() raises ValueError if the trip has more than one row for this stop
        idx = idx.item()

        # calculate the real time
        actual_arrival_time = update_time('20190124', df_stop_times_this_trip.at[idx, 'arrival_time'],
                                          stop_time_update.arrival_delay)
        actual_departure_time = update_time('20190124', df_stop_times_this_trip.at[idx, 'departure_time'],
                                            stop_time_update.departure_delay)

        # add the new values to the new columns
        df_stop_times.at[idx, 'arrival_delay'] = stop_time_update.arrival_delay
        df_stop_times.at[idx, 'actual_arrival_time'] = actual_arrival_time
        df_stop_times.at[idx, 'departure_delay'] = stop_time_update.departure_delay
        df_stop_times.at[idx, 'actual_departure_time'] = actual_departure_time
        df_stop_times.at[idx, 'schedule_relationship'] = stop_time_update.schedule_relationship

df_stop_times

Unnamed: 0,trip_id,arrival_time,arrival_delay,actual_arrival_time,departure_time,departure_delay,actual_departure_time,schedule_relationship,stop_id
10,1--A.1260.122.48.M.8.55188157,03:52:00,,,03:52:00,,,,2144243
11,1--A.1260.122.48.M.8.55188157,03:54:12,,,03:55:00,,,,2141313
12,1--A.1260.122.48.M.8.55188157,03:57:30,,,03:57:30,,,,214063
13,1--A.1260.122.48.M.8.55188157,03:58:42,,,03:58:42,,,,214074
14,1--A.1260.122.48.M.8.55188157,04:01:24,,,04:01:24,,,,2135234
...,...,...,...,...,...,...,...,...,...
1032152,WN18.1260.122.56.N.2.55187511,22:23:00,,,22:23:00,,,,279536
1032153,WN18.1260.122.56.N.2.55187511,22:49:00,,,22:49:00,,,,27874
1032154,WN18.1260.122.56.N.2.55187511,23:04:30,,,23:04:30,,,,2790154
1032155,WN18.1260.122.56.N.2.55187511,23:12:00,,,23:12:00,,,,284515


Let's see if we have any arrival delay statistics:

In [4]:
# Rows whose arrival delay is no longer the 'N/A' placeholder.
has_arrival_update = df_stop_times['arrival_delay'].ne('N/A')
df_stop_times[has_arrival_update]

Unnamed: 0,trip_id,arrival_time,arrival_delay,actual_arrival_time,departure_time,departure_delay,actual_departure_time,schedule_relationship,stop_id
18,1--A.1260.122.48.M.8.55188157,04:12:06,0,04:12:06,04:13:00,0,04:13:00,0,2015133
19,1--A.1260.122.48.M.8.55188157,04:15:30,0,04:15:30,04:17:00,0,04:17:00,0,2000336
143,1--B.1260.122.48.M.8.55188160,04:15:31,0,04:15:31,04:17:01,0,04:17:01,0,2000336
144,1--B.1260.122.48.M.8.55188160,04:19:48,0,04:19:48,04:20:48,0,04:20:48,0,2000396
145,1--B.1260.122.48.M.8.55188160,04:22:48,0,04:22:48,04:23:30,0,04:23:30,0,2000406
...,...,...,...,...,...,...,...,...,...
1031435,WN17.1260.122.56.N.2.55187512,19:05:06,0,19:05:06,19:05:36,0,19:05:36,0,2777192
1031444,WN17.1260.122.56.N.2.55187512,19:37:30,0,19:37:30,19:38:30,0,19:38:30,0,2780201
1031447,WN17.1260.122.56.N.2.55187512,19:57:30,0,19:57:30,19:58:00,0,19:58:00,0,278652
1031450,WN17.1260.122.56.N.2.55187512,20:23:00,314,20:28:14,20:24:00,284,20:28:44,0,2790142


In [5]:
# Persist the timetable together with its merged delay columns.
merged_stop_times_path = 'stop_times_merged.pickle'
df_stop_times.to_pickle(merged_stop_times_path)

## Trips
Now import the trips table, and merge the insights from the delay object into it.

In [6]:
# Load the trips table (one row per trip: route, service and trip ids).
trips_path = 'trips.pickle'
df_trips = pd.read_pickle(trips_path)
df_trips

Unnamed: 0,route_id,service_id,trip_id
1,BNK_2a,1260.122.48,1--A.1260.122.48.M.8.55188157
19,APS_1a,1260.122.48,1--B.1260.122.48.M.8.55188160
37,APS_2a,1260.122.48,1--C.1260.122.48.M.8.55188159
55,APS_1a,1260.122.48,1--D.1260.122.48.M.8.55188306
73,APS_2a,1260.122.48,1--E.1260.122.48.M.8.55188307
...,...,...,...
56917,BMT_1,1260.122.32,W597.1260.122.32.V.4.55188855
57214,BMT_1,1260.122.56,WN11.1260.122.56.N.2.55190142
57224,BMT_2,1260.122.56,WN12.1260.122.56.N.2.55188260
57244,BMT_1,1260.122.56,WN17.1260.122.56.N.2.55187512


Add columns for the start and end timestamps, the delay averages and maxima, and the schedule relationship of each trip.

In [7]:
# (position, column name, initial value) triples, applied in order so that
# each position accounts for the columns inserted before it.
# Timestamps start as 'N/A'; the delay figures and schedule relationship start at 0.
insight_columns = [
    (0, 'start_timestamp', 'N/A'),
    (1, 'end_timestamp', 'N/A'),
    (5, 'maximum_arrival_delay', 0),
    (6, 'average_arrival_delay', 0),
    (7, 'maximum_departure_delay', 0),
    (8, 'average_departure_delay', 0),
    (9, 'schedule_relationship', 0),
]

for position, column_name, initial_value in insight_columns:
    df_trips.insert(position, column_name, initial_value)

df_trips

Unnamed: 0,start_timestamp,end_timestamp,route_id,service_id,trip_id,maximum_arrival_delay,average_arrival_delay,maximum_departure_delay,average_departure_delay,schedule_relationship
1,,,BNK_2a,1260.122.48,1--A.1260.122.48.M.8.55188157,0,0,0,0,0
19,,,APS_1a,1260.122.48,1--B.1260.122.48.M.8.55188160,0,0,0,0,0
37,,,APS_2a,1260.122.48,1--C.1260.122.48.M.8.55188159,0,0,0,0,0
55,,,APS_1a,1260.122.48,1--D.1260.122.48.M.8.55188306,0,0,0,0,0
73,,,APS_2a,1260.122.48,1--E.1260.122.48.M.8.55188307,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
56917,,,BMT_1,1260.122.32,W597.1260.122.32.V.4.55188855,0,0,0,0,0
57214,,,BMT_1,1260.122.56,WN11.1260.122.56.N.2.55190142,0,0,0,0,0
57224,,,BMT_2,1260.122.56,WN12.1260.122.56.N.2.55188260,0,0,0,0,0
57244,,,BMT_1,1260.122.56,WN17.1260.122.56.N.2.55187512,0,0,0,0,0


Add the insights for each trip to the corresponding row of the table.

In [8]:
# Copy each trip's summary figures from the delays object into its row of df_trips.
for trip in merged_delays.values():

    matching_rows = df_trips.loc[df_trips['trip_id'] == trip.trip_id].index
    if matching_rows.empty:
        # it shouldn't be
        continue

    # .item() expects exactly one matching row per trip_id
    row_label = matching_rows.item()

    # (column, method that produces its value) pairs, evaluated in this order
    insight_sources = (
        ('maximum_arrival_delay', trip.maximum_arrival_delay),
        ('average_arrival_delay', trip.average_arrival_delay),
        ('maximum_departure_delay', trip.maximum_departure_delay),
        ('average_departure_delay', trip.average_departure_delay),
        ('schedule_relationship', trip.overall_schedule_relationship),
    )
    for column_name, compute_insight in insight_sources:
        df_trips.at[row_label, column_name] = compute_insight()

df_trips

Unnamed: 0,start_timestamp,end_timestamp,route_id,service_id,trip_id,maximum_arrival_delay,average_arrival_delay,maximum_departure_delay,average_departure_delay,schedule_relationship
1,,,BNK_2a,1260.122.48,1--A.1260.122.48.M.8.55188157,0,0,0,0,0
19,,,APS_1a,1260.122.48,1--B.1260.122.48.M.8.55188160,26,2,4,0,0
37,,,APS_2a,1260.122.48,1--C.1260.122.48.M.8.55188159,0,0,0,0,0
55,,,APS_1a,1260.122.48,1--D.1260.122.48.M.8.55188306,0,0,0,0,0
73,,,APS_2a,1260.122.48,1--E.1260.122.48.M.8.55188307,59,13,59,24,0
...,...,...,...,...,...,...,...,...,...,...
56917,,,BMT_1,1260.122.32,W597.1260.122.32.V.4.55188855,0,0,0,0,0
57214,,,BMT_1,1260.122.56,WN11.1260.122.56.N.2.55190142,0,0,0,0,0
57224,,,BMT_2,1260.122.56,WN12.1260.122.56.N.2.55188260,270,95,327,107,0
57244,,,BMT_1,1260.122.56,WN17.1260.122.56.N.2.55187512,314,66,284,31,0


For every trip in the table, add the full start and end timestamps. This makes it easier to deal with information when a trip runs from very late to very early the next day, and the dates change.

In [9]:
# Split the departure times by trip once, instead of filtering the whole
# stop-times table again for every trip. Rows keep their original order
# within each trip, so the first and last entries are unchanged.
departure_times_by_trip = dict(iter(df_stop_times.groupby('trip_id')['departure_time']))

for i in df_trips.index:
    departure_series = departure_times_by_trip.get(df_trips.at[i, 'trip_id'])
    if departure_series is None or len(departure_series) < 2:
        # trips with fewer than two timetable rows keep their 'N/A' timestamps
        continue
    # the start is the first listed departure, the end is the last listed departure
    df_trips.at[i, 'start_timestamp'] = convert_to_timestamp('20190124', departure_series.iloc[0])
    df_trips.at[i, 'end_timestamp'] = convert_to_timestamp('20190124', departure_series.iloc[-1])

df_trips

Unnamed: 0,start_timestamp,end_timestamp,route_id,service_id,trip_id,maximum_arrival_delay,average_arrival_delay,maximum_departure_delay,average_departure_delay,schedule_relationship
1,2019-01-24 03:52:00,2019-01-24 04:17:00,BNK_2a,1260.122.48,1--A.1260.122.48.M.8.55188157,0,0,0,0,0
19,2019-01-24 04:17:01,2019-01-24 04:58:00,APS_1a,1260.122.48,1--B.1260.122.48.M.8.55188160,26,2,4,0,0
37,2019-01-24 04:58:01,2019-01-24 05:17:00,APS_2a,1260.122.48,1--C.1260.122.48.M.8.55188159,0,0,0,0,0
55,2019-01-24 05:17:01,2019-01-24 05:58:00,APS_1a,1260.122.48,1--D.1260.122.48.M.8.55188306,0,0,0,0,0
73,2019-01-24 05:58:01,2019-01-24 06:35:00,APS_2a,1260.122.48,1--E.1260.122.48.M.8.55188307,59,13,59,24,0
...,...,...,...,...,...,...,...,...,...,...
56917,2019-01-25 01:18:01,2019-01-25 04:18:00,BMT_1,1260.122.32,W597.1260.122.32.V.4.55188855,0,0,0,0,0
57214,2019-01-24 03:42:00,2019-01-24 05:46:00,BMT_1,1260.122.56,WN11.1260.122.56.N.2.55190142,0,0,0,0,0
57224,2019-01-24 05:46:01,2019-01-24 11:02:00,BMT_2,1260.122.56,WN12.1260.122.56.N.2.55188260,270,95,327,107,0
57244,2019-01-24 17:47:01,2019-01-24 22:17:00,BMT_1,1260.122.56,WN17.1260.122.56.N.2.55187512,314,66,284,31,0


What were the worst delays?

In [10]:
# Rank trips from the largest maximum departure delay downwards and show the top ten.
trips_by_departure_delay = df_trips.sort_values(by=['maximum_departure_delay'], ascending=False)
trips_by_departure_delay.head(n=10)

Unnamed: 0,start_timestamp,end_timestamp,route_id,service_id,trip_id,maximum_arrival_delay,average_arrival_delay,maximum_departure_delay,average_departure_delay,schedule_relationship
17000,2019-01-24 08:33:01,2019-01-24 08:59:00,BNK_2b,1260.122.124,21-B.1260.122.124.S.8.55187552,3406,2937,3447,2959,0
16297,2019-01-24 08:30:01,2019-01-24 10:00:00,APS_1a,1260.122.48,2--D.1260.122.48.M.8.55189529,3075,1064,3045,959,1
41994,2019-01-24 10:01:00,2019-01-24 10:19:00,BNK_1c,1260.122.56,89-E.1260.122.56.A.8.55186562,3011,2784,2956,1551,0
55725,2019-01-24 03:00:01,2019-01-24 06:23:00,BMT_2,1260.122.48,W506.1260.122.48.V.4.55186351,2740,2058,2740,2096,0
16995,2019-01-24 07:14:00,2019-01-24 08:33:00,IWL_2d,1260.122.124,21-A.1260.122.124.S.8.55187550,2745,408,2715,492,0
109,2019-01-24 07:42:01,2019-01-24 08:24:00,IWL_2b,1260.122.48,1--G.1260.122.48.M.8.55187600,2581,151,2551,150,0
43774,2019-01-24 08:39:01,2019-01-24 09:41:30,BNK_1a,1260.122.60,94-F.1260.122.60.A.8.55189687,2434,1455,2434,1448,0
16281,2019-01-24 07:11:01,2019-01-24 08:30:00,IWL_2d,1260.122.48,2--C.1260.122.48.M.8.55189530,2435,86,2405,85,0
43765,2019-01-24 07:57:01,2019-01-24 08:39:00,IWL_2b,1260.122.60,94-E.1260.122.60.A.8.55189686,2386,140,2356,138,0
22999,2019-01-24 08:18:01,2019-01-24 08:43:00,BNK_2b,1260.122.56,47-F.1260.122.56.K.8.55187941,2356,2302,2326,1965,0


The worst departure delay was 3447 seconds — just over 57 minutes!

In [11]:
# Persist the trips table together with its delay insights and timestamps.
trips_insights_path = 'trips_insights.pickle'
df_trips.to_pickle(trips_insights_path)

In [12]:
import matplotlib.pyplot as plt
# Plot each trip's maximum arrival delay against its start timestamp.
delay_columns = ["maximum_arrival_delay"]
df_trips.plot(x="start_timestamp", y=delay_columns)

<matplotlib.axes._subplots.AxesSubplot at 0x7f9312c8f470>

In [13]:
# Trips that start after 06:00 on 2019-01-24. pd.Timestamp is used for the
# cutoff so the cell does not depend on a `datetime` name that this notebook
# never imports explicitly.
morning_cutoff = pd.Timestamp(2019, 1, 24, 6, 0)

# Drop trips whose start_timestamp is still the 'N/A' placeholder first;
# comparing that string with a timestamp would raise a TypeError.
df_trips_with_start = df_trips[df_trips['start_timestamp'] != 'N/A']
df_trips_morning = df_trips_with_start[df_trips_with_start['start_timestamp'] > morning_cutoff]

# Allow up to 70 rows to render for this larger listing.
pd.options.display.max_rows = 70

# Routes whose id contains "IWL", earliest start first.
df_trips_morning[df_trips_morning['route_id'].str.contains("IWL")].sort_values(by=['start_timestamp']).head(n=70)

Unnamed: 0,start_timestamp,end_timestamp,route_id,service_id,trip_id,maximum_arrival_delay,average_arrival_delay,maximum_departure_delay,average_departure_delay,schedule_relationship
33254,2019-01-24 06:02:01,2019-01-24 06:47:00,IWL_2b,1260.122.60,7--E.1260.122.60.M.8.55188300,57,3,57,4,0
22972,2019-01-24 06:05:01,2019-01-24 07:20:00,IWL_2d,1260.122.56,47-C.1260.122.56.K.8.55186830,0,0,26,1,0
37543,2019-01-24 06:05:01,2019-01-24 07:12:00,IWL_1a,1260.122.32,8--D.1260.122.32.M.8.55186098,95,4,95,3,0
40907,2019-01-24 06:12:01,2019-01-24 07:54:00,IWL_1c,1260.122.40,86-D.1260.122.40.A.8.55189061,70,7,47,5,0
22455,2019-01-24 06:14:00,2019-01-24 07:33:00,IWL_2d,1260.122.120,44-A.1260.122.120.K.8.55189188,28,2,23,1,0
39922,2019-01-24 06:17:01,2019-01-24 07:02:00,IWL_2b,1260.122.124,84-D.1260.122.124.A.8.55185955,33,2,6,0,0
19913,2019-01-24 06:20:01,2019-01-24 07:27:00,IWL_1a,1260.122.120,3--E.1260.122.120.M.8.55189646,28,3,28,4,0
2253,2019-01-24 06:24:00,2019-01-24 07:42:00,IWL_2d,1260.122.100,11-A.1260.122.100.M.8.55187627,71,5,71,5,0
41295,2019-01-24 06:27:01,2019-01-24 08:09:00,IWL_1c,1260.122.56,87-D.1260.122.56.A.8.55187081,75,13,75,14,0
44515,2019-01-24 06:32:01,2019-01-24 07:51:00,IWL_2d,1260.122.32,97-C.1260.122.32.A.8.55187090,14,0,14,0,0
