# Analysing the delays in a morning peak

This notebook takes a all the raw data from a single delay, and determines the number of delays, and the worst delay for a set time period in the day.

In [1]:
import zipfile
import io

In [2]:
file_name = '20190123.zip'
with open(file_name, "rb") as f:
    z = zipfile.ZipFile(io.BytesIO(f.read()))

z.extractall()
print("Extracted file " + file_name)

Extracted file 20190123.zip


We have a few directories to dive into before we get to the good stuff...

In [3]:
import os
os.chdir('home/pi/sydney-transport-tracker/data/raw/20190123/')
files = os.listdir('.')
for name in files:
    print(name)

135001.pickle
021002.pickle
112402.pickle
164802.pickle
064002.pickle
084401.pickle
210001.pickle
070601.pickle
082201.pickle
042601.pickle
213202.pickle
170802.pickle
072602.pickle
091601.pickle
081201.pickle
234001.pickle
030401.pickle
121001.pickle
165001.pickle
102001.pickle
202801.pickle
220002.pickle
104402.pickle
073202.pickle
111001.pickle
040601.pickle
175202.pickle
235001.pickle
061201.pickle
100202.pickle
025802.pickle
093001.pickle
082602.pickle
080401.pickle
002601.pickle
055401.pickle
233402.pickle
015202.pickle
140401.pickle
191002.pickle
134201.pickle
164001.pickle
155801.pickle
151801.pickle
025002.pickle
235201.pickle
212602.pickle
021201.pickle
080001.pickle
153602.pickle
080602.pickle
121802.pickle
072201.pickle
074201.pickle
132001.pickle
085802.pickle
030202.pickle
095202.pickle
142402.pickle
123401.pickle
174002.pickle
190401.pickle
204601.pickle
071802.pickle
102602.pickle
014401.pickle
101602.pickle
031002.pickle
213401.pickle
052201.pickle
152602.pickle
093602

## What services are running today?
The services are determined in `calendar.txt`, with those service IDs used to filter out everything in `trips.txt`. From there we have trip IDs, which can be used to filter out the scheduled stop times in `stop_times.txt`.

`calendar.txt` -> `service_id` -> `trips.txt` -> `trip_id` -> `stop_times.txt` -> `arrival_time`, `departure_time`

In [4]:
import datetime
import pandas as pd
import csv

day_of_analysis = 'wednesday'
date_of_analysis = datetime.datetime.strptime('20190123', "%Y%m%d").date()
todays_services = []

with open('calendar.txt', mode='r', encoding='utf-8-sig') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    line_count = 0
    for row in csv_reader:
        if row[day_of_analysis] == '1':
            start_date = datetime.datetime.strptime(row['start_date'], "%Y%m%d").date()
            end_date = datetime.datetime.strptime(row['end_date'], "%Y%m%d").date()
            if start_date <= date_of_analysis <= end_date:
                todays_services.append(row['service_id'])

print("Todays services = " )
print(', '.join(todays_services))

Todays services = 
1260.122.112, 1260.122.116, 1260.122.120, 1260.122.124, 1260.122.16, 1260.122.20, 1260.122.24, 1260.122.28, 1260.122.48, 1260.122.52, 1260.122.56, 1260.122.60, 1260.122.80, 1260.122.84, 1260.122.88, 1260.122.92


In [5]:
df_trips = pd.read_csv('trips.txt',
                       header=0,
                       encoding='utf-8-sig',
                       usecols=["route_id", "service_id", "trip_id", "trip_short_name"])
df_filtered_trips = df_trips[df_trips['service_id'].isin(todays_services)]
pd.options.display.max_rows = 10
print(df_trips)

       route_id    service_id                         trip_id  trip_short_name
0        BNK_2a    1260.122.4    1--A.1260.122.4.M.8.55188158              NaN
1        BNK_2a   1260.122.48   1--A.1260.122.48.M.8.55188157              NaN
2        BNK_2a   1260.122.64   1--A.1260.122.64.M.8.55188157              NaN
3        BNK_2a    1260.122.8    1--A.1260.122.8.M.8.55188158              NaN
4      RTTA_DEF  1603.103.128  1--A.1603.103.128.M.8.54724494              NaN
...         ...           ...                             ...              ...
57308   CTY_W2a    1620.100.2    WT28.1620.100.2.X.5.55042064              NaN
57309   CTY_W2a    1620.100.4    WT28.1620.100.4.X.5.55042062              NaN
57310   CTY_W2a   483.101.120   WT28.483.101.120.X.5.55277012              NaN
57311   CTY_W2a    487.111.60    WT28.487.111.60.X.5.55310994              NaN
57312   CTY_W2a    487.111.64    WT28.487.111.64.X.5.55310994              NaN

[57313 rows x 4 columns]


Gosh, that's a lot of services... What are RTTA_DEF and RTTA_REV... and are those CountryLink services??? Let's filter out some of these services as we're only going to analyse what is going on in the general Sydney commuter network.

In [6]:
ROUTES_TO_IGNORE = ["CTY_NC1", "CTY_NC1a", "CTY_NC2", "CTY_NW1a", "CTY_NW1b", "CTY_NW1c", "CTY_NW1d",
                    "CTY_NW2a", "CTY_NW2b", "CTY_S1a", "CTY_S1b", "CTY_S1c", "CTY_S1d", "CTY_S1e",
                    "CTY_S1f", "CTY_S1g", "CTY_S1h", "CTY_S1i", "CTY_S2a", "CTY_S2b", "CTY_S2c",
                    "CTY_S2d", "CTY_S2e", "CTY_S2f", "CTY_S2g", "CTY_S2h", "CTY_S2i", "CTY_W1a",
                    "CTY_W1b", "CTY_W2a", "CTY_W2b", "HUN_1a", "HUN_1b", "HUN_2a", "HUN_2b",
                    "RTTA_DEF", "RTTA_REV"]
df_trips = df_trips[~df_trips['route_id'].isin(ROUTES_TO_IGNORE)]
print(df_trips)

      route_id   service_id                        trip_id  trip_short_name
0       BNK_2a   1260.122.4   1--A.1260.122.4.M.8.55188158              NaN
1       BNK_2a  1260.122.48  1--A.1260.122.48.M.8.55188157              NaN
2       BNK_2a  1260.122.64  1--A.1260.122.64.M.8.55188157              NaN
3       BNK_2a   1260.122.8   1--A.1260.122.8.M.8.55188158              NaN
6       BNK_2a  1603.103.60  1--A.1603.103.60.M.8.54724492              NaN
...        ...          ...                            ...              ...
57276    BMT_2   483.101.56   WN18.483.101.56.N.2.55278207              NaN
57277    BMT_2   483.101.64   WN18.483.101.64.N.2.55278207              NaN
57278    BMT_2    487.111.4    WN18.487.111.4.N.2.55308164              NaN
57279    BMT_2   487.111.56   WN18.487.111.56.N.2.55308164              NaN
57280    BMT_2   487.111.64   WN18.487.111.64.N.2.55308164              NaN

[47941 rows x 4 columns]


Now, the `stop_times.txt` file contains the stop files from across a number of days. We can filter out which services we want by only looking at trips that are running today.

In [7]:
df_stop_times = pd.read_csv('stop_times.txt', header=0,
                            encoding='utf-8-sig',
                            dtype={'stop_id': str},
                            usecols=["trip_id", "arrival_time", "departure_time", "stop_id"],
                            parse_dates=['arrival_time', 'departure_time'])

# remove any trips from stop_times that did NOT happen on this date
df_filtered_stop_times = df_stop_times[df_stop_times['trip_id'].isin(df_filtered_trips['trip_id'])]
print(df_filtered_stop_times)

                               trip_id arrival_time departure_time  stop_id
10       1--A.1260.122.48.M.8.55188157     03:52:00       03:52:00  2144243
11       1--A.1260.122.48.M.8.55188157     03:54:12       03:55:00  2141313
12       1--A.1260.122.48.M.8.55188157     03:57:30       03:57:30   214063
13       1--A.1260.122.48.M.8.55188157     03:58:42       03:58:42   214074
14       1--A.1260.122.48.M.8.55188157     04:01:24       04:01:24  2135234
...                                ...          ...            ...      ...
1033592  WT28.1260.122.60.X.5.55187037     20:26:24       20:26:24   214072
1033593  WT28.1260.122.60.X.5.55187037     20:27:36       20:29:12  2135232
1033594  WT28.1260.122.60.X.5.55187037     20:31:24       20:31:24   213491
1033595  WT28.1260.122.60.X.5.55187037     20:39:06       20:39:06  2015133
1033596  WT28.1260.122.60.X.5.55187037     20:42:24       23:34:00  2000325

[73135 rows x 4 columns]
