# Analysing the delays in a morning peak

This notebook takes all the raw data from a single day and determines the number of delays, and the worst delay, for a set time period in that day.

In [None]:
import zipfile
import io

In [None]:
# Archive holding one day's worth of raw data; it is unpacked into the
# current working directory.
file_name = '20190123.zip'

# Let zipfile open the archive by path (no need to copy the whole file into an
# in-memory buffer first) and use it as a context manager so the underlying
# file handle is closed as soon as extraction finishes.
with zipfile.ZipFile(file_name) as z:
    z.extractall()

print("Extracted file " + file_name)

We have a few directories to dive into before we get to the good stuff...

In [None]:
import os

# Location of the day's raw files, relative to the directory the archive was
# extracted into. normpath drops the trailing slash and uses the platform's
# separator so the value can be compared against os.getcwd().
data_dir = os.path.normpath('home/pi/sydney-transport-tracker/data/raw/20190123/')

# os.chdir changes state for the whole kernel and the path is relative, so
# running this cell a second time would raise FileNotFoundError. Only change
# directory when we are not already inside the data directory.
if not os.getcwd().endswith(os.sep + data_dir):
    os.chdir(data_dir)

# os.listdir returns entries in arbitrary order; sort for a stable listing.
files = sorted(os.listdir('.'))
for name in files:
    print(name)

## What services are running today?
The services running on a given day are defined in `calendar.txt`; those service IDs are then used to filter `trips.txt` down to just the trips that run today.

In [None]:
import datetime
import pandas as pd
import csv

# calendar.txt is read below through a per-weekday column holding '1' when a
# service runs on that day. The names are ordered so the list can be indexed
# with date.weekday() (Monday == 0); names follow the GTFS calendar.txt layout.
WEEKDAY_COLUMNS = ['monday', 'tuesday', 'wednesday', 'thursday',
                   'friday', 'saturday', 'sunday']
DATE_FORMAT = "%Y%m%d"

date_of_analysis = datetime.datetime.strptime('20190123', DATE_FORMAT).date()
# Derive the weekday column from the date itself so the two can never drift
# apart when the date is changed (20190123 resolves to 'wednesday').
day_of_analysis = WEEKDAY_COLUMNS[date_of_analysis.weekday()]

# service_id values (as strings) of every service that runs on the weekday of
# interest and whose [start_date, end_date] range contains date_of_analysis.
# NOTE(review): GTFS feeds may also add/remove services for specific dates via
# calendar_dates.txt; that file is not consulted here - confirm whether it matters.
todays_services = []

# 'utf-8-sig' strips a leading byte-order mark, if present, so the first
# header name is read cleanly.
with open('calendar.txt', mode='r', encoding='utf-8-sig') as csv_file:
    for row in csv.DictReader(csv_file):
        if row[day_of_analysis] != '1':
            continue  # service does not run on this weekday
        start_date = datetime.datetime.strptime(row['start_date'], DATE_FORMAT).date()
        end_date = datetime.datetime.strptime(row['end_date'], DATE_FORMAT).date()
        if start_date <= date_of_analysis <= end_date:
            todays_services.append(row['service_id'])

print("Todays services = " )
print(', '.join(todays_services))

In [None]:
# Load only the columns needed from trips.txt. service_id is forced to text:
# todays_services holds strings (it was read with csv.DictReader), and if
# pandas inferred a numeric dtype for the column, isin() would match nothing.
df = pd.read_csv('trips.txt',
                 header=0,
                 encoding='utf-8-sig',
                 usecols=["route_id", "service_id", "trip_id", "trip_short_name"],
                 dtype={"service_id": str})

# Keep only the trips that belong to a service running today.
df = df[df['service_id'].isin(todays_services)]
print(df)

Gosh, that's a lot of services... What are RTTA_DEF and RTTA_REV... and are those CountryLink services??? Let's filter out some of these services as we're only going to analyse what is going on in the general Sydney commuter network.

In [None]:
# Route IDs excluded from the analysis, grouped by prefix for readability.
# NOTE(review): the CTY_* entries are presumably the CountryLink routes and
# HUN_* presumably Hunter routes; the meaning of RTTA_DEF / RTTA_REV is not
# established anywhere in this notebook - confirm.
ROUTES_TO_IGNORE = [
    "CTY_NC1", "CTY_NC1a", "CTY_NC2",
    "CTY_NW1a", "CTY_NW1b", "CTY_NW1c", "CTY_NW1d",
    "CTY_NW2a", "CTY_NW2b",
    "CTY_S1a", "CTY_S1b", "CTY_S1c", "CTY_S1d", "CTY_S1e",
    "CTY_S1f", "CTY_S1g", "CTY_S1h", "CTY_S1i",
    "CTY_S2a", "CTY_S2b", "CTY_S2c", "CTY_S2d", "CTY_S2e",
    "CTY_S2f", "CTY_S2g", "CTY_S2h", "CTY_S2i",
    "CTY_W1a", "CTY_W1b", "CTY_W2a", "CTY_W2b",
    "HUN_1a", "HUN_1b", "HUN_2a", "HUN_2b",
    "RTTA_DEF", "RTTA_REV",
]

# Boolean mask of trips on an ignored route; keep everything else.
is_ignored_route = df['route_id'].isin(ROUTES_TO_IGNORE)
df = df[~is_ignored_route]
print(df)