In [2]:
from google.transit import gtfs_realtime_pb2
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import contextily as cx
import matplotlib.pyplot as plt
from glob import glob
from tqdm.notebook import tqdm
import zipfile
from pathlib import Path
import py7zr
import csv

# Name of the line set to process. It selects the stops file
# ./data/2-working-data/{LINES_NAME}-stops.csv and is embedded in the name of
# every cleaned TripUpdate CSV written to ./data/3-cleaned-data/.
# Other value previously used here: 'tvarbanan'
# NOTE(review): this cell used to assign 'tvarbanan' and then immediately
# overwrite it; only the last assignment ever took effect, so it is the one kept.
LINES_NAME = 'caner-pendeltåg'

## Read in GTFS Static Data

In [3]:
# Stops selected for the line set named by LINES_NAME (one CSV per line set).
# The first CSV column is used as the index.
stops_raw = pd.read_csv(f"""./data/2-working-data/{LINES_NAME}-stops.csv""", index_col=0)

# Build the GeoDataFrame in one step from the raw frame instead of mutating it:
#  - stop_id is cast to str, because the TripUpdate cell compares it against
#    the string stop_id values carried by the realtime feed;
#  - the columns that are not needed downstream are dropped;
#  - the point geometry is built vectorised from lon/lat (x = lon, y = lat)
#    rather than with a row-wise apply.
stops = gpd.GeoDataFrame(
    stops_raw
    .astype({'stop_id': str})
    .drop(columns=['location_type', 'stop_lat', 'stop_lon', 'platform_code', 'parent_station']),
    geometry=gpd.points_from_xy(stops_raw.stop_lon, stops_raw.stop_lat),
    crs='EPSG:4326',
)

# Plot out and show stops on top of a basemap (the basemap is fetched in the stops' CRS)
ax = stops.plot()
cx.add_basemap(ax, crs=stops.crs)

stops.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/2-working-data/caner-pendeltåg-stops.csv'

## Read in all TripUpdates

In [7]:
# Collect the archives available for June 2021. glob() returns paths in
# arbitrary order, so sort before slicing to make "the first 14" reproducible.
# Kept under its own name: the original assigned this to `days` and then
# immediately overwrote it with the sample below, so it never had any effect.
all_days = sorted(glob("./data/0-koda-gtfs-rt/tu/2021-06-*.7z"))[:14]

# Sample: the five archives 2021-06-21 .. 2021-06-25.
# To process the globbed archives instead, use `days = all_days`.
days = [f"./data/0-koda-gtfs-rt/tu/2021-06-{day_of_month}.7z"
        for day_of_month in range(21, 26)]

days[:5], len(days)

(['./data/0-koda-gtfs-rt/tu/2021-06-21.7z',
  './data/0-koda-gtfs-rt/tu/2021-06-22.7z',
  './data/0-koda-gtfs-rt/tu/2021-06-23.7z',
  './data/0-koda-gtfs-rt/tu/2021-06-24.7z',
  './data/0-koda-gtfs-rt/tu/2021-06-25.7z'],
 5)

In [11]:
# For every daily archive in `days`: unpack the GTFS-realtime TripUpdate
# protobuf files, keep the stop_time_updates whose stop_id is one of the
# selected stops, and write them to one CSV per day in ./data/3-cleaned-data/.
# Days whose CSV already exists are skipped, so the cell can be re-run cheaply.

# Built once, outside all loops: the stop ids never change while the cell runs,
# and a set gives the same membership answer as the list without a linear scan
# (and without rebuilding the list) for every single stop_time_update.
wanted_stop_ids = set(stops.stop_id)

iterator = tqdm(days)
for day in iterator:

    # File name without directory and extension, e.g. "2021-06-21".
    # Path() does not depend on the directory depth or on '/' as separator.
    date_str = Path(day).stem
    out = f"""TU-{LINES_NAME}-{date_str[:4]}{date_str[5:7]}{date_str[8:10]}"""
    out_path = f"""./data/3-cleaned-data/{out}.csv"""

    if os.path.exists(out_path):
        print(out+" exists.")
        continue

    # Read every .pb member of the archive into memory (name -> file-like object)
    with py7zr.SevenZipFile(day, 'r') as archive:
        allfiles = archive.getnames()
        targets = [f for f in allfiles if f.endswith('.pb')]
        iterator.set_description(f"""{out}: Unpacking {len(targets)} files""")
        files = archive.read(targets)

    iterator.set_description(f"""{out}: Reading""")
    tu_data = []
    for target in tqdm(targets, leave=False):

        # Each .pb file is one serialized FeedMessage
        feed = gtfs_realtime_pb2.FeedMessage()
        feed.ParseFromString(files[target].read())

        # One output row per (entity, stop_time_update) pair at a selected stop
        for e in feed.entity:
            tu_data.extend(
                (e.trip_update.timestamp,
                 e.trip_update.trip.trip_id,
                 e.trip_update.trip.schedule_relationship,
                 stop_update.stop_id,
                 stop_update.arrival.time,
                 stop_update.arrival.delay,
                 stop_update.departure.time,
                 stop_update.departure.delay)
                for stop_update in e.trip_update.stop_time_update
                if stop_update.stop_id in wanted_stop_ids
            )

    iterator.set_description(f"""{out}: Writing CSV with {len(tu_data)} lines""")
    # Incorporate collected data into DataFrame.
    tu_df = pd.DataFrame(tu_data, columns=['timestamp', 'trip_id', 'rel', 'stop_id',
                                        'arrival_ts', 'arrival_delay',
                                        'departure_ts', 'departure_delay'])

    # Write to a temporary name and move it into place afterwards: if the run is
    # interrupted mid-write, no truncated CSV is left at out_path for the
    # "exists" check above to mistake for a finished day.
    tmp_path = out_path + ".part"
    tu_df.to_csv(tmp_path)
    os.replace(tmp_path, out_path)

    # Release the day's data before the next archive is loaded
    del tu_df
    del tu_data

  0%|          | 0/115 [00:00<?, ?it/s]

TU-20211031 exists.
TU-20211211 exists.
TU-20211005 exists.
TU-20210618 exists.
TU-20210608 exists.
TU-20211015 exists.
TU-20211201 exists.
TU-20211021 exists.
TU-20211001 exists.
TU-20211215 exists.
TU-20211221 exists.
TU-20210628 exists.
TU-20211025 exists.
TU-20211205 exists.
TU-20211011 exists.
TU-20211214 exists.
TU-20211220 exists.
TU-20210629 exists.
TU-20211024 exists.
TU-20211204 exists.
TU-20211010 exists.
TU-20211030 exists.
TU-20211224 exists.
TU-20211004 exists.
TU-20210619 exists.
TU-20210609 exists.
TU-20211014 exists.
TU-20211020 exists.
TU-20210626 exists.
TU-20210612 exists.
TU-20211101 exists.
TU-20211111 exists.
TU-20211125 exists.
TU-20210602 exists.
TU-20210616 exists.
TU-20210622 exists.
TU-20211105 exists.
TU-20211121 exists.
TU-20211115 exists.
TU-20210606 exists.
TU-20210617 exists.
TU-20210623 exists.
TU-20211104 exists.
TU-20211130 exists.
TU-20211120 exists.
TU-20211114 exists.
TU-20210607 exists.
TU-20210613 exists.
TU-20211110 exists.
TU-20211124 exists.
