In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import glob

In [None]:
# Locate the capture file for the day under analysis. The glob is evaluated
# once so the check and the read are guaranteed to see the same result.
csv_file_glob = 'data/*2023-04-09*.csv'
csv_paths = glob.glob(csv_file_glob)
assert len(csv_paths) == 1, (
    f'expected exactly one file matching {csv_file_glob!r}, found {csv_paths}'
)
df = pd.read_csv(csv_paths[0])

# Columns that are parsed as ISO 8601 timestamps so they can be subtracted
# from one another later on.
date_fields = ['fetch_time', 'projected_arrival', 'last_updated', 'door_close', 'scheduled_departure']
for field in date_fields:
    df[field] = pd.to_datetime(df[field], format='ISO8601')

In [None]:
# Reference timestamps for the trip: the most frequent value of each column
# (every row is expected to carry the same value).
door_close = df['door_close'].mode()[0]
scheduled_departure = df['scheduled_departure'].mode()[0]

# Each lateness column is "event time minus scheduled departure", in minutes.
lateness_sources = {
    'predicted_lateness_min': 'projected_arrival',
    'actual_lateness_min': 'door_close',
}
for lateness_col, event_col in lateness_sources.items():
    delay = df[event_col] - df['scheduled_departure']
    df[lateness_col] = delay.dt.total_seconds().div(60)

In [None]:
# Difference between predicted and actual lateness on the last row of the frame.
last_row = df.iloc[-1]
print(
    'Final error (minutes) in predicted vs actual lateness:',
    last_row['predicted_lateness_min'] - last_row['actual_lateness_min'],
)

In [None]:
def make_graph(df, lateness=False, walk_time_min=10):
    """Plot schedule, API predictions and actual lateness against fetch time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``api``, ``fetch_time``,
        ``scheduled_departure``, ``door_close``, ``predicted_lateness_min``
        and ``actual_lateness_min``. Rows are split by ``api`` into
        ``'OFFICIAL'`` and ``'MRAZZA'``.
    lateness : bool, default False
        If True, y values are plotted as stored (minutes relative to the
        scheduled departure). If False, ``scheduled_departure - x`` (in
        minutes) is added to every y value, i.e. the curves show minutes
        remaining as seen from each x timestamp.
    walk_time_min : float, default 10
        Minutes subtracted when computing the "leave time" line.

    Returns
    -------
    None
        The figure is drawn on a newly created matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Derive the reference timestamps from the frame that was passed in rather
    # than relying on notebook-level globals defined in another cell.
    scheduled_departure = df['scheduled_departure'].mode()[0]
    door_close = df['door_close'].mode()[0]

    df_official = df[df['api'] == 'OFFICIAL']
    df_mrazza = df[df['api'] == 'MRAZZA']

    # leave time: for a lateness of y minutes, the matching timestamp is
    # scheduled_departure + y - walk_time_min. Sampled at two points that
    # bracket the observed range so it draws as a straight line. (With
    # lateness=False the offset below turns it into a horizontal line at
    # walk_time_min.)
    ys = [-1, df.iloc[-1]['actual_lateness_min'] + 1]
    ts = [scheduled_departure + pd.Timedelta(minutes=y - walk_time_min) for y in ys]

    plots = [
        {
            'x': df_official['fetch_time'],
            'y': 0,
            'color': 'yellow',
            'label': 'schedule',
        },
        {
            'x': df_official['fetch_time'],
            'y': df_official['predicted_lateness_min'],
            'color': 'blue',
            'label': '"official" API',
        },
        {
            'x': df_mrazza['fetch_time'],
            'y': df_mrazza['predicted_lateness_min'],
            'color': 'green',
            'label': 'mrazza API',
        },
        {
            'x': df_official['fetch_time'],
            'y': df_official['actual_lateness_min'],
            'color': 'orange',
            'label': 'actual',
        },
        {
            'x': pd.Series(ts),
            'y': pd.Series(ys),
            'color': 'purple',
            'label': f'leave time ({walk_time_min} min walk)',
        },
    ]

    for plot in plots:
        y_values = plot['y']
        if not lateness:
            # Build a new object instead of using `+=`: the in-place form
            # would modify the column Series of the filtered frames and can
            # raise SettingWithCopyWarning.
            y_values = y_values + (scheduled_departure - plot['x']).dt.total_seconds() / 60

        sns.lineplot(x=plot['x'], y=y_values, color=plot['color'], label=plot['label'], ax=ax)

    # actual departure (door close) time
    ax.axvline(door_close, color='red', linestyle='--', label='door close')

    # Explicit labels so the figure does not inherit whichever Series name
    # happened to be plotted first.
    ax.set_xlabel('fetch time')
    ax.set_ylabel(
        'lateness vs. schedule (min)' if lateness else 'minutes remaining at fetch time'
    )

    ax.legend(loc='lower left')
    

# Draw the chart in lateness mode: y values are plotted as stored, without the
# `scheduled_departure - x` offset that the default mode adds.
make_graph(df, lateness=True)