<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [9]</a>'.</span>

In [1]:
from utz import *
from numpy import dtype
from tabula import read_pdf

In [2]:
# Default notebook parameters; a later "# Parameters" cell may override them.
year = 2022  # report year; selects data/{year}-PATH-hourly-Ridership-Report.pdf
last_month = None  # last month (1-12) to read; None means read through month 12
n_jobs = 4  # forwarded to Parallel(n_jobs=...) when parsing a month's stations
overwrite = False  # True: regenerate the output .pqt files even if all of them exist

In [3]:
# Parameters
# NOTE(review): this looks like a papermill-injected cell; it overrides the default `year` for this run.
year = 2015


In [4]:
# 2022 data only committed here through September, so cap the default range there
# (an explicitly supplied `last_month` is left alone).
if last_month is None and year == 2022:
    last_month = 9

In [5]:
# Station names; a station's index in this list is used as its page offset
# within a month's section of the PDF (see `station_offsets`).
# The final 'System-wide' entry is handled like a station when reading pages.
stations = [
    'Christopher St.',
    '9th Street',
    '14th Street',
    '23rd Street',
    '33rd Street',
    'WTC',
    'Newark',
    'Harrison',
    'Journal Square',
    'Grove Street',
    'Exchange Place',
    'Newport',
    'Hoboken',
    'System-wide',
]
# NOTE(review): presumably the 1-based PDF page numbers of the title page, table of
# contents, and year-to-date section; not referenced by the visible code — TODO confirm.
title = 1
contents = 2
ytd = 3
num_stations = len(stations)  # includes the 'System-wide' entry
section_pages = num_stations + 1  # one page per station, plus a title page
def month_page_range(month):
    """Return the half-open page range ``(start, end)`` holding a month's station pages.

    `month` is the 1-based month number. `start` is the page of the first station
    in that month's section, and `end - start == num_stations`.
    """
    # NOTE(review): the constant 4 is presumably the first page after the front
    # matter; each section then spans `section_pages` pages — confirm against the PDF.
    first_page = 4 + section_pages * month
    return first_page, first_page + num_stations

# Preview every month's half-open page range, e.g. "[19, 33)" for January.
print(' '.join('[{}, {})'.format(*month_page_range(month)) for month in range(1, 13)))

[19, 33) [34, 48) [49, 63) [64, 78) [79, 93) [94, 108) [109, 123) [124, 138) [139, 153) [154, 168) [169, 183) [184, 198)


In [6]:
# Map each station name to its position in `stations`; that position is the
# station's page offset within a month's section.
station_offsets = dict(zip(stations, range(len(stations))))
station_offsets

{'Christopher St.': 0,
 '9th Street': 1,
 '14th Street': 2,
 '23rd Street': 3,
 '33rd Street': 4,
 'WTC': 5,
 'Newark': 6,
 'Harrison': 7,
 'Journal Square': 8,
 'Grove Street': 9,
 'Exchange Place': 10,
 'Newport': 11,
 'Hoboken': 12,
 'System-wide': 13}

In [7]:
# Load the Tabula selection rectangles (one dict per region, with x1/x2/y1/y2
# bounds) that are later used as extraction areas on each station page.
template_path = 'templates/2022-PATH-hourly-Ridership-Report.tabula-template.json'
with open(template_path) as template_file:
    rects = json.load(template_file)
rects

[{'page': 19,
  'extraction_method': 'guess',
  'x1': 0.495,
  'x2': 780.615,
  'y1': 126.225,
  'y2': 558.855,
  'width': 780.12,
  'height': 432.63},
 {'page': 19,
  'extraction_method': 'guess',
  'x1': 257.895,
  'x2': 535.095,
  'y1': 14.355,
  'y2': 77.715,
  'width': 277.2,
  'height': 63.36},
 {'page': 19,
  'extraction_method': 'guess',
  'x1': 83.655,
  'x2': 205.425,
  'y1': 98.505,
  'y2': 121.275,
  'width': 121.77,
  'height': 22.77}]

In [8]:
# Patterns for the two sub-title lines expected in each page header.
# Raw strings keep the regex escapes (`\(`, `\w`, `\d`, `\-`) from being treated
# as invalid Python string escapes, which newer Python versions warn about.
based_on_regex = r'\(Based on (?P<month>\w+) (?P<year>\d{4}) Turnstile Count\)'
# Accepts either the Unicode hyphen "‐" or ASCII "-", with or without the word "Entry".
cross_honor_regex = r'\(Cross[‐\-]honor (?:Entry )?Count not Included\)'

def clean(s):
    """Normalize Unicode hyphens in `s` to ASCII hyphen-minus.

    Some years have "‐" (U+2010, ord("‐") == 8208) instead of "-" (ord("-") == 45)
    in various titles/messages; every such character is replaced with "-".
    """
    return s.replace('‐', '-')

def read_station_month_hours_tables(year, month, station):
    """Extract the raw tables for one station's page in one month of a year's report.

    The page number is the first page of the month's range plus the station's
    offset. `read_pdf` is called once per rectangle in `rects`, and the results
    are returned in the same order as `rects`.
    """
    offset = station_offsets[station]
    pdf = f'data/{year}-PATH-hourly-Ridership-Report.pdf'
    first_page, _ = month_page_range(month)
    pg = first_page + offset
    month_name = to_dt('%d-%02d' % (year, month)).strftime('%B')
    print(f'Reading {pdf}, pg. {pg}: {month_name}, {station}')

    tables = []
    for rect in rects:
        # The area is passed in y1, x1, y2, x2 order (top, left, bottom, right).
        area = [ rect['y1'], rect['x1'], rect['y2'], rect['x2'] ]
        extracted = read_pdf(
            pdf,
            pages=pg,
            area=area,
            pandas_options={'header': None},
        )
        tables.append(extracted)
    return tables

def to_hour(r):
    """Convert a row's 12-hour clock fields (`hour`, `am`) to an hour of day.

    An `hour` of 12 counts as 0, and 12 is added unless `am` equals 'AM'.
    """
    hour = r['hour']
    meridiem = r['am']
    if hour == 12:
        hour = 0
    if meridiem == 'AM':
        return hour
    return hour + 12

def parse_station_month_hours_tables(year, month, station):
    """Parse and validate one station's hourly table for one month.

    Reads the three page regions (hourly table, header block, station label),
    checks that the labels match the requested `year`, `month` and `station`,
    and then cleans the hourly table.

    Returns a tuple ``(hrs, totals, system_wide)``:
      - `totals`: rows whose `Hour` is 'Total' (split off before the other two).
      - `system_wide`: hourly rows when `station` is 'System-wide', else empty.
      - `hrs`: hourly rows for every other station, else empty.
    `Hour` is an integer in `hrs` and `system_wide`.

    Raises:
      RuntimeError: a page label (station, title, "based on" line, cross-honor
        line) is not what was expected, or a data column has an unexpected dtype.
      ValueError: a region did not yield exactly one table of the expected
        shape (raised by the destructuring assignments).
    """
    # Each region must yield exactly one table, hence the single-element unpacking.
    [hrs], [header], [actual_station] = read_station_month_hours_tables(year, month, station)
    [[actual_station]] = actual_station.values
    actual_station = clean(actual_station)  # "System-wide" can have either dash character
    if actual_station != station:
        raise RuntimeError(f"Parsed station {actual_station} != {station}")

    # The header block is expected to hold exactly three single-cell rows.
    [[title], [based_on_msg], [cross_msg]] = header.values
    if clean(title) != 'PATH - Average Hourly Entry and Exit Counts by Station':
        raise RuntimeError(f'Unexpected title: "{title}"')

    m = fullmatch(based_on_regex, based_on_msg)
    if not m:
        raise RuntimeError(f'Unrecognized "based on" message: "{based_on_msg}"')
    parsed_year = int(m['year'])
    if year != parsed_year:
        raise RuntimeError(f"Parsed year {parsed_year} != {year}")
    parsed_month = m['month']
    month_name = to_dt('%d-%02d' % (year, month)).strftime('%B')
    if parsed_month != month_name:
        # Report the expected month *name*, which is what was actually compared.
        raise RuntimeError(f"Parsed month {parsed_month} != {month_name}")

    if not fullmatch(cross_honor_regex, cross_msg):
        raise RuntimeError(f'Unexpected cross-honor message: "{cross_msg}"')

    # The first two rows form a two-line column header; join them into one label each.
    hrs = hrs.dropna(axis=1, how='all')
    headers = (hrs.iloc[0].fillna('') + ' ' + hrs.iloc[1]).str.strip()
    hrs = hrs.copy().iloc[2:]
    hrs = hrs.dropna(axis=1, how='all')
    headers = headers.dropna()
    hrs.columns = headers
    hrs['Year'] = year
    hrs['Month'] = month
    hrs['Station'] = station
    hrs = hrs[['Year', 'Month', 'Station'] + headers.tolist()]
    # Columns after Year/Month/Station and the first parsed column hold counts;
    # coerce them to int (string values may contain thousands separators).
    for k in hrs.columns[4:]:
        col = hrs[k]
        dt = col.dtype
        if dt == dtype('O'):
            hrs[k] = hrs[k].str.replace(',', '').astype(int)
        elif dt == dtype('float64'):
            hrs[k] = hrs[k].astype(int)
        elif dt == dtype('int64'):
            pass
        else:
            raise RuntimeError(f'Unexpected dtype, col {k}: {dt}')

    total_rows = hrs.Hour == 'Total'
    totals = hrs[total_rows]
    # Copy so the `Hour` assignment below writes to an independent frame rather
    # than a slice of the original (avoids SettingWithCopyWarning).
    hrs = hrs[~total_rows].copy()

    # "H:00:00 AM/PM" -> integer hour of day
    Hour = hrs.Hour.str.extract(r'(?P<hour>\d\d?):00:00 (?P<am>AM|PM)').astype({ 'hour': int }).apply(to_hour, axis=1)
    hrs['Hour'] = Hour

    system_wide_rows = hrs.Station == 'System-wide'
    system_wide = hrs[system_wide_rows]
    hrs = hrs[~system_wide_rows]

    return hrs, totals, system_wide

def read_month_hours_stations(year, month, n_jobs=None, concat=True):
    """Parse every entry in `stations` for one month.

    Args:
      year, month: forwarded to `parse_station_month_hours_tables`.
      n_jobs: if truthy, stations are parsed via `Parallel(n_jobs=n_jobs)`;
        otherwise they are parsed sequentially in this process.
      concat: if True, return a 3-element list with the per-station results
        concatenated position-wise. If False, return the un-concatenated
        per-station results, one 3-tuple per station, in `stations` order.
    """
    if n_jobs:
        parallel = Parallel(n_jobs=n_jobs)
        fn = delayed(parse_station_month_hours_tables)
        rvs = parallel(
            fn(year, month, station)
            for station in stations
        )
    else:
        rvs = [
            parse_station_month_hours_tables(year, month, station)
            for station in stations
        ]
    if concat:
        # zip(*rvs) regroups per-station tuples into one group per result position.
        return [ pd.concat(dfs) for dfs in zip(*rvs) ]
    else:
        # Return the raw per-station results; `dfs` only exists inside the
        # comprehension above and is not defined here.
        return rvs

def read_year_hours_stations(year, last_month=None, n_jobs=None, concat=True):
    """Parse months 1 through `last_month` (12 when None) of `year`.

    Each month is read with `read_month_hours_stations` using its default
    `concat`. With `concat=True` the monthly results are concatenated
    position-wise into one list; otherwise the list of monthly results is
    returned as-is.
    """
    final_month = 12 if last_month is None else last_month
    monthly = []
    for month in range(1, final_month + 1):
        monthly.append(read_month_hours_stations(year, month, n_jobs=n_jobs))
    if not concat:
        return monthly
    return [ pd.concat(frames) for frames in zip(*monthly) ]

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [9]:
%%time
suffixes = ['', '-total', '-system']
base = f'data/{year}-hourly'
paths = [ f'{base}{suffix}.pqt' for suffix in suffixes ]
extant = list(filter(exists, paths))

if extant and overwrite:
    print(f'Overwriting {", ".join(extant)}')

hrs, totals, system_wide = None, None, None
if extant != paths or overwrite:
    dfs = read_year_hours_stations(year, last_month=last_month, n_jobs=n_jobs)
    for df, path in zip(dfs, paths):
        print(f'Writing {path}')
        df.to_parquet(path, index=False)
    hrs, totals, system_wide = dfs

hrs

FileNotFoundError: [Errno 2] No such file or directory: 'data/2015-PATH-hourly-Ridership-Report.pdf'

In [10]:
# Display the 'Total' rows (one of the three frames produced for `year`).
totals

In [11]:
# Display the hourly rows for the 'System-wide' entry.
system_wide