In [4]:
from utz import *

from tabula import read_pdf

from numpy import dtype

In [None]:
year = 2022
last_month = None

In [61]:
stations = [
    'Christopher St.',
    '9th Street',
    '14th Street',
    '23rd Street',
    '33rd Street',
    'WTC',
    'Newark',
    'Harrison',
    'Journal Square',
    'Grove Street',
    'Exchange Place',
    'Newport',
    'Hoboken',
    'System-wide',
]
title = 1
contents = 2
ytd = 3
num_stations = len(stations)
section_pages = num_stations + 1  # title page
def month_page_range(month):
    start = 4 + month * section_pages
    end = start + num_stations
    return start, end

[ month_page_range(month) for month in range(1, 13) ]

[(19, 33),
 (34, 48),
 (49, 63),
 (64, 78),
 (79, 93),
 (94, 108),
 (109, 123),
 (124, 138),
 (139, 153),
 (154, 168),
 (169, 183),
 (184, 198)]

In [62]:
station_offsets = { station: idx for idx, station in enumerate(stations) }
station_offsets

{'Christopher St.': 0,
 '9th Street': 1,
 '14th Street': 2,
 '23rd Street': 3,
 '33rd Street': 4,
 'WTC': 5,
 'Newark': 6,
 'Harrison': 7,
 'Journal Square': 8,
 'Grove Street': 9,
 'Exchange Place': 10,
 'Newport': 11,
 'Hoboken': 12,
 'System-wide': 13}

In [5]:
template_path = 'templates/2022-PATH-hourly-Ridership-Report.tabula-template.json'
with open(template_path, 'r') as f:
    rects = json.load(f)
rects

[{'page': 19,
  'extraction_method': 'guess',
  'x1': 0.495,
  'x2': 780.615,
  'y1': 126.225,
  'y2': 558.855,
  'width': 780.12,
  'height': 432.63},
 {'page': 19,
  'extraction_method': 'guess',
  'x1': 257.895,
  'x2': 535.095,
  'y1': 14.355,
  'y2': 77.715,
  'width': 277.2,
  'height': 63.36},
 {'page': 19,
  'extraction_method': 'guess',
  'x1': 83.655,
  'x2': 205.425,
  'y1': 98.505,
  'y2': 121.275,
  'width': 121.77,
  'height': 22.77}]

In [150]:
based_on_regex = '\(Based on (?P<month>\w+) (?P<year>\d{4}) Turnstile Count\)'

def read_station_month_hours_tables(year, month, station):
    station_offset = station_offsets[station]
    pdf = f'data/{year}-PATH-hourly-Ridership-Report.pdf'
    start, _ = month_page_range(month)
    pg = start + station_offset
    month_name = to_dt('%d-%02d' % (year, month)).strftime('%B')
    print(f'Reading {pdf}, pg. {pg}: {month_name}, {station}')
    tables = [
        read_pdf(
            pdf,
            pages=pg,
            area=[ rect[k] for k in [ 'y1', 'x1', 'y2', 'x2', ] ],
            pandas_options={'header': None},
        )
        for rect in rects
    ]
    return tables

def parse_station_month_hours_tables(year, month, station):
    [body], [header], [actual_station] = read_station_month_hours_tables(year, month, station)
    [[actual_station]] = actual_station.values
    if actual_station != station:
        raise RuntimeError(f"Parsed station {actual_station} != {station}")

    [[title], [based_on_msg], [cross_msg]] = header.values
    if title != 'PATH - Average Hourly Entry and Exit Counts by Station':
        raise RuntimeError(f'Unexpected title: {title}')

    m = fullmatch(based_on_regex, based_on_msg)
    if not m:
        raise RuntimeError(f"Unrecognized 'based on' message: {based_on_msg}")
    parsed_year = int(m['year'])
    if year != parsed_year:
        raise RuntimeError(f"Parsed year {parsed_year} != {year}")
    parsed_month = m['month']
    month_name = to_dt('%d-%02d' % (year, month)).strftime('%B')
    if parsed_month != month_name:
        raise RuntimeError(f"Parsed month {parsed_month} != {month}")

    if cross_msg != '(Cross-honor Entry Count not Included)':
        raise RuntimeError(f'Unexpected cross-honor message: {cross_msg}')

    body = body.dropna(axis=1, how='all')
    headers = (body.iloc[0].fillna('') + ' ' + body.iloc[1]).str.strip()
    body = body.copy().iloc[2:]
    body = body.dropna(axis=1, how='all')
    headers = headers.dropna()
    body.columns = headers
    body['Year'] = year
    body['Month'] = month
    body['Station'] = station
    body = body[['Year', 'Month', 'Station'] + headers.tolist()]
    for k in body.columns[4:]:
        col = body[k]
        dt = col.dtype
        if dt == dtype('O'):
            body[k] = body[k].str.replace(',', '').astype(int)
        elif dt == dtype('float64'):
            body[k] = body[k].astype(int)
        elif dt == dtype('int64'):
            pass
        else:
            raise RuntimeError(f'Unexpected dtype, col {k}: {dt}')
    
    return body

def read_month_hours_stations(year, month, n_jobs=None, concat=True):
    if n_jobs:
        parallel = Parallel(n_jobs=n_jobs)
        fn = delayed(parse_station_month_hours_tables)
        dfs = parallel(
            fn(year, month, station)
            for station in stations
        )
    else:
        dfs = [
            parse_station_month_hours_tables(year, month, station)
            for station in stations
        ]
    if concat:
        return pd.concat(dfs)
    else:
        return dfs

def read_year_hours_stations(year, last_month=None, n_jobs=None, concat=True):
    months = [
        read_month_hours_stations(year, month, n_jobs=n_jobs)
        for month in range(1, 13 if last_month is None else (last_month + 1))
    ]
    if concat:
        return pd.concat(months)
    else:
        return months

In [151]:
%%time
hrs22 = read_year_hours_stations(2022, last_month=9, n_jobs=4, concat=False)
hrs22

CPU times: user 142 ms, sys: 60.3 ms, total: 202 ms
Wall time: 1min 27s


[    Year  Month          Station         Hour  Avg Weekday Entry  \
 2   2022      1  Christopher St.  12:00:00 AM                 34   
 3   2022      1  Christopher St.   1:00:00 AM                 13   
 4   2022      1  Christopher St.   2:00:00 AM                 14   
 5   2022      1  Christopher St.   3:00:00 AM                  6   
 6   2022      1  Christopher St.   4:00:00 AM                  7   
 ..   ...    ...              ...          ...                ...   
 22  2022      1      System-wide   8:00:00 PM               2653   
 23  2022      1      System-wide   9:00:00 PM               1961   
 24  2022      1      System-wide  10:00:00 PM               1718   
 25  2022      1      System-wide  11:00:00 PM               1060   
 26  2022      1      System-wide        Total              84185   
 
     Avg Saturday Entry  Avg Sunday Entry  Avg Weekday Exit  Avg Saturday Exit  \
 2                   85                83                11                 18   
 3    

In [152]:
all22 = pd.concat(hrs22)
all22

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,1,Christopher St.,12:00:00 AM,34,85,83,11,18,18,33,4
3,2022,1,Christopher St.,1:00:00 AM,13,59,63,6,12,11,21,1
4,2022,1,Christopher St.,2:00:00 AM,14,56,41,4,6,7,16,4
5,2022,1,Christopher St.,3:00:00 AM,6,27,20,8,8,6,9,5
6,2022,1,Christopher St.,4:00:00 AM,7,22,19,10,7,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
21,2022,9,System-wide,7:00:00 PM,8850,6867,4543,11099,8559,5914,3981,5523
22,2022,9,System-wide,8:00:00 PM,5950,5949,3806,7900,7978,5189,3080,4537
23,2022,9,System-wide,9:00:00 PM,4611,4959,2911,6452,6968,4499,2300,3807
24,2022,9,System-wide,10:00:00 PM,3466,4330,2209,4771,5888,3401,1772,2871


In [153]:
all22.to_parquet('data/2022-hourly.pqt', index=False)

In [154]:
all222 = read_parquet('data/2022-hourly.pqt')
all222

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
0,2022,1,Christopher St.,12:00:00 AM,34,85,83,11,18,18,33,4
1,2022,1,Christopher St.,1:00:00 AM,13,59,63,6,12,11,21,1
2,2022,1,Christopher St.,2:00:00 AM,14,56,41,4,6,7,16,4
3,2022,1,Christopher St.,3:00:00 AM,6,27,20,8,8,6,9,5
4,2022,1,Christopher St.,4:00:00 AM,7,22,19,10,7,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
3103,2022,9,System-wide,7:00:00 PM,8850,6867,4543,11099,8559,5914,3981,5523
3104,2022,9,System-wide,8:00:00 PM,5950,5949,3806,7900,7978,5189,3080,4537
3105,2022,9,System-wide,9:00:00 PM,4611,4959,2911,6452,6968,4499,2300,3807
3106,2022,9,System-wide,10:00:00 PM,3466,4330,2209,4771,5888,3401,1772,2871


In [155]:
all222.dtypes

Year                    int64
Month                   int64
Station                object
Hour                   object
Avg Weekday Entry       int64
Avg Saturday Entry      int64
Avg Sunday Entry        int64
Avg Weekday Exit        int64
Avg Saturday Exit       int64
Avg Sunday Exit         int64
Avg Holiday Entries     int64
Avg Holiday Exits       int64
dtype: object

In [156]:
all22.dtypes

Year                    int64
Month                   int64
Station                object
Hour                   object
Avg Weekday Entry       int64
Avg Saturday Entry      int64
Avg Sunday Entry        int64
Avg Weekday Exit        int64
Avg Saturday Exit       int64
Avg Sunday Exit         int64
Avg Holiday Entries     int64
Avg Holiday Exits       int64
dtype: object

In [86]:
%%time
jan22 = read_month_hours_stations(2022, 1, n_jobs=4)
jan22

CPU times: user 20.6 ms, sys: 5.74 ms, total: 26.4 ms
Wall time: 9.74 s


Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,1,Christopher St.,12:00:00 AM,34,85,83,11,18,18,33,4
3,2022,1,Christopher St.,1:00:00 AM,13,59,63,6,12,11,21,1
4,2022,1,Christopher St.,2:00:00 AM,14,56,41,4,6,7,16,4
5,2022,1,Christopher St.,3:00:00 AM,6,27,20,8,8,6,9,5
6,2022,1,Christopher St.,4:00:00 AM,7,22,19,10,7,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
22,2022,1,System-wide,8:00:00 PM,2653,2045,1717,3614,2747,2395,1844,2582
23,2022,1,System-wide,9:00:00 PM,1961,1685,1385,2582,2193,1881,1315,1930
24,2022,1,System-wide,10:00:00 PM,1718,1593,1095,2294,2105,1624,1020,1537
25,2022,1,System-wide,11:00:00 PM,1060,1272,679,1475,1876,1182,637,1138


In [120]:
%%time
feb22 = read_month_hours_stations(2022, 2, n_jobs=8)
feb22

Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 49: March, Christopher St.
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 51: March, 14th Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 54: March, WTC
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 55: March, Newark
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 58: March, Grove Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 60: March, Newport
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 61: March, Hoboken
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 50: March, 9th Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 52: March, 23rd Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 53: March, 33rd Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 56: March, Harrison
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 57: March, Journal Square
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 59: March, Ex

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,2,Christopher St.,12:00:00 AM,44,138,165,9,32,49,67,15
3,2022,2,Christopher St.,1:00:00 AM,20,88,115,8,18,22,36,8
4,2022,2,Christopher St.,2:00:00 AM,20,67,73,5,10,9,30,14
5,2022,2,Christopher St.,3:00:00 AM,7,41,50,7,9,9,21,3
6,2022,2,Christopher St.,4:00:00 AM,7,28,38,5,4,5,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...
22,2022,2,System-wide,8:00:00 PM,3800,3492,2089,4860,4619,2826,2853,3686
23,2022,2,System-wide,9:00:00 PM,2956,2761,1767,3731,3592,2269,2077,2961
24,2022,2,System-wide,10:00:00 PM,2253,2448,1834,2971,3246,2269,1560,2244
25,2022,2,System-wide,11:00:00 PM,1412,2218,1104,1972,2850,1873,1255,2035


In [119]:
%%time
mar22 = read_month_hours_stations(2022, 3, n_jobs=2)
mar22

CPU times: user 21.6 ms, sys: 28.2 ms, total: 49.8 ms
Wall time: 13.8 s


Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,3,Christopher St.,12:00:00 AM,49,181,166,10,56,52,0.0,0.0
3,2022,3,Christopher St.,1:00:00 AM,23,111,94,6,20,19,0.0,0.0
4,2022,3,Christopher St.,2:00:00 AM,21,94,67,7,11,11,0.0,0.0
5,2022,3,Christopher St.,3:00:00 AM,7,54,55,7,9,9,0.0,0.0
6,2022,3,Christopher St.,4:00:00 AM,6,34,50,5,4,5,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
21,2022,3,System-wide,7:00:00 PM,7086,4670,3304,8478,5490,4005,0.0,0.0
22,2022,3,System-wide,8:00:00 PM,4563,3662,2579,5851,4998,3485,0.0,0.0
23,2022,3,System-wide,9:00:00 PM,3463,3097,2161,4346,4076,2799,0.0,0.0
24,2022,3,System-wide,10:00:00 PM,2647,2760,1572,3565,3689,2326,0.0,0.0


In [121]:
pd.concat([ feb22, mar22 ])

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,2,Christopher St.,12:00:00 AM,44,138,165,9,32,49,67,15
3,2022,2,Christopher St.,1:00:00 AM,20,88,115,8,18,22,36,8
4,2022,2,Christopher St.,2:00:00 AM,20,67,73,5,10,9,30,14
5,2022,2,Christopher St.,3:00:00 AM,7,41,50,7,9,9,21,3
6,2022,2,Christopher St.,4:00:00 AM,7,28,38,5,4,5,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...
21,2022,3,System-wide,7:00:00 PM,7086,4670,3304,8478,5490,4005,0.0,0.0
22,2022,3,System-wide,8:00:00 PM,4563,3662,2579,5851,4998,3485,0.0,0.0
23,2022,3,System-wide,9:00:00 PM,3463,3097,2161,4346,4076,2799,0.0,0.0
24,2022,3,System-wide,10:00:00 PM,2647,2760,1572,3565,3689,2326,0.0,0.0


In [81]:
year, month = 2022, 1
jan22s = [
    parse_station_month_hours_tables(year, month, station)
    for station in stations
]

Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 19: January, Christopher St.
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 20: January, 9th Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 21: January, 14th Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 22: January, 23rd Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 23: January, 33rd Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 24: January, WTC
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 25: January, Newark
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 26: January, Harrison
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 27: January, Journal Square
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 28: January, Grove Street
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 29: January, Exchange Place
Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 30: January, Newport
Reading data/2022-PATH-hourly-Ridershi

In [82]:
pd.concat(jan22s)

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,1,Christopher St.,12:00:00 AM,34,85,83,11,18,18,33,4
3,2022,1,Christopher St.,1:00:00 AM,13,59,63,6,12,11,21,1
4,2022,1,Christopher St.,2:00:00 AM,14,56,41,4,6,7,16,4
5,2022,1,Christopher St.,3:00:00 AM,6,27,20,8,8,6,9,5
6,2022,1,Christopher St.,4:00:00 AM,7,22,19,10,7,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
22,2022,1,System-wide,8:00:00 PM,2653,2045,1717,3614,2747,2395,1844,2582
23,2022,1,System-wide,9:00:00 PM,1961,1685,1385,2582,2193,1881,1315,1930
24,2022,1,System-wide,10:00:00 PM,1718,1593,1095,2294,2105,1624,1020,1537
25,2022,1,System-wide,11:00:00 PM,1060,1272,679,1475,1876,1182,637,1138


In [110]:
[body], [header], [actual_station] = read_station_month_hours_tables(2022, 3, 'Harrison')
body

Reading data/2022-PATH-hourly-Ridership-Report.pdf, pg. 56: March, Harrison


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,Avg Weekday,Avg Saturday,Avg Sunday,,Avg Weekday,Avg Saturday,Avg Sunday,Avg Holiday,,Avg Holiday,
1,Hour,Entry,Entry,Entry,,Exit,Exit,Exit,Entries,,Exits,
2,12:00:00 AM,2,6,5,,68,142,105,,0.0,,0.0
3,1:00:00 AM,1,2,3,,26,62,51,,0.0,,0.0
4,2:00:00 AM,1,2,2,,16,74,46,,0.0,,0.0
5,3:00:00 AM,10,4,2,,7,22,28,,0.0,,0.0
6,4:00:00 AM,34,13,5,,16,30,21,,0.0,,0.0
7,5:00:00 AM,221,34,11,,21,22,20,,0.0,,0.0
8,6:00:00 AM,491,62,38,,57,21,14,,0.0,,0.0
9,7:00:00 AM,961,80,55,,84,24,16,,0.0,,0.0


In [111]:
#body = body.dropna(axis=1, how='all')
headers = (body.iloc[0].fillna('') + ' ' + body.iloc[1]).str.strip()
body = body.copy().iloc[2:]
body

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
2,12:00:00 AM,2,6,5,,68,142,105,,0.0,,0.0
3,1:00:00 AM,1,2,3,,26,62,51,,0.0,,0.0
4,2:00:00 AM,1,2,2,,16,74,46,,0.0,,0.0
5,3:00:00 AM,10,4,2,,7,22,28,,0.0,,0.0
6,4:00:00 AM,34,13,5,,16,30,21,,0.0,,0.0
7,5:00:00 AM,221,34,11,,21,22,20,,0.0,,0.0
8,6:00:00 AM,491,62,38,,57,21,14,,0.0,,0.0
9,7:00:00 AM,961,80,55,,84,24,16,,0.0,,0.0
10,8:00:00 AM,1031,106,69,,80,35,13,,0.0,,0.0
11,9:00:00 AM,454,114,93,,66,33,33,,0.0,,0.0


In [115]:
headers = headers.dropna()
headers

0                    Hour
1       Avg Weekday Entry
2      Avg Saturday Entry
3        Avg Sunday Entry
5        Avg Weekday Exit
6       Avg Saturday Exit
7         Avg Sunday Exit
8     Avg Holiday Entries
10      Avg Holiday Exits
dtype: object

In [112]:
body = body.dropna(axis=1, how='all')

In [116]:
body.columns = headers
body['Year'] = year
body['Month'] = month
body['Station'] = station
body = body[['Year', 'Month', 'Station'] + headers.tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  body['Year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  body['Month'] = month


In [117]:
body

Unnamed: 0,Year,Month,Station,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,2022,1,Christopher St.,12:00:00 AM,2,6,5,68,142,105,0.0,0.0
3,2022,1,Christopher St.,1:00:00 AM,1,2,3,26,62,51,0.0,0.0
4,2022,1,Christopher St.,2:00:00 AM,1,2,2,16,74,46,0.0,0.0
5,2022,1,Christopher St.,3:00:00 AM,10,4,2,7,22,28,0.0,0.0
6,2022,1,Christopher St.,4:00:00 AM,34,13,5,16,30,21,0.0,0.0
7,2022,1,Christopher St.,5:00:00 AM,221,34,11,21,22,20,0.0,0.0
8,2022,1,Christopher St.,6:00:00 AM,491,62,38,57,21,14,0.0,0.0
9,2022,1,Christopher St.,7:00:00 AM,961,80,55,84,24,16,0.0,0.0
10,2022,1,Christopher St.,8:00:00 AM,1031,106,69,80,35,13,0.0,0.0
11,2022,1,Christopher St.,9:00:00 AM,454,114,93,66,33,33,0.0,0.0


In [77]:
body.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3,5,6,7,8,10
0,,Avg Weekday,Avg Saturday,Avg Sunday,Avg Weekday,Avg Saturday,Avg Sunday,Avg Holiday,Avg Holiday
1,Hour,Entry,Entry,Entry,Exit,Exit,Exit,Entries,Exits
2,12:00:00 AM,2,3,1,48,71,47,1,41
3,1:00:00 AM,1,3,2,21,74,26,3,12
4,2:00:00 AM,1,2,1,14,53,21,2,8
5,3:00:00 AM,8,3,1,7,26,8,1,14
6,4:00:00 AM,34,6,3,11,20,14,9,6
7,5:00:00 AM,181,23,13,17,10,5,106,9
8,6:00:00 AM,341,42,35,50,18,14,153,25
9,7:00:00 AM,529,52,34,76,24,20,189,59


In [26]:
[body], [header], [station] = tables
[[station]] = station.values
station

'Christopher St.'

In [36]:
[[title], [based_on_msg], [cross_msg]] = header.values
title, based_on_msg, cross_msg

('PATH - Average Hourly Entry and Exit Counts by Station',
 '(Based on January 2022 Turnstile Count)',
 '(Cross-honor Entry Count not Included)')

In [41]:
headers = (body.iloc[0].fillna('') + ' ' + body.iloc[1]).str.strip()
b = body.copy().iloc[2:]
b.columns = headers
b

Unnamed: 0,Hour,Avg Weekday Entry,Avg Saturday Entry,Avg Sunday Entry,Avg Weekday Exit,Avg Saturday Exit,Avg Sunday Exit,Avg Holiday Entries,Avg Holiday Exits
2,12:00:00 AM,34,85,83,11,18,18,33,4
3,1:00:00 AM,13,59,63,6,12,11,21,1
4,2:00:00 AM,14,56,41,4,6,7,16,4
5,3:00:00 AM,6,27,20,8,8,6,9,5
6,4:00:00 AM,7,22,19,10,7,4,4,4
7,5:00:00 AM,2,8,6,15,6,8,0,13
8,6:00:00 AM,11,5,4,132,12,7,9,64
9,7:00:00 AM,35,4,4,193,18,17,6,58
10,8:00:00 AM,65,9,7,299,31,23,11,79
11,9:00:00 AM,36,13,12,185,49,48,19,90


In [27]:
body

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,Avg Weekday,Avg Saturday,Avg Sunday,Avg Weekday,Avg Saturday,Avg Sunday,Avg Holiday,Avg Holiday
1,Hour,Entry,Entry,Entry,Exit,Exit,Exit,Entries,Exits
2,12:00:00 AM,34,85,83,11,18,18,33,4
3,1:00:00 AM,13,59,63,6,12,11,21,1
4,2:00:00 AM,14,56,41,4,6,7,16,4
5,3:00:00 AM,6,27,20,8,8,6,9,5
6,4:00:00 AM,7,22,19,10,7,4,4,4
7,5:00:00 AM,2,8,6,15,6,8,0,13
8,6:00:00 AM,11,5,4,132,12,7,9,64
9,7:00:00 AM,35,4,4,193,18,17,6,58
