In [1]:
from utz import *
from tabula import read_pdf
from nj_crashes.paths import DATA_DIR

In [13]:
stats_dir = f'{DATA_DIR}/stats'

In [56]:
def load_rects(tpl_name):
    """Return the parsed contents of `{stats_dir}/{tpl_name}.json`.

    `tpl_name` is the file's base name, without the `.json` extension.
    Whatever `json.load` produces for that file is returned unchanged.
    """
    with open(f'{stats_dir}/{tpl_name}.json', 'r') as tpl_file:
        return json.load(tpl_file)

In [25]:
def load_pdf_tbl(rect, pdf_path):
    """Extract the one table found inside `rect` of the PDF at `pdf_path`.

    `rect` must provide `x1`, `y1`, `x2` and `y2` entries; any other keys it
    carries are ignored here. All pages are scanned, and exactly one table
    must come back: the single-element unpack raises `ValueError` otherwise.
    """
    # The area is passed in y1, x1, y2, x2 order.
    area = [rect['y1'], rect['x1'], rect['y2'], rect['x2']]
    tables = read_pdf(pdf_path, area=area, pages='all')
    [only_table] = tables
    return only_table

In [23]:
# The template is expected to hold exactly one rect (the single-element unpack fails otherwise).
# `load_ptccr` uses this one rect for every year's PDF.
[ptccr_rect] = load_rects('ptccr_23.tabula-template')
def load_ptccr(year):
    """Load the table from the `ptccr_<yy>.pdf` file for `year`.

    The table is extracted with the module-level `ptccr_rect`, gets a constant
    `year` column, and is returned indexed by its `County` column.

    Raises whatever `load_pdf_tbl` raises (e.g. when the PDF is missing or does
    not yield exactly one table), and `KeyError` if there is no `County` column.
    """
    # Format the 2-digit year inside the f-string: applying `%` to the whole
    # path would mis-parse any literal '%' contained in `stats_dir`.
    pdf_path = f'{stats_dir}/ptccr_{year % 100:02d}.pdf'
    # `load_pdf_tbl` already returns the single table, so it must not be
    # unpacked a second time (that would iterate over its column labels).
    tbl = load_pdf_tbl(ptccr_rect, pdf_path)
    return tbl.assign(year=year).set_index('County')

In [4]:
# Year of `now()`; used below as the exclusive upper bound of the years to load.
# NOTE(review): this makes the set of PDFs loaded depend on the date the notebook is run.
cur_year = now().year
cur_year

2024

In [5]:
start_year = 2008
# Years in [start_year, cur_year) that are skipped when loading the per-county PDFs.
missing_years = [ 2008, 2009, 2017, 2018 ]
# One per-county table per loaded year, stacked vertically.
summaries = pd.concat([
    load_ptccr(year)
    for year in range(start_year, cur_year)
    if year not in missing_years
])
summaries.columns = summaries.columns.str.lower()
summaries = summaries.rename(columns={
    'pedalcyclist': 'cyclist',
})
# Keep `county` as the only index level, with `year` as a regular column: later
# cells call `summaries.reset_index()` and `summaries.loc['Total']`. (A former
# `reset_index().set_index(['year', 'county'])` statement here discarded its
# result and has been removed.)
summaries.index.name = 'county'
summaries

Unnamed: 0_level_0,driver,passenger,cyclist,pedestrian,fatalities,crashes,year
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Atlantic,10,7,1,6,24,22,2010
Bergen,17,5,0,15,37,36,2010
Burlington,22,7,0,5,34,33,2010
Camden,17,11,3,10,41,37,2010
Cape May,4,1,0,0,5,4,2010
...,...,...,...,...,...,...,...
Somerset,14,4,0,6,24,22,2023
Sussex,6,2,0,1,9,6,2023
Union,13,6,2,15,36,34,2023
Warren,8,1,0,3,12,12,2023


In [6]:
# NOTE(review): DATA_DIR is already imported in the notebook's first cell; this re-import is redundant.
from nj_crashes.paths import DATA_DIR
# Per-record crash data; later cells read its `dt`, `CNAME`, `FATALITIES` and `FATAL_*` columns.
sp = read_parquet(f'{DATA_DIR}/crashes.pqt')
sp

Unnamed: 0_level_0,CCODE,CNAME,MCODE,MNAME,HIGHWAY,LOCATION,FATALITIES,INJURIES,STREET,FATAL_D,FATAL_P,FATAL_T,FATAL_B,dt
ACCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1703,01,Atlantic,0102,Atlantic City,446,State/Interstate Authority 446 S MP 1,1.0,1.0,,,,,,2008-01-01 00:35:00
1681,09,Hudson,0910,Union City,,Bergenline Ave S MP 0 at 6th St,1.0,,Bergenline Ave,,,,,2008-01-01 04:11:00
1659,04,Camden,0415,Gloucester Twsp,42,State Highway 42 N MP 8.2,1.0,1.0,,,,,,2008-01-01 06:46:00
1661,20,Union,2004,Elizabeth City,624,County 624 W MP 2.2 at Ikea Dr,1.0,1.0,,,,,,2008-01-01 12:29:00
1811,07,Essex,0716,Nutley Town,648,County 648 E MP .87 at Franklin Ave,1.0,,,,,,,2008-01-01 18:53:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12929,12,Middlesex,1205,Edison Twsp,1,State Highway 1,1.0,,,0.0,0.0,1.0,0.0,2024-01-08 02:20:00
12934,15,Ocean,1514,Lakewood Twsp,623,County 623,1.0,,,0.0,0.0,1.0,0.0,2024-01-08 17:34:00
12937,13,Monmouth,1316,Freehold Twsp,9,State Highway 9,1.0,,,0.0,0.0,1.0,0.0,2024-01-09 17:35:00
12940,02,Bergen,0219,Fort Lee Boro,,Bruce Reynolds Blvd,1.0,,Bruce Reynolds Blvd,0.0,0.0,1.0,0.0,2024-01-10 05:09:00


In [7]:
# Roll the per-record crash data up to one row per (year, county).
cols = [ 'FATALITIES', 'STREET', 'FATAL_D', 'FATAL_P', 'FATAL_T', 'FATAL_B', ]
y = sp['dt'].dt.year.rename('year')
c = sp['CNAME'].rename('county')
gb = sp.groupby([y, c])
agg = (
    gb[cols]
    .sum(numeric_only=True)  # non-numeric columns among `cols` are left out of the sum
    .astype(int)
    .assign(crashes=gb.size())  # number of records in each (year, county) group
    .rename(columns=dict(
        FATALITIES='fatalities',
        FATAL_D='driver',
        FATAL_P='passenger',
        FATAL_T='pedestrian',
        FATAL_B='cyclist',
    ))
)
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,fatalities,driver,passenger,pedestrian,cyclist,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008,Atlantic,31,0,0,0,0,30
2008,Bergen,23,0,0,0,0,22
2008,Burlington,45,0,0,0,0,45
2008,Camden,44,0,0,0,0,42
2008,Cape May,11,0,0,0,0,11
...,...,...,...,...,...,...,...
2024,Mercer,1,1,0,0,0,1
2024,Middlesex,2,0,0,2,0,2
2024,Monmouth,2,0,1,1,0,2
2024,Ocean,2,0,0,2,0,2


In [8]:
# Alphabetically ordered list of the aggregate's column names.
cols = sorted(agg.columns)
cols

['crashes', 'cyclist', 'driver', 'fatalities', 'passenger', 'pedestrian']

In [9]:
# Line up the per-record aggregates (`sp`) with the published summaries (`stats`),
# one row per (year, county) that is present on both sides.
m = (
    agg
    .reset_index()
    .loc[lambda df: df.year >= 2020]  # types missing before 2020, in the per-record NJSP data
    .merge(summaries.reset_index(), how='left', on=['year', 'county'], suffixes=['_sp', '_stats'])
    .dropna()
    .set_index(['year', 'county'])
    .astype(int)
)
# 'crashes_sp' -> ('sp', 'crashes'): the source becomes the outer column level.
m.columns = pd.MultiIndex.from_tuples([ tuple(col.split('_')[::-1]) for col in m.columns ])
m = m[m.columns.sort_values()]
m

Unnamed: 0_level_0,Unnamed: 1_level_0,sp,sp,sp,sp,sp,sp,stats,stats,stats,stats,stats,stats
Unnamed: 0_level_1,Unnamed: 1_level_1,crashes,cyclist,driver,fatalities,passenger,pedestrian,crashes,cyclist,driver,fatalities,passenger,pedestrian
year,county,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2020,Atlantic,38,0,26,40,5,9,38,0,26,40,5,9
2020,Bergen,38,0,14,43,9,20,38,0,14,43,9,20
2020,Burlington,40,3,26,42,4,9,40,3,26,42,4,9
2020,Camden,36,1,19,38,5,13,36,1,19,38,5,13
2020,Cape May,8,1,5,9,0,3,8,1,5,9,0,3
2020,Cumberland,22,0,14,24,5,5,22,0,14,24,5,5
2020,Essex,39,3,16,45,12,14,39,3,16,45,12,14
2020,Gloucester,33,2,21,35,5,7,33,2,21,35,5,7
2020,Hudson,24,1,11,24,1,11,24,1,11,24,1,11
2020,Hunterdon,12,0,7,12,2,3,12,0,7,12,2,3


In [10]:
# Sanity check: for every (year, county) row of `m`, each per-record aggregate (`sp`)
# must equal the corresponding published summary value (`stats`).
assert (m['sp'] == m['stats']).all().all()

In [57]:
# Extraction rects from the `swfcs2_23` template; `load_swfcs` consumes them
# positionally as (types, ages, nums), so their order in the file matters.
swfcs_rects = load_rects('swfcs2_23.tabula-template')
swfcs_rects

[{'page': 1,
  'extraction_method': 'guess',
  'x1': 134.64000000000001,
  'x2': 478.89,
  'y1': 323.2125,
  'y2': 435.6675,
  'width': 344.25,
  'height': 112.455},
 {'page': 1,
  'extraction_method': 'guess',
  'x1': 63.495000000000005,
  'x2': 550.8,
  'y1': 471.6225,
  'y2': 681.9975000000001,
  'width': 487.305,
  'height': 210.375},
 {'page': 1,
  'extraction_method': 'guess',
  'x1': 142.29,
  'x2': 296.82,
  'y1': 105.1875,
  'y2': 196.2225,
  'width': 154.53,
  'height': 91.035}]

In [115]:
def load_swfcs(year):
    """Load the three tables of the `swfcs2_<yy>.pdf` file for `year`.

    One table is extracted per entry of the module-level `swfcs_rects`, which
    are consumed positionally as (types, ages, nums).

    Returns a list `[nums, types, ages]`:
    - `nums`: indexed by int `year`, with an int `crashes` column, parsed from
      the "<yyyy> - <count>" strings of the table's `FATAL CRASHES` column.
    - `types`: indexed by lower-cased `type` (with 'pedalcyclist' renamed to
      'cyclist' and 'total' to 'fatalities'); its columns are cast to int and
      named `year`.
    - `ages`: the extracted table, returned without further processing.
    """
    # Format the 2-digit year inside the f-string: applying `%` to the whole
    # path would mis-parse any literal '%' contained in `stats_dir`.
    pdf_path = f'{stats_dir}/swfcs2_{year % 100:02d}.pdf'
    [ types, ages, nums, ] = [ load_pdf_tbl(r, pdf_path) for r in swfcs_rects ]
    # The label column has no header in the extracted table.
    types = types.rename(columns={'Unnamed: 0': 'type'})
    types['type'] = types['type'].str.lower()
    types = types.set_index('type')
    types = types.rename(index={'pedalcyclist': 'cyclist', 'total': 'fatalities'})
    # Remaining column labels are cast to int so they can be looked up as ints.
    types.columns = types.columns.astype(int).rename('year')
    nums = nums['FATAL CRASHES'].str.extract(r'(?P<year>\d{4}) - (?P<crashes>\d+)').astype(int).set_index('year')
    return [ nums, types, ages ]

In [116]:
# Crash-count, victim-type and age tables from the 2019 `swfcs2` PDF; display the victim-type table.
[ num19, type19, age19 ] = load_swfcs(2019)
type19

year,2017,2018,2019
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
driver,339,276,289
passenger,86,95,81
cyclist,16,17,12
pedestrian,183,175,176
fatalities,624,563,558


In [117]:
# Crash-count, victim-type and age tables from the 2010 `swfcs2` PDF; display the victim-type table.
[ num10, type10, age10 ] = load_swfcs(2010)
type10

year,2008,2009,2010
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
driver,320,315,303
passenger,112,98,99
cyclist,20,14,13
pedestrian,138,157,141
fatalities,590,584,556


In [118]:
# Per-year rows (crash count next to the victim-type counts) taken from the
# 2010 and 2019 `swfcs2` tables, minus the 2010 and 2019 rows themselves.
missing_year_types = (
    pd.concat([
        sxs(nums, types.T)
        for nums, types in [ (num10, type10), (num19, type19) ]
    ])
    .drop(index=[2010, 2019])
    .sort_index()
)
missing_year_types

Unnamed: 0_level_0,crashes,driver,passenger,cyclist,pedestrian,fatalities
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,555,320,112,20,138,590
2009,550,315,98,14,157,584
2017,591,339,86,16,183,624
2018,524,276,95,17,175,563


In [131]:
# Select the 'Total' rows of the county-indexed summaries and re-index them by year.
year_stats = summaries.loc['Total'].set_index('year')
year_stats

Unnamed: 0_level_0,driver,passenger,cyclist,pedestrian,fatalities,crashes
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,303,99,13,141,556,530
2011,362,105,17,143,627,586
2012,309,103,14,163,589,553
2013,304,92,14,132,542,508
2014,295,80,11,170,556,523
2015,276,96,17,173,562,522
2016,330,89,17,166,602,570
2019,289,81,12,176,558,524
2020,304,86,18,179,587,550
2021,368,86,26,217,697,667


In [132]:
# Stack the rows recovered from the `swfcs2` PDFs with the per-year 'Total' rows, ordered by year.
year_types = pd.concat([ missing_year_types, year_stats ]).sort_index()
year_types

Unnamed: 0_level_0,crashes,driver,passenger,cyclist,pedestrian,fatalities
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,555,320,112,20,138,590
2009,550,315,98,14,157,584
2010,530,303,99,13,141,556
2011,586,362,105,17,143,627
2012,553,309,103,14,163,589
2013,508,304,92,14,132,542
2014,523,295,80,11,170,556
2015,522,276,96,17,173,562
2016,570,330,89,17,166,602
2017,591,339,86,16,183,624


In [133]:
# Persist the combined per-year table as CSV under DATA_DIR.
year_types.to_csv(f'{DATA_DIR}/year_types.csv')

In [124]:
import plotly.express as px

In [147]:
# Bar chart of yearly fatalities per victim type, in grayscale.
fig = px.bar(
    year_types[['driver', 'passenger', 'pedestrian', 'cyclist']],
    labels=dict(value='Fatalities', year='Year', variable='Victim Type'),
    color_discrete_map=dict(driver='#000', passenger='#333', pedestrian='#666', cyclist='#999'),
)
fig.update_layout(hovermode='x', plot_bgcolor='white')
fig.update_yaxes(gridcolor='#ccc')
fig.update_traces(hovertemplate=None)
# Label each year with its `fatalities` value, nudged 10px above that height.
for year, fatalities in year_types['fatalities'].to_dict().items():
    fig.add_annotation(x=year, y=fatalities, text=fatalities, showarrow=False, yshift=10)
fig

In [55]:
# NOTE(review): `sw19` is not defined in any visible cell; this looks like a leftover
# scratch cell that fails on a fresh kernel. Confirm, then delete or point it at the intended frame.
sw19.columns

Index(['2017', '2018', '2019'], dtype='object', name='year')

In [53]:
# NOTE(review): `sw19` is not defined in any visible cell, and the recorded output is a KeyError:
# the columns shown for `sw19` are strings ('2017', ...), so the integer labels are not found.
# Looks like a leftover scratch cell; confirm, then delete it.
sw19.transpose().loc[[ 2017, 2018 ]]

KeyError: "None of [Int64Index([2017, 2018], dtype='int64', name='year')] are in the [index]"