# NJSP Fatal Crash Plots
This notebook is run as part of the daily update GitHub Action:
```bash
njsp -cc update_plots
```
It updates plots based on the latest NJSP fatal crash data (in this Git repo).

It also computes an estimate for the number of traffic deaths in the remainder of the current year (which helps make sense of otherwise-incomplete data about the current year).

In [1]:
from utz import *
import json
from utz import plots
import plotly.graph_objects as go
import plotly.express as px
from nj_crashes.paths import PLOTS_DIR, PROJECTED_TOTALS_PATH, ROOT_DIR, RUNDATE_PATH, DB_URI
from njsp.paths import PROJECTED_CSV
from njsp.ytd import Ytd, normalized_ytd_days

[Papermill](https://papermill.readthedocs.io/) parameters:

In [2]:
# Notebook parameters (overridable via Papermill); the values here are the defaults.
# `return_img` is forwarded unchanged to `plots.save` through the `save` partial below.
return_img = None
# Which export formats to write for the {year, type, county} table later in the notebook.
ytc_fmts = 'csv'  # comma-delimited subset of {csv, pqt, db}

Common settings for plots created later:

In [3]:
# Shared wrapper around `plots.save`: pre-binds a white background and the
# notebook-level `return_img` parameter; each call site adds its own per-plot options.
save = partial(plots.save, bg='white', return_img=return_img)

## Load most recent NJSP fatal crash data
This table is produced by the `njsp -cc update_pqts` step that precedes this in [the daily GitHub Action](.github/workflows/daily.yml):

In [4]:
# Load the "crashes" table from the database at `DB_URI`; the output below shows
# one row per crash, with a `dt` timestamp and fatality counts.
crashes = read_sql_table("crashes", DB_URI)
crashes

Unnamed: 0,ACCID,CCODE,CNAME,MCODE,MNAME,HIGHWAY,LOCATION,FATALITIES,INJURIES,STREET,dt,FATAL_D,FATAL_P,FATAL_T,FATAL_B
0,1703,01,Atlantic,0102,Atlantic City,446,State/Interstate Authority 446 S MP 1,1.0,1.0,,2008-01-01 00:35:00,,,,
1,1681,09,Hudson,0910,Union City,,Bergenline Ave S MP 0 at 6th St,1.0,,Bergenline Ave,2008-01-01 04:11:00,,,,
2,1659,04,Camden,0415,Gloucester Twsp,42,State Highway 42 N MP 8.2,1.0,1.0,,2008-01-01 06:46:00,,,,
3,1661,20,Union,2004,Elizabeth City,624,County 624 W MP 2.2 at Ikea Dr,1.0,1.0,,2008-01-01 12:29:00,,,,
4,1811,07,Essex,0716,Nutley Town,648,County 648 E MP .87 at Franklin Ave,1.0,,,2008-01-01 18:53:00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9033,12987,13,Monmouth,1319,Howell Twsp,34,State Highway 34,1.0,,,2024-02-02 07:47:00,1.0,0.0,0.0,0.0
9034,12982,09,Hudson,0906,Jersey City,,East Linden Ave,1.0,,East Linden Ave,2024-02-03 01:06:00,0.0,1.0,0.0,0.0
9035,12984,21,Warren,2108,Hackettstown Town,604,County 604 MP 1.2,1.0,0.0,,2024-02-05 06:04:00,0.0,0.0,0.0,1.0
9036,12985,14,Morris,1423,Morris Plains Boro,202,State Highway 202,1.0,,,2024-02-06 09:48:00,1.0,0.0,0.0,0.0


Load info about when the NJSP data was most recently updated:

In [5]:
from njsp import Rundate
# `Rundate()` takes no arguments here; presumably it reads the timestamp of the
# latest NJSP data refresh from the repo — TODO confirm against `njsp.Rundate`.
rundate = Rundate()
print(f'Most recent NJSP run date: {rundate}')
# NOTE(review): per the output below, `cur_month_dt` is the first day of the run
# date's month (i.e. the boundary after the last complete month), not a month-end date.
print(f'Most recent month end: {rundate.cur_month_dt}')
print(f'Current year start: {rundate.cur_year_dt}')
print(f'Next year start: {rundate.nxt_year_dt}')

Most recent NJSP run date: 2024-02-10 10:00:03
Most recent month end: 2024-02-01 00:00:00
Current year start: 2024-01-01 00:00:00
Next year start: 2025-01-01 00:00:00


## YTD Calculations
Create series that cumulatively sum year-to-date deaths (as of each day in the dataset history, going back to January 1, 2008).

### Plot YTD counts, for each year ≥2008

In [6]:
# Distinct calendar years present in the crash records; its length sizes the
# per-year color palette requested from `get_colors` below.
years = crashes.dt.dt.year.unique()

#### Color utilities

In [7]:
from utz.colors import RGB, color_interp, colors_lengthen, swatches
from nj_crashes.colors import get_colors, gridcolor, px_colors

# Request a palette with one entry per year in the data, then unpack the pieces
# reused by later plots: two named accent colors and the per-year color list.
colors = get_colors(len(years))
black, red, year_colors = colors.black, colors.red, colors.year_colors
colors

{'black': '#000004', 'red': '#ba3853', 'year_colors': ['#fcffa4', '#f9e56a', '#f7ca36', '#f9ab17', '#f78e0d', '#ef721f', '#e15b31', '#d04643', '#ba3853', '#a22b60', '#882268', '#6f196c', '#55106b', '#3b0c5d', '#200c46', '#0f0626', '#000004']}

In [8]:
cur_year = rundate.year
# Labels "Jan 1" … "Dec 1": the month abbreviation comes from `%b`, while the
# trailing "1" is a literal in the format string. Used as x-axis ticks in the YTD plot.
month_starts = [
    to_dt(f'{cur_year}-{m}').strftime('%b 1')
    for m in range(1, 13)
]
month_starts

['Jan 1',
 'Feb 1',
 'Mar 1',
 'Apr 1',
 'May 1',
 'Jun 1',
 'Jul 1',
 'Aug 1',
 'Sep 1',
 'Oct 1',
 'Nov 1',
 'Dec 1']

In [9]:
# Year-to-date helper from `njsp.ytd`; its `ytds` attribute is the frame plotted in the next cell.
ytd = Ytd()

In [10]:
# Plot cumulative year-to-date deaths, one line per year, and write it out as "ytd-deaths".
# The x-axis uses the "Text" column of `ytd.ytds`; presumably that holds day labels
# such as "Jan 1", which is what the month-start tick values rely on — verify against `Ytd`.
save(
    px.line(
        ytd.ytds,
        x='Text', y='YTD Deaths', color='Year',
        color_discrete_sequence=year_colors,
    ),
    # Only the first day of each month gets an x-axis tick/label.
    xaxis=dict(
        tickmode='array',
        tickvals=month_starts,
        ticktext=month_starts,
    ),
    legend=dict(traceorder='reversed',),
    title='YTD Traffic Deaths',
    name='ytd-deaths',
    hoverx='x',
    # The white background is already bound by the `save` partial, so it is not repeated here.
    ygrid='#ccc',
    xgrid='#ccc',
    w=850,
    h=800,
);

![](../www/public/plots/ytd-deaths.png)

## Plot deaths by {year, victim type}

### Group by year

In [11]:
# Datetime accessor over the crash timestamps; later cells use `dt.year` / `dt.month`
# as groupby keys and row filters.
# NOTE(review): the bare name `dt` may shadow something pulled in by `from utz import *` — verify.
dt = crashes.dt.dt
# Total deaths per calendar year, as integers; the series name becomes a column/legend label.
fatalities_per_year = crashes.FATALITIES.groupby(dt.year).sum().astype(int).rename('NJSP records')

### Group by month

In [12]:
# "YYYY-MM" label for every crash, index-aligned with `crashes`; used below as the
# key for monthly grouping. The vectorized `.dt.strftime` accessor replaces a
# per-row Python lambda and yields the same strings.
ym = crashes.dt.dt.strftime('%Y-%m').rename('ym')
ym

0       2008-01
1       2008-01
2       2008-01
3       2008-01
4       2008-01
         ...   
9033    2024-02
9034    2024-02
9035    2024-02
9036    2024-02
9037    2024-02
Name: ym, Length: 9038, dtype: object

In [13]:
# Start of the run date's month; crashes on or after it belong to a still-incomplete month.
cur_month = rundate.cur_month_dt
# Deaths per "YYYY-MM", restricted to crashes strictly before `cur_month`.
# `ym` covers every crash; the groupby aligns it to the filtered rows by index.
fatalities_per_month = crashes[crashes.dt < cur_month].FATALITIES.groupby(ym).sum()
fatalities_per_month

ym
2008-01    59.0
2008-02    40.0
2008-03    33.0
2008-04    50.0
2008-05    46.0
           ... 
2023-09    52.0
2023-10    65.0
2023-11    63.0
2023-12    62.0
2024-01    46.0
Name: FATALITIES, Length: 193, dtype: float64

### Rolling avg

In [14]:
# Trailing 12-month mean of monthly deaths; the first 11 months are NaN because
# the window is not yet full (see output).
rolling = fatalities_per_month.rolling(12).mean()
rolling

ym
2008-01          NaN
2008-02          NaN
2008-03          NaN
2008-04          NaN
2008-05          NaN
             ...    
2023-09    49.500000
2023-10    50.166667
2023-11    50.666667
2023-12    51.666667
2024-01    51.500000
Name: FATALITIES, Length: 193, dtype: float64

In [15]:
# Deaths per (year, month): place year, month and the per-crash fatality count
# side by side, then sum within each (year, month) pair.
mos = (
    sxs(
        dt.year.rename('year'),
        dt.month.rename('month'),
        crashes.FATALITIES,
    )
    .groupby(['year', 'month']).sum()
)
mos

Unnamed: 0_level_0,Unnamed: 1_level_0,FATALITIES
year,month,Unnamed: 2_level_1
2008,1,59.0
2008,2,40.0
2008,3,33.0
2008,4,50.0
2008,5,46.0
...,...,...
2023,10,65.0
2023,11,63.0
2023,12,62.0
2024,1,46.0


In [16]:
# Flatten the (year, month) totals and order them month-first, so each calendar
# month's values across years sit together for the "by month" plots.
pivoted = mos.reset_index().sort_values(['month', 'year'])
# Row mask: True where the month's first day falls strictly before the current
# month's start, i.e. the month is complete.
month_is_complete = pivoted.apply(
    lambda row: to_dt('%d-%02d' % (row.year, row.month)).tz_localize(cur_month.tz) < cur_month,
    axis=1,
)
pivoted = pivoted[month_is_complete]
pivoted

Unnamed: 0,year,month,FATALITIES
0,2008,1,59.0
12,2009,1,57.0
24,2010,1,37.0
36,2011,1,36.0
48,2012,1,52.0
...,...,...,...
143,2019,12,51.0
155,2020,12,47.0
167,2021,12,61.0
179,2022,12,50.0


In [17]:
# Deaths per (year, month) as a Series with a two-level index; unlike
# `fatalities_per_month` this includes the current, partial month (see output).
# NOTE(review): not referenced by any later cell shown in this notebook; `mos`
# above holds the same grouping as a DataFrame.
by_month = crashes.FATALITIES.groupby([dt.year, dt.month]).sum()
by_month

dt    dt
2008  1     59.0
      2     40.0
      3     33.0
      4     50.0
      5     46.0
            ... 
2023  10    65.0
      11    63.0
      12    62.0
2024  1     46.0
      2      8.0
Name: FATALITIES, Length: 194, dtype: float64

### Break out victim "types"

Check victim "type" subtotals vs. total:

In [18]:
# Per-crash sum of the four victim-type columns (FATAL_D/T/P/B), treating missing values as 0.
fatal_totals = sxs(*[crashes[f'FATAL_{t}'].fillna(0) for t in 'DTPB']).sum(axis=1)
# Yearly gap between recorded FATALITIES and the per-type sum. Per the output,
# the gap equals the whole yearly total before 2020 and is 0 from 2020 on.
sxs(crashes.dt, (crashes.FATALITIES - fatal_totals).rename('diff')).groupby(dt.year)['diff'].sum()

dt
2008    590.0
2009    584.0
2010    556.0
2011    627.0
2012    589.0
2013    542.0
2014    556.0
2015    562.0
2016    602.0
2017    624.0
2018    563.0
2019    558.0
2020      0.0
2021      0.0
2022      0.0
2023      0.0
2024      0.0
Name: diff, dtype: float64

Cross-reference with annual totals, populate "unknown" subtotal:

In [19]:
# Crash-record column name -> victim-type name used throughout the rest of the notebook.
base_type_cols_map = {
    'FATAL_D': 'driver',
    'FATAL_T': 'pedestrian',
    'FATAL_P': 'passenger',
    'FATAL_B': 'cyclist',
}
# Victim-type names in the map's insertion order: driver, pedestrian, passenger, cyclist.
base_type_cols = list(base_type_cols_map.values())

In [20]:
from njsp.paths import ANNUAL_SUMMARIES_YT_CSV

# Annual per-type totals from the annual-summaries CSV, cast to int and indexed by year.
year_stats = read_csv(ANNUAL_SUMMARIES_YT_CSV).astype(int).set_index('year')
year_stats

Unnamed: 0_level_0,driver,passenger,cyclist,pedestrian,crashes
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008,320,112,20,138,555
2009,315,98,14,157,550
2010,303,99,13,141,530
2011,362,105,17,143,586
2012,309,103,14,163,553
2013,304,92,14,132,508
2014,295,80,11,170,523
2015,276,96,17,173,522
2016,330,89,17,166,570
2017,339,85,17,183,591


In [21]:
# Grand total of the projections CSV: drop the `crashes` column, then sum the
# remaining columns over all counties.
# A later cell stores this value as the current year's `projected_total` and
# subtracts the deaths recorded so far to get the still-projected remainder, so
# it is reported here as a full-year figure rather than a rest-of-year one.
projected_total = read_csv(PROJECTED_CSV, index_col='county').drop(columns='crashes').sum().sum()
print(f'{projected_total} projected total deaths in {cur_year}')

601 projected deaths in the rest of 2024


In [22]:
# Per-year deaths by victim type, summed from the crash records (missing per-type values count as 0).
year_types = (
    sxs(
        crashes.dt,
        crashes.rename(columns=base_type_cols_map)[base_type_cols].fillna(0)
    )
    .groupby(dt.year.rename('year'))
    .sum(numeric_only=True)
    .astype(int)
)
# Patch in the annual-summary values (`year_stats`) for 2008–2019, i.e. the half-open
# range [2008, 2020): the subtotal check above shows the crash records carry no
# per-type counts for those years.
year_types.loc[range(2008, 2020)] = year_stats.loc[range(2008, 2020), base_type_cols]

# `projected_total` starts as the recorded yearly total; only the current year is
# overwritten with the projection, so `projected` (total minus recorded) is 0 for past years.
year_types['projected_total'] = fatalities_per_year
year_types.loc[cur_year, 'projected_total'] = projected_total
year_types['projected'] = year_types.projected_total - fatalities_per_year
year_types

Unnamed: 0_level_0,driver,pedestrian,passenger,cyclist,projected_total,projected
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,320,138,112,20,590,0
2009,315,157,98,14,584,0
2010,303,141,99,13,556,0
2011,362,143,105,17,627,0
2012,309,163,103,14,589,0
2013,304,132,92,14,542,0
2014,295,170,80,11,556,0
2015,276,173,96,17,562,0
2016,330,166,89,17,602,0
2017,339,183,85,17,624,0


## Update {year,type,county} stats

In [23]:
from njsp.paths import ANNUAL_SUMMARIES_YTC_CSV

Load {year,type,county} subtotals from annual summary PDFs (see [NJSP summary PDFs.ipynb](data/njsp/annual-summaries/NJSP%20summary%20PDFs.ipynb)):

In [24]:
# {year, county} table of per-type deaths and crash counts from the annual-summary CSV,
# indexed by (year, county) with all value columns cast to int.
ytc0 = read_csv(ANNUAL_SUMMARIES_YTC_CSV).set_index(['year', 'county']).astype(int)
ytc0

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,Atlantic,17,8,0,6,30
2008,Bergen,10,5,1,7,22
2008,Burlington,23,6,4,12,45
2008,Camden,25,4,0,15,42
2008,Cape May,8,3,0,0,11
...,...,...,...,...,...,...
2023,Salem,8,2,0,2,11
2023,Somerset,14,4,0,6,22
2023,Sussex,6,2,0,1,6
2023,Union,13,6,2,15,34


Generate a similar dataframe from crash records:

In [25]:
# Same shape as `ytc0`, but derived from individual crash records for years >= 2020.
ytc1 = (
    crashes
    # `crashes=1` per row, so the grouped sum below counts crashes.
    .assign(year=dt.year, crashes=1)
    [dt.year >= 2020]
    # Align column names with `ytc0`: CNAME -> county, FATAL_* -> victim-type names.
    .rename(columns=dict(
        CNAME='county',
        **base_type_cols_map
    ))
    # Keep only the key columns plus `ytc0`'s columns, in `ytc0`'s order.
    [['year', 'county'] + ytc0.columns.tolist()]
    .groupby(['year', 'county'])
    .sum(numeric_only=True)
    .astype(int)
)
ytc1

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020,Atlantic,26,5,0,9,38
2020,Bergen,14,9,0,20,38
2020,Burlington,26,4,3,9,40
2020,Camden,19,5,1,13,36
2020,Cape May,5,0,1,3,8
...,...,...,...,...,...,...
2024,Morris,1,0,0,0,1
2024,Ocean,2,0,0,4,6
2024,Passaic,0,0,0,1,1
2024,Union,0,0,0,1,1


Verify they match (for years ≥2020, where they overlap):

In [26]:
# Join the summary-derived (`ytc0`) and record-derived (`ytc1`) tables on their
# shared (year, county) index; the default inner join keeps only keys present in both.
m = ytc0.merge(ytc1, left_index=True, right_index=True)
# Both sides have identical column names, so pandas suffixes them "_x" (ytc0) and
# "_y" (ytc1). Split e.g. "driver_x" into ("x", "driver"), dropping the whole
# two-character suffix so the reported column labels carry no trailing underscore.
m.columns = pd.MultiIndex.from_tuples([ (c[-1], c[:-2]) for c in m.columns ])
# Cell-wise mismatch mask between the two sources.
diffs = m['x'] != m['y']
has_diffs = diffs.any().any()
if has_diffs:
    # Narrow both sides to just the rows and columns containing at least one mismatch,
    # then show them side by side under "x"/"y" headers.
    diff_rows, diff_cols = diffs.any(axis=1), diffs.any()
    xd = m['x'].loc[diff_rows, diff_cols]
    xd.columns = pd.MultiIndex.from_tuples([ ('x', c) for c in xd.columns ])
    yd = m['y'].loc[diff_rows, diff_cols]
    yd.columns = pd.MultiIndex.from_tuples([ ('y', c) for c in yd.columns ])
    diffs = sxs(xd, yd)
else:
    # Nothing to report; the cell displays no output.
    diffs = None
diffs

In [27]:
# Fail the notebook run if the two sources disagree; the mismatch table is the assertion message.
assert not has_diffs, diffs

Combine:

In [28]:
# Year values present in each table's index (first MultiIndex level).
y0 = ytc0.index.levels[0]
y1 = ytc1.index.levels[0]
# For years covered by both, keep only the crash-record rows (`ytc1`): drop those
# years from `ytc0`, then append `ytc1`.
ytc = pd.concat([
    ytc0.drop(index=y0[y0.isin(y1)], level=0),
    ytc1,
])
ytc

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,Atlantic,17,8,0,6,30
2008,Bergen,10,5,1,7,22
2008,Burlington,23,6,4,12,45
2008,Camden,25,4,0,15,42
2008,Cape May,8,3,0,0,11
...,...,...,...,...,...,...
2024,Morris,1,0,0,0,1
2024,Ocean,2,0,0,4,6
2024,Passaic,0,0,0,1,1
2024,Union,0,0,0,1,1


Export:

In [29]:
from njsp.paths import YTC_CSV, YTC_PQT, YTC_DB, YTC_DB_URI

In [30]:
# Write the combined {year, county} table in each requested format.
# `ytc_fmts` is a comma-delimited string; surrounding whitespace is ignored and
# empty entries (e.g. from a trailing comma or an empty string) are skipped, so
# values like "csv, pqt" work as expected.
for ytc_fmt in ytc_fmts.split(','):
    ytc_fmt = ytc_fmt.strip()
    if not ytc_fmt:
        continue
    if ytc_fmt == 'csv':
        err(f'Writing {YTC_CSV}')
        ytc.to_csv(YTC_CSV)
    elif ytc_fmt == 'pqt':
        err(f'Writing {YTC_PQT}')
        ytc.to_parquet(YTC_PQT)
    elif ytc_fmt == 'db':
        err(f'Writing {YTC_DB}')
        # Replace any existing "ytc" table rather than appending to it.
        ytc.to_sql('ytc', YTC_DB_URI, if_exists='replace')
    else:
        raise ValueError(f'Unrecognized ytc_fmt {ytc_fmt}')

Writing /home/runner/work/nj-crashes/nj-crashes/data/njsp/year-type-county.csv


## Fatalities per year (by type)

In [31]:
# Stretch the base palette to 7 colors, print the hex codes and render swatches for inspection.
# NOTE(review): this rebinds `ytc`, which until this cell held the {year, county}
# DataFrame exported above; re-running the export cell after this one would no
# longer see that DataFrame. The monthly-by-type plot later builds the same
# palette under the name `type_colors`.
ytc = colors_lengthen(px_colors, 7)
print(' '.join(ytc))
swatches(ytc)

#000004 #320c56 #781c6d #ba3853 #ed6925 #f9b621 #fcffa4


<span style="font-family: monospace">#000004 <span style="color: #000004">██████</span></span> <span style="font-family: monospace">#320c56 <span style="color: #320c56">██████</span></span> <span style="font-family: monospace">#781c6d <span style="color: #781c6d">██████</span></span> <span style="font-family: monospace">#ba3853 <span style="color: #ba3853">██████</span></span> <span style="font-family: monospace">#ed6925 <span style="color: #ed6925">██████</span></span> <span style="font-family: monospace">#f9b621 <span style="color: #f9b621">██████</span></span> <span style="font-family: monospace">#fcffa4 <span style="color: #fcffa4">██████</span></span>

In [32]:
# Columns stacked in the per-year bar chart, in stacking order.
type_cols = ['cyclist', 'driver', 'pedestrian', 'passenger', 'projected']
# Display labels: capitalize the first letter and pluralize every victim type;
# "projected" is capitalized but not pluralized.
type_cols_map = {
    col: col[:1].upper() + col[1:] + ('' if col == 'projected' else 's')
    for col in type_cols
}
type_cols_map

{'cyclist': 'Cyclists',
 'driver': 'Drivers',
 'pedestrian': 'Pedestrians',
 'passenger': 'Passengers',
 'projected': 'Projected'}

In [33]:
# Stacked bars of deaths per year by victim type, plus the projected remainder
# for the current year. Zeros are replaced with NaN before plotting.
fig = (
    px.bar(
        year_types[type_cols].rename(columns=type_cols_map).replace(0, nan),
        barmode='stack',
        # Skip the palette's first entry; the remaining colors are assigned to the stacked series.
        color_discrete_sequence=ytc[1:],
        text_auto='%d',
    )
    .update_yaxes(
        gridcolor=gridcolor,
        dtick=50,
    )
)
# Label each bar stack with its yearly total, nudged just above the bar.
# The loop variable is deliberately not named `projected_total`, so the
# notebook-level `projected_total` computed earlier is not overwritten.
for year, year_total in year_types.projected_total.to_dict().items():
    fig.add_annotation(
        x=year, y=year_total,
        text=year_total,
        showarrow=False,
        yshift=10,
    )
save(
    fig,
    title='NJ Traffic Deaths per Year (by victim type)',
    name='fatalities_per_year_by_type',
    hoverx=True,
    w=1600,
    h=800,
);

![](../www/public/plots/fatalities_per_year_by_type.png)

## Fatalities per month (by victim type)

In [34]:
# Per-crash victim-type counts under their friendly names, with missing values as 0.
type_deaths = crashes.rename(columns=base_type_cols_map)[base_type_cols].fillna(0)
# Only crashes from 2020 onward are kept.
since_2020 = dt.year >= 2020

# Sum the per-type counts within each (year, month), then flatten the index to columns.
month_types = (
    sxs(crashes.dt, type_deaths)
    [since_2020]
    .groupby([dt.year.rename('year'), dt.month.rename('month')])
    [base_type_cols]
    .sum()
    .astype(int)
    .reset_index()
)

# Re-index by a "YYYY-MM" string built from the year/month columns, which are then dropped.
month_types['dt'] = month_types[['year', 'month']].apply(
    lambda r: '%04d-%02d' % (r['year'], r['month']),
    axis=1,
)
month_types = month_types.set_index('dt').drop(columns=['year', 'month'])
month_types

Unnamed: 0_level_0,driver,pedestrian,passenger,cyclist
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01,21,18,8,2
2020-02,15,17,2,0
2020-03,11,16,8,1
2020-04,17,8,2,1
2020-05,28,13,9,2
2020-06,30,8,9,0
2020-07,30,19,8,5
2020-08,31,8,14,1
2020-09,31,21,5,0
2020-10,33,17,5,3


In [35]:
# 7-color palette for the per-type lines.
type_colors = colors_lengthen(px_colors, 7)

# One line per victim type; months at or after the current month's start are excluded.
fig = px.line(
    month_types.rename(columns=type_cols_map).loc[to_dt(month_types.index) < cur_month],
    # Blank out the legend title.
    labels={'variable': '',},
    color_discrete_sequence=type_colors,
)
fig.update_traces(line=dict(width=3))
save(
    fig,
    title='NJ Traffic Deaths per Month (by victim type)',
    name='fatalities_per_month_by_type',
    hoverx=True,
    xgrid=gridcolor,
    # Tick labels like "Jan '20".
    xaxis=dict(
        tickformat="%b '%y",
    ),
    w=800,
);

![](../www/public/plots/fatalities_per_month_by_type.png)

## Fatalities per month

In [36]:
# Monthly death totals as bars.
monthly_bars = go.Bar(
    x=fatalities_per_month.index,
    y=fatalities_per_month.values,
    name='Fatalities',
    marker_color=red,
)
# Trailing 12-month mean (rounded to one decimal) overlaid as a thick line.
rolling_line = go.Scatter(
    x=rolling.index,
    y=rolling.apply(partial(round, ndigits=1)),
    name='12mo avg',
    line=dict(width=4, color=black),
)
# Traces are added in the same order as before: bars first, then the line.
fig = go.Figure(data=[monthly_bars, rolling_line])
fig.update_yaxes(gridcolor=gridcolor)
save(
    fig,
    title='NJ Traffic Deaths per Month',
    name='fatalities_per_month',
    hoverx=True,
    w=1200, h=600,
);

![](../www/public/plots/fatalities_per_month.png)

In [37]:
# Abbreviated month names ("Jan" … "Dec") for axis tick labels; the year in the
# date string only serves to build a parseable date for each month.
month_names = [ to_dt('2022-%02d' % i).strftime('%b') for i in range(1, 13) ]
print(' '.join(month_names))

Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec


In [38]:
# Grouped bars: one cluster per calendar month, one bar per year within each cluster.
fig = px.bar(
    x = pivoted.month,
    y = pivoted.FATALITIES,
    # Year passed as strings for the color grouping.
    color = pivoted.year.astype(str),
    color_discrete_sequence=year_colors,
    # Blank out the legend and axis titles.
    labels=dict(color='', x='', y='',),
    barmode='group',
).update_yaxes(
    gridcolor=gridcolor,
)
save(
    fig,
    title='NJ Traffic Deaths, by Month',
    name='fatalities_by_month_bars',
    legend=dict(traceorder='reversed'),
    # Show month abbreviations instead of the numeric month values 1–12.
    xaxis=dict(
        tickmode = 'array',
        tickvals = list(range(1, 13)),
        ticktext = month_names,
    ),
    hoverx=True,
    w=1200, h=700,
);

![](../www/public/plots/fatalities_by_month_bars.png)

In [39]:
# Same data as the grouped bars above, drawn as one line per year across the calendar months.
# NOTE(review): `color` is the numeric year here but `year.astype(str)` in the bar
# version, and the two titles differ only by a comma — confirm whether that is intended.
fig = px.line(
    x = pivoted.month,
    y = pivoted.FATALITIES,
    color = pivoted.year,
    color_discrete_sequence=year_colors,
    # Blank out the legend and axis titles.
    labels={ 'color': '', 'x': '', 'y': '' },
).update_yaxes(
    gridcolor=gridcolor,
)
save(
    fig,
    title='NJ Traffic Deaths by Month',
    name='fatalities_by_month_lines',
    # Show month abbreviations instead of the numeric month values 1–12.
    xaxis=dict(
        tickmode = 'array',
        tickvals = list(range(1, 13)),
        ticktext = month_names,
    ),
    legend=dict(traceorder='reversed'),
    hoverx=True,
    w=1200, h=700,
);

![](../www/public/plots/fatalities_by_month_lines.png)