# NJSP Fatal Crash Plots
This notebook is run as part of the daily update GitHub Action:
```bash
njsp -cc update_plots
```
It updates plots based on the latest NJSP fatal crash data (in this Git repo).

It also computes an estimate for the number of traffic deaths in the remainder of the current year (which helps make sense of otherwise-incomplete data about the current year).

In [1]:
from utz import *
import json
from utz import plots
import plotly.graph_objects as go
import plotly.express as px
from nj_crashes.paths import PLOTS_DIR, PROJECTED_TOTALS_PATH, ROOT_DIR, RUNDATE_PATH, DB_URI
from nj_crashes.utils import normalized_ytd_days

[Papermill](https://papermill.readthedocs.io/) parameters:

In [2]:
# Papermill parameter; passed through to `plots.save` as its `return_img` keyword by the `save` partial below.
return_img = None

Common settings for plots created later:

In [3]:
save = partial(plots.save, bg='white', return_img=return_img)

## Load most recent NJSP fatal crash data
These tables are produced by the `njsp -cc update_pqts` step that precedes this in the daily GitHub Action.

In [4]:
# `totals`: annual summary rows, indexed by `year`; `crashes`: the full "crashes" table.
totals = read_sql_table("totals", DB_URI).set_index('year')
crashes = read_sql_table("crashes", DB_URI)
totals

Unnamed: 0_level_0,accidents,injuries,fatalities
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008,555,414,590
2009,550,352,584
2010,530,366,556
2011,586,517,627
2012,553,382,589
2013,508,393,542
2014,523,345,556
2015,522,374,562
2016,570,398,602
2017,591,368,624


In [5]:
crashes

Unnamed: 0,ACCID,CCODE,CNAME,MCODE,MNAME,HIGHWAY,LOCATION,FATALITIES,INJURIES,STREET,FATAL_D,FATAL_P,FATAL_T,FATAL_B,dt
0,1703,01,Atlantic,0102,Atlantic City,446,State/Interstate Authority 446 S MP 1,1.0,1.0,,,,,,2008-01-01 00:35:00
1,1681,09,Hudson,0910,Union City,,Bergenline Ave S MP 0 at 6th St,1.0,,Bergenline Ave,,,,,2008-01-01 04:11:00
2,1659,04,Camden,0415,Gloucester Twsp,42,State Highway 42 N MP 8.2,1.0,1.0,,,,,,2008-01-01 06:46:00
3,1661,20,Union,2004,Elizabeth City,624,County 624 W MP 2.2 at Ikea Dr,1.0,1.0,,,,,,2008-01-01 12:29:00
4,1811,07,Essex,0716,Nutley Town,648,County 648 E MP .87 at Franklin Ave,1.0,,,,,,,2008-01-01 18:53:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9022,12963,06,Cumberland,0610,Millville City,,East Broad St,1.0,,East Broad St,0.0,0.0,1.0,0.0,2024-01-26 06:49:00
9023,12964,04,Camden,0435,Waterford Twsp,,Old White Horse Pk,1.0,,Old White Horse Pk,0.0,0.0,1.0,0.0,2024-01-26 21:04:00
9024,12971,12,Middlesex,1215,North Brunswick Tws,,Remsen Ave,1.0,,Remsen Ave,0.0,0.0,1.0,0.0,2024-01-28 05:35:00
9025,12968,02,Bergen,0217,Fair Lawn Boro,507,County 507,1.0,0.0,,0.0,0.0,1.0,0.0,2024-01-28 19:32:00


Load info about when the NJSP data was most recently updated:

In [6]:
# Read the timestamp of the most recent NJSP data refresh
with open(RUNDATE_PATH, 'r') as f:
    cur_rundate = to_dt(json.load(f)['rundate'])

# Derived date values, reused throughout the rest of the notebook
rundate_ytd_days = normalized_ytd_days(cur_rundate)
rundate_str = cur_rundate.strftime('%Y-%m-%d')
cur_month = cur_rundate.strftime('%Y-%m')
tz = cur_rundate.tz
# First instant of the month containing `cur_rundate` (printed below as "month end": it is the end of the
# most recent *complete* month, and is used later as an exclusive upper bound when filtering months)
cur_month_dt = to_dt(cur_month).tz_localize(tz)
cur_year = cur_month_dt.year
# Jan 1 of the previous, current, and next year, localized like `cur_rundate`
prv_year_dt = to_dt(f'{cur_year - 1}').tz_localize(tz)
cur_year_dt = to_dt(f'{cur_year}').tz_localize(tz)
nxt_year_dt = to_dt(f'{cur_year + 1}').tz_localize(tz)
print(f'Most recent NJSP run date: {cur_rundate}')
print(f'Most recent month end: {cur_month_dt}')
print(f'Current year start: {cur_year_dt}')
print(f'Next year start: {nxt_year_dt}')

Most recent NJSP run date: 2024-01-31 10:00:02
Most recent month end: 2024-01-01 00:00:00
Current year start: 2024-01-01 00:00:00
Next year start: 2025-01-01 00:00:00


## YTD Calculations
Create series that cumulatively sum year-to-date deaths (as of each day in the dataset history, going back to January 1, 2008).

In [7]:
# Lookup table: day-of-year (1–365) → label such as "Jan 1", using 2022 as the reference calendar
ref_year_start = to_dt('2022')
day_numbers = list(range(1, 366))
all_days = pd.DataFrame(
    {
        'Text': [
            (ref_year_start + pd.Timedelta(days=day - 1)).strftime('%b %-d')
            for day in day_numbers
        ],
    },
    index=pd.Index(day_numbers, name='Days'),
)
all_days

Unnamed: 0_level_0,Text
Days,Unnamed: 1_level_1
1,Jan 1
2,Jan 2
3,Jan 3
4,Jan 4
5,Jan 5
...,...
361,Dec 27
362,Dec 28
363,Dec 29
364,Dec 30


In [8]:
def fill_all_days(df):
    """Expand one year's sparse YTD rows to one row per day listed in `all_days`.

    Parameters
    ----------
    df : DataFrame with `Year`, `Days` and `YTD Deaths` columns; all rows must belong to a single year.

    Returns
    -------
    DataFrame indexed by `Days`, with `YTD Deaths` (int; forward-filled across days that had no row,
    0 before the first row) and `Text` (the day label from `all_days`). For the year of `cur_rundate`,
    only days before `rundate_ytd_days` are kept.

    Raises
    ------
    ValueError
        If `df` does not contain exactly one distinct (non-null) `Year`.
    """
    # Right-join so that every day in `all_days` gets a row, even if `df` has none for it
    df = df.set_index('Days').merge(
        all_days,
        left_index=True,
        right_index=True,
        how='right',
    )
    years = df.Year.dropna().unique()
    # Exactly one year is required; this also covers an empty input, which would otherwise fail below with
    # an opaque unpacking error
    if len(years) != 1:
        raise ValueError(f"Years: {years}")
    [year] = years
    if year == cur_rundate.year:
        # Current year: drop the run date itself and all later days
        df = df[df.index < rundate_ytd_days]
    df = df.drop(columns='Year')
    df['YTD Deaths'] = df['YTD Deaths'].ffill().fillna(0).astype(int)
    return df

In [9]:
# One row per crash record: calendar year and normalized day-of-year, derived from the crash timestamp
ytds = crashes[['dt', 'FATALITIES']].copy()
ytds['Year'] = ytds.dt.dt.year
ytds['Days'] = ytds.dt.apply(normalized_ytd_days)
# Running fatality count within each year.
# NOTE(review): `cumsum` follows row order, so this presumably relies on `crashes` being sorted by `dt` — confirm.
ytds = (
    ytds
    .groupby('Year', group_keys=False)
    .apply(lambda df: (
        df.assign(**{
            'YTD Deaths': df.FATALITIES.cumsum().astype(int)
        })
    ))
)
# Collapse to one row per (Year, Days), keeping the highest running total reached on that day
ytds = (
    ytds[['Year', 'Days', 'YTD Deaths']]
    .groupby(['Year', 'Days'])
    .max()
    .reset_index()
)

# Fill in the days that have no crash rows, one year at a time
ytds = ytds.groupby('Year').apply(fill_all_days).reset_index()
ytds

Unnamed: 0,Year,Days,YTD Deaths,Text
0,2008,1,5,Jan 1
1,2008,2,6,Jan 2
2,2008,3,8,Jan 3
3,2008,4,10,Jan 4
4,2008,5,12,Jan 5
...,...,...,...,...
5865,2024,26,37,Jan 26
5866,2024,27,37,Jan 27
5867,2024,28,39,Jan 28
5868,2024,29,39,Jan 29


### Find a projected total from this time last year
Crashes sometimes reach the NJSP data after a variable delay; we'll benchmark the current YTD total against what NJSP was reporting on the same date a year prior.

This repo has snapshotted NJSP's data most days since April 2022, so we can always look back and see what things looked like 1 year ago.

In [10]:
prv_year = cur_rundate.year - 1
# Same month/day as the current run date, one year earlier. Kept as a 'YYYY-MM-DD' *string*: the commit
# search below compares it against rundate strings with `<`.
prv_rundate = f'{prv_year}-{cur_rundate.strftime("%m-%d")}'
print(f'Searching for projected totals ca. {prv_rundate}')

Searching for projected totals ca. 2023-01-31


In [11]:
RUNDATE_RELPATH = relpath(RUNDATE_PATH, ROOT_DIR)
PROJECTED_TOTALS_RELPATH = relpath(PROJECTED_TOTALS_PATH, ROOT_DIR)

In [12]:
def get_projs_rundate(commit):
    """Read the projected-totals and rundate JSON files as they existed at `commit`.

    Returns one flat dict: the commit's full SHA under 'sha', then the keys of the rundate JSON, then the
    keys of the projected-totals JSON (later keys win on collision).
    """
    def load_json(path):
        # Blob lookup in the commit's tree, parsed straight from its data stream
        return json.load(commit.tree[path].data_stream)

    projected_obj = load_json(PROJECTED_TOTALS_RELPATH)
    rundate_obj = load_json(RUNDATE_RELPATH)

    merged = {'sha': commit.hexsha}
    merged.update(rundate_obj)
    merged.update(projected_obj)
    return merged

Iterate backwards through Git commit history (newest commit first), looking for the oldest commit that's at least as far into last year as we currently are into the present year:

In [13]:
%%time
prv_prd = None
repo = Repo()
commits = repo.iter_commits()
shas = []
while True:
    try:
        commit = next(commits)
    except StopIteration:
        raise RuntimeError(f"Ran out of commits after {len(shas)}: {','.join(shas)}")
    shas.append(commit.hexsha[:7])
    prd = get_projs_rundate(commit)
    commit_rundate = prd["rundate"]
    if commit_rundate < prv_rundate:
        err(f'Found previous rundate {commit_rundate} < {prv_rundate}; breaking')
        break
    prv_prd = prd
prv_prd

CPU times: user 566 ms, sys: 58.5 ms, total: 625 ms
Wall time: 769 ms


Found previous rundate 2023-01-30 10:00:08 < 2023-01-31; breaking


{'sha': '99b623b3210e79f6698c1cd109221852b026d64e',
 'rundate': '2023-01-31 10:00:04',
 '2008': {'Drivers': 0,
  'Pedestrians': 0,
  'Passengers': 0,
  'Cyclists': 0,
  'Unknown': 558,
  'Missing': 32,
  'Total': 590,
  'Projected': 0,
  'Projected Total': 590},
 '2009': {'Drivers': 0,
  'Pedestrians': 0,
  'Passengers': 0,
  'Cyclists': 0,
  'Unknown': 546,
  'Missing': 38,
  'Total': 584,
  'Projected': 0,
  'Projected Total': 584},
 '2010': {'Drivers': 0,
  'Pedestrians': 0,
  'Passengers': 0,
  'Cyclists': 0,
  'Unknown': 533,
  'Missing': 23,
  'Total': 556,
  'Projected': 0,
  'Projected Total': 556},
 '2011': {'Drivers': 0,
  'Pedestrians': 0,
  'Passengers': 0,
  'Cyclists': 0,
  'Unknown': 594,
  'Missing': 33,
  'Total': 627,
  'Projected': 0,
  'Projected Total': 627},
 '2012': {'Drivers': 0,
  'Pedestrians': 0,
  'Passengers': 0,
  'Cyclists': 0,
  'Unknown': 559,
  'Missing': 30,
  'Total': 589,
  'Projected': 0,
  'Projected Total': 589},
 '2013': {'Drivers': 0,
  'Pedest

In [14]:
# Previous year's reported total, as recorded in the snapshot found above
prv_total = prv_prd[f'{prv_year}']['Total']
# NOTE(review): this rebinds `prv_rundate` from the 'YYYY-MM-DD' string built earlier to the snapshot's
# parsed timestamp; re-running the commit-search cell after this one would compare a string with a timestamp.
prv_rundate = to_dt(prv_prd['rundate'])
print(f'As of {prv_rundate}, NJSP was reporting {prv_total} YTD deaths')

As of 2023-01-31 10:00:04, NJSP was reporting 41 YTD deaths


The commit we found may not exactly match how far into the year we currently are; scale the reported previous-year YTD deaths to match our current YTD fraction:

In [15]:
# Fraction of the current year elapsed at `cur_rundate`
cur_year_frac = (cur_rundate - cur_year_dt) / (nxt_year_dt - cur_year_dt)
# Fraction of the previous year that had elapsed at the snapshot's rundate
prv_year_frac = (prv_rundate - prv_year_dt) / (cur_year_dt - prv_year_dt)
# "roy" = rest of year
cur_roy_frac = 1 - cur_year_frac
print('%.1f%% through the year, %.1f%% remaining' % (cur_year_frac * 100, cur_roy_frac * 100))

8.3% through the year, 91.7% remaining


In [16]:
# Linearly rescale last year's snapshot total from its own year-fraction to the current year-fraction
prv_ytd_adj = prv_total * cur_year_frac / prv_year_frac
print(f'First %.3f%% of {prv_year} ({prv_rundate}) had {prv_total} deaths' % (100 * prv_year_frac))
print(f'Scaling to compare to %.3f%% of {cur_year} ({cur_rundate}) → {prv_ytd_adj}' % (100 * cur_year_frac))

First 8.333% of 2023 (2023-01-31 10:00:04) had 41 deaths
Scaling to compare to 8.311% of 2024 (2024-01-31 10:00:02) → 40.88794702494111


### Estimate end-of-year death toll
To estimate deaths in the remainder of the current year, we interpolate between two models:
1. The current year's proportion of deaths (relative to the prior year) will be sustained for the rest of the year, vs.
2. The rest of the current year is expected to "revert to the mean" and match the corresponding portion of the prior year.

As a simple example, imagine Alice and Bob each flip a coin 10 times, Alice gets 4 heads, and Bob gets 6. Perhaps Bob's coin is 50% more likely to show heads than Alice's, or they just differed due to random chance and a small initial sample size. If they continue flipping, and Bob continues getting 50% more heads, the former becomes increasingly likely.

Below, we weight the two models ("YTD proportion is expected to continue" vs. "rest of year will match previous year") according to what fraction of the current year has elapsed:
- As we get further into a given year, differences between the current and prior year are more likely to reflect real, systemic differences, that we expect to continue.
- On the other hand, near the start of a year, normal variance can cause the current and previous years to have significantly different YTD totals, but we have a prior belief that the current year will track the previous year.

In [17]:
# Current year: YTD deaths as of the last day present in `ytds` (0 if the year has no rows yet)
cur_ytds = ytds[ytds.Year == cur_rundate.year]
cur_ytd_deaths = 0 if cur_ytds.empty else cur_ytds.iloc[-1]['YTD Deaths']

# Previous year: full-year total from crash records, split at the (scaled) YTD point
prv_ytd = ytds[ytds.Year == cur_rundate.year - 1]
prv_end_deaths = prv_ytd.iloc[-1]['YTD Deaths']
prv_ytd_deaths = prv_ytd_adj
prv_roy_deaths = prv_end_deaths - prv_ytd_deaths

if prv_ytd_deaths:
    prv_ytd_ratio = cur_ytd_deaths / prv_ytd_deaths
    prv_roy_ratio = prv_end_deaths / prv_ytd_deaths
else:
    # No deaths had been reported by this point last year, so the YTD ratio is undefined. Fall back to
    # "no change vs. last year" (i.e. rely entirely on the revert-to-prior-year model) instead of dividing by zero.
    prv_ytd_ratio = 1.0
    prv_roy_ratio = float('nan')

# Weighted blend of the two models: "YTD ratio continues" (weight = fraction of year elapsed) and
# "rest of year matches last year" (weight = fraction of year remaining)
projected_roy_deaths = int(prv_roy_deaths * (cur_year_frac * prv_ytd_ratio + cur_roy_frac))
projected_records_total = cur_ytd_deaths + projected_roy_deaths
pct_change = (prv_ytd_ratio - 1) * 100

print(f'Current YTD Deaths ({rundate_str}): {cur_ytd_deaths}')
print(f'Previous year YTD Deaths ({rundate_str}): {prv_ytd_deaths}')
print(f'Projected {cur_rundate.year} total: {projected_records_total}')
print(f'{pct_change:.1f}% change')

projected_records_total, cur_ytd_deaths, prv_ytd_deaths, prv_roy_ratio

Current YTD Deaths (2024-01-31): 40
Previous year YTD Deaths (2024-01-31): 40.88794702494111
Projected 2024 total: 619
-2.2% change


(619, 40, 40.88794702494111, 15.187849847809627)

### Plot YTD counts, for each year ≥2008

#### Color utilities

In [18]:
from utz.colors import RGB, color_interp, colors_lengthen, swatches
from nj_crashes.colors import get_colors, gridcolor, px_colors

# One color per year present in `totals`
years = totals.index.unique()
colors = get_colors(len(years))
black, red, year_colors = colors.black, colors.red, colors.year_colors
colors

{'black': '#000004', 'red': '#ba3853', 'year_colors': ['#fcffa4', '#f9e56a', '#f7ca36', '#f9ab17', '#f78e0d', '#ef721f', '#e15b31', '#d04643', '#ba3853', '#a22b60', '#882268', '#6f196c', '#55106b', '#3b0c5d', '#200c46', '#0f0626', '#000004']}

In [19]:
# Labels "Jan 1" … "Dec 1" (the "1" is literal in the format string); used as x-axis tick positions below
month_starts = [
    to_dt(f'{cur_year}-{m}').strftime('%b 1')
    for m in range(1, 13)
]
month_starts

['Jan 1',
 'Feb 1',
 'Mar 1',
 'Apr 1',
 'May 1',
 'Jun 1',
 'Jul 1',
 'Aug 1',
 'Sep 1',
 'Oct 1',
 'Nov 1',
 'Dec 1']

In [20]:
# One line per year; x is the day label (`Text`), so all years share the same Jan–Dec axis
save(
    px.line(
        ytds,
        x='Text', y='YTD Deaths', color='Year',
        color_discrete_sequence=year_colors,
    ),
    # Only label the first day of each month on the x-axis
    xaxis=dict(
        tickmode='array',
        tickvals=month_starts,
        ticktext=month_starts,
    ),
    legend=dict(traceorder='reversed',),
    #bottom_legend=False,
    title='YTD Traffic Deaths',
    name='ytd-deaths',
    hoverx='x',
    # NOTE(review): `bg='white'` is already bound in the `save` partial; this repeats it
    bg='white',
    ygrid='#ccc',
    xgrid='#ccc',
    w=850,
    h=800,
);

![](www/public/plots/ytd-deaths.png)

## Plot deaths by {year, victim type}

### Group by year

In [21]:
# Datetime accessor of the crash timestamps; reused by later cells for `dt.year` / `dt.month` grouping
dt = crashes.dt.dt
# Fatalities summed from individual crash records, per calendar year
fatalities_per_year = crashes.FATALITIES.groupby(dt.year).sum().astype(int).rename('NJSP records')

#### Verify the reported "total deaths" stat reflects what we see in the crash records

In [22]:
totals

Unnamed: 0_level_0,accidents,injuries,fatalities
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008,555,414,590
2009,550,352,584
2010,530,366,556
2011,586,517,627
2012,553,382,589
2013,508,393,542
2014,523,345,556
2015,522,374,562
2016,570,398,602
2017,591,368,624


In [23]:
# Compare NJSP's reported annual totals against the per-year sums of the crash records
njsp_totals = totals.fatalities.rename('NJSP total')
njsp_diff = (totals.fatalities - fatalities_per_year).rename('NJSP diff')
njsp_diff_pct = (
    round(njsp_diff / njsp_totals * 100, 1)
    # A negative `pct` already renders with its own "-"; only non-negative values need an explicit sign
    .apply(lambda pct: f'{"+" if pct >= 0 else ""}{pct}%')
    .rename('NJSP diff %')
)
njsp_totals = sxs(
    fatalities_per_year,
    njsp_totals,
    njsp_diff,
    njsp_diff_pct,
)
# Years whose reported total is 0 and that have at least one missing value are dropped from the comparison
zero_rows = (njsp_totals['NJSP total'] == 0) & (njsp_totals.isna().any(axis=1))
years_mask = ~zero_rows
drop_years = njsp_totals[zero_rows].index.tolist()
if drop_years:
    err(f"Dropping years: {drop_years}")
    njsp_totals = njsp_totals[years_mask]
njsp_totals

Unnamed: 0,NJSP records,NJSP total,NJSP diff,NJSP diff %
2008,590,590,0,+0.0%
2009,584,584,0,+0.0%
2010,556,556,0,+0.0%
2011,627,627,0,+0.0%
2012,589,589,0,+0.0%
2013,542,542,0,+0.0%
2014,556,556,0,+0.0%
2015,562,562,0,+0.0%
2016,602,602,0,+0.0%
2017,624,624,0,+0.0%


### Group by month

In [24]:
ym = crashes.dt.apply(lambda d: d.strftime('%Y-%m')).rename('ym')
ym

0       2008-01
1       2008-01
2       2008-01
3       2008-01
4       2008-01
         ...   
9022    2024-01
9023    2024-01
9024    2024-01
9025    2024-01
9026    2024-01
Name: ym, Length: 9027, dtype: object

In [25]:
# Fatalities per 'YYYY-MM', restricted to crashes dated before the current month
fatalities_per_month = crashes[crashes.dt < cur_month].FATALITIES.groupby(ym).sum()
fatalities_per_month

ym
2008-01    59.0
2008-02    40.0
2008-03    33.0
2008-04    50.0
2008-05    46.0
           ... 
2023-08    64.0
2023-09    52.0
2023-10    65.0
2023-11    63.0
2023-12    62.0
Name: FATALITIES, Length: 192, dtype: float64

### Rolling avg

In [26]:
rolling = fatalities_per_month.rolling(12).mean()
rolling

ym
2008-01          NaN
2008-02          NaN
2008-03          NaN
2008-04          NaN
2008-05          NaN
             ...    
2023-08    50.916667
2023-09    49.583333
2023-10    50.250000
2023-11    50.750000
2023-12    51.750000
Name: FATALITIES, Length: 192, dtype: float64

In [27]:
# Fatalities per (year, month), as a frame indexed by both
mos = (
    sxs(
        dt.year.rename('year'),
        dt.month.rename('month'),
        crashes.FATALITIES,
    )
    .groupby(['year', 'month']).sum()
)
mos

Unnamed: 0_level_0,Unnamed: 1_level_0,FATALITIES
year,month,Unnamed: 2_level_1
2008,1,59.0
2008,2,40.0
2008,3,33.0
2008,4,50.0
2008,5,46.0
...,...,...
2023,9,52.0
2023,10,65.0
2023,11,63.0
2023,12,62.0


In [28]:
# Month-major ordering: every January (by year), then every February, …
pivoted = mos.reset_index().sort_values(['month', 'year'])
# Keep only (year, month) rows whose first day falls before `cur_month_dt`
pivoted = pivoted[pivoted.apply(lambda r: to_dt('%d-%02d' % (r.year, r.month)).tz_localize(cur_month_dt.tz) < cur_month_dt, axis=1)]
pivoted

Unnamed: 0,year,month,FATALITIES
0,2008,1,59.0
12,2009,1,57.0
24,2010,1,37.0
36,2011,1,36.0
48,2012,1,52.0
...,...,...,...
143,2019,12,51.0
155,2020,12,47.0
167,2021,12,61.0
179,2022,12,50.0


In [29]:
by_month = crashes.FATALITIES.groupby([dt.year, dt.month]).sum()
by_month

dt    dt
2008  1     59.0
      2     40.0
      3     33.0
      4     50.0
      5     46.0
            ... 
2023  9     52.0
      10    65.0
      11    63.0
      12    62.0
2024  1     40.0
Name: FATALITIES, Length: 193, dtype: float64

### Break out victim "types"

Check victim "type" subtotals vs. total:

In [30]:
# Per-crash sum of the four victim-type columns (missing values counted as 0)
fatal_totals = sxs(*[crashes[f'FATAL_{t}'].fillna(0) for t in 'DTPB']).sum(axis=1)
# Per-year count of fatalities that are not attributed to any victim type
sxs(crashes.dt, (crashes.FATALITIES - fatal_totals).rename('diff')).groupby(dt.year)['diff'].sum()

dt
2008    590.0
2009    584.0
2010    556.0
2011    627.0
2012    589.0
2013    542.0
2014    556.0
2015    562.0
2016    602.0
2017    624.0
2018    563.0
2019    558.0
2020      0.0
2021      0.0
2022      0.0
2023      0.0
2024      0.0
Name: diff, dtype: float64

Cross-reference with annual totals, populate "unknown" subtotal:

In [31]:
# Crash-record victim-type columns, paired with the lowercase labels used for them below
base_type_cols_map = dict(
    FATAL_D='driver',
    FATAL_T='pedestrian',
    FATAL_P='passenger',
    FATAL_B='cyclist',
)
base_type_cols = [*base_type_cols_map.values()]

In [32]:
from njsp.paths import ANNUAL_SUMMARIES_YT_CSV

In [33]:
year_stats = read_csv(ANNUAL_SUMMARIES_YT_CSV).astype(int).set_index('year')
year_stats

Unnamed: 0_level_0,driver,passenger,cyclist,pedestrian,crashes
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008,320,112,20,138,555
2009,315,98,14,157,550
2010,303,99,13,141,530
2011,362,105,17,143,586
2012,309,103,14,163,553
2013,304,92,14,132,508
2014,295,80,11,170,523
2015,276,96,17,173,522
2016,330,89,17,166,570
2017,339,85,17,183,591


In [34]:
# Per-year victim-type sums from the crash records (missing values counted as 0)
year_type_sums = (
    sxs(
        crashes.dt,
        crashes.rename(columns=base_type_cols_map)[base_type_cols].fillna(0)
    )
    .groupby(dt.year.rename('year'))
    .sum(numeric_only=True)
    .astype(int)
)
# Patch in the annual-summary values from `year_stats` for 2008–2019 (`range(2008, 2020)` excludes 2020)
year_type_sums.loc[range(2008, 2020)] = year_stats.loc[range(2008, 2020), base_type_cols]
year_type_sums

Unnamed: 0_level_0,driver,pedestrian,passenger,cyclist
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008,320,138,112,20
2009,315,157,98,14
2010,303,141,99,13
2011,362,143,105,17
2012,309,163,103,14
2013,304,132,92,14
2014,295,170,80,11
2015,276,173,96,17
2016,330,166,89,17
2017,339,183,85,17


In [35]:
# Per-year reconciliation: total = (sum of victim types) + unknown + missing
year_sums = year_type_sums.sum(axis=1).rename('sum')
year_totals = totals.fatalities.rename('total')
missing = njsp_totals['NJSP diff'].rename('missing')
unknown = (year_totals - year_sums - missing).rename('unknown')

type_cols_map = { **base_type_cols_map, 'FATAL_U': 'unknown' }
type_cols = list(type_cols_map.values())

# Assemble side by side, keep the type columns plus `missing`/`total`, and drop the years masked out earlier
year_types = (
    sxs(
        year_type_sums,
        year_sums,
        year_totals,
        unknown,
        missing,
    )
    [ type_cols + [ 'missing', 'total', ] ]
)[years_mask]
year_types

Unnamed: 0,driver,pedestrian,passenger,cyclist,unknown,missing,total
2008,320,138,112,20,0,0,590
2009,315,157,98,14,0,0,584
2010,303,141,99,13,0,0,556
2011,362,143,105,17,0,0,627
2012,309,163,103,14,0,0,589
2013,304,132,92,14,0,0,542
2014,295,170,80,11,0,0,556
2015,276,173,96,17,0,0,562
2016,330,166,89,17,0,0,602
2017,339,183,85,17,0,0,624


In [36]:
assert (missing == 0).all()
assert (unknown == 0).all()

In [37]:
total_errors = sxs(year_types.total, njsp_totals['NJSP total'])[year_types.total != njsp_totals['NJSP total']]
assert total_errors.empty, total_errors

In [38]:
# `errors='ignore'` keeps this cell re-runnable: on a second run the two columns are already gone
year_types = year_types.drop(columns=['unknown', 'missing'], errors='ignore')

# Projections are only added when no years were dropped from the NJSP totals comparison
compute_projected = not drop_years
if compute_projected:
    # Only the current year gets a non-zero projected remainder
    year_types['projected'] = 0
    year_types.loc[cur_year, 'projected'] = projected_roy_deaths
    year_types['projected total'] = year_types.total + year_types.projected
else:
    err(f"Skipping projections due to empty years: {drop_years}")

year_types

Unnamed: 0,driver,pedestrian,passenger,cyclist,total,projected,projected total
2008,320,138,112,20,590,0,590
2009,315,157,98,14,584,0,584
2010,303,141,99,13,556,0,556
2011,362,143,105,17,627,0,627
2012,309,163,103,14,589,0,589
2013,304,132,92,14,542,0,542
2014,295,170,80,11,556,0,556
2015,276,173,96,17,562,0,562
2016,330,166,89,17,602,0,602
2017,339,183,85,17,624,0,624


### Save `projected_totals.json`
Build `dict` of {year,type} totals (including a projected total for the current year), and write to a path accessible by the webapp:

In [39]:
base_type_cols

['driver', 'pedestrian', 'passenger', 'cyclist']

In [40]:
# Drop incomplete rows, pluralize the victim-type column names, then title-case every column name
projected = year_types.dropna().rename(columns={ c: f'{c}s' for c in base_type_cols })
projected.columns = projected.columns.str.title()
# {year: {column: value}}
projected_dict = projected.to_dict('index')
projected_dict

{2008: {'Drivers': 320,
  'Pedestrians': 138,
  'Passengers': 112,
  'Cyclists': 20,
  'Total': 590,
  'Projected': 0,
  'Projected Total': 590},
 2009: {'Drivers': 315,
  'Pedestrians': 157,
  'Passengers': 98,
  'Cyclists': 14,
  'Total': 584,
  'Projected': 0,
  'Projected Total': 584},
 2010: {'Drivers': 303,
  'Pedestrians': 141,
  'Passengers': 99,
  'Cyclists': 13,
  'Total': 556,
  'Projected': 0,
  'Projected Total': 556},
 2011: {'Drivers': 362,
  'Pedestrians': 143,
  'Passengers': 105,
  'Cyclists': 17,
  'Total': 627,
  'Projected': 0,
  'Projected Total': 627},
 2012: {'Drivers': 309,
  'Pedestrians': 163,
  'Passengers': 103,
  'Cyclists': 14,
  'Total': 589,
  'Projected': 0,
  'Projected Total': 589},
 2013: {'Drivers': 304,
  'Pedestrians': 132,
  'Passengers': 92,
  'Cyclists': 14,
  'Total': 542,
  'Projected': 0,
  'Projected Total': 542},
 2014: {'Drivers': 295,
  'Pedestrians': 170,
  'Passengers': 80,
  'Cyclists': 11,
  'Total': 556,
  'Projected': 0,
  'Projec

In [41]:
# NOTE(review): history is read via `PROJECTED_TOTALS_PATH`, while this writes to
# f'{PLOTS_DIR}/projected_totals.json' — presumably the same file; confirm and consider using the constant.
with open(f'{PLOTS_DIR}/projected_totals.json', 'w') as f:
    json.dump(projected_dict, f, indent=4)

## Update {year,type,county} stats

In [42]:
from njsp.paths import ANNUAL_SUMMARIES_YTC_CSV

Load {year,type,county} subtotals from annual summary PDFs (see [NJSP summary PDFs.ipynb](data/njsp/annual-summaries/NJSP%20summary%20PDFs.ipynb)):

In [43]:
ytc0 = read_csv(ANNUAL_SUMMARIES_YTC_CSV).set_index(['year', 'county']).astype(int)
ytc0

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,Atlantic,17,8,0,6,30
2008,Bergen,10,5,1,7,22
2008,Burlington,23,6,4,12,45
2008,Camden,25,4,0,15,42
2008,Cape May,8,3,0,0,11
...,...,...,...,...,...,...
2023,Salem,8,2,0,2,11
2023,Somerset,14,4,0,6,22
2023,Sussex,6,2,0,1,6
2023,Union,13,6,2,15,34


Generate a similar dataframe from crash records:

In [44]:
# Per-(year, county) sums from crash records for 2020 onward, with the same columns as `ytc0`
ytc1 = (
    crashes
    # `crashes=1` per record, so that the group sum below counts crashes
    .assign(year=dt.year, crashes=1)
    [dt.year >= 2020]
    .rename(columns=dict(
        CNAME='county',
        **base_type_cols_map
    ))
    [['year', 'county'] + ytc0.columns.tolist()]
    .groupby(['year', 'county'])
    .sum(numeric_only=True)
    .astype(int)
)
ytc1

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020,Atlantic,26,5,0,9,38
2020,Bergen,14,9,0,20,38
2020,Burlington,26,4,3,9,40
2020,Camden,19,5,1,13,36
2020,Cape May,5,0,1,3,8
2020,Cumberland,14,5,0,5,22
2020,Essex,16,12,3,14,39
2020,Gloucester,21,5,2,7,33
2020,Hudson,11,1,1,11,24
2020,Hunterdon,7,2,0,3,12


Verify they match (for years ≥2020, where they overlap):

In [45]:
# Inner-join on the shared (year, county) index; overlapping column names get pandas' default
# `_x` (from `ytc0`) and `_y` (from `ytc1`) suffixes
m = ytc0.merge(ytc1, left_index=True, right_index=True)
# Re-key columns as (side, name), e.g. 'driver_x' → ('x', 'driver'), so `m['x']` / `m['y']` select each side.
# Splitting on the last "_" drops the separator, so reported column names carry no trailing underscore.
m.columns = pd.MultiIndex.from_tuples([ tuple(reversed(c.rsplit('_', 1))) for c in m.columns ])
diffs = m['x'] != m['y']
has_diffs = diffs.any().any()
if has_diffs:
    # Narrow both sides to just the rows and columns that contain at least one mismatch
    diff_rows, diff_cols = diffs.any(axis=1), diffs.any()
    xd = m['x'].loc[diff_rows, diff_cols]
    xd.columns = pd.MultiIndex.from_tuples([ ('x', c) for c in xd.columns ])
    yd = m['y'].loc[diff_rows, diff_cols]
    yd.columns = pd.MultiIndex.from_tuples([ ('y', c) for c in yd.columns ])
    diffs = sxs(xd, yd)
else:
    diffs = None
diffs

In [46]:
assert not has_diffs, diffs

Combine:

In [47]:
# Year levels of each source
y0 = ytc0.index.levels[0]
y1 = ytc1.index.levels[0]
# For years that appear in both, keep only the crash-record rows (`ytc1`)
ytc = pd.concat([
    ytc0.drop(index=y0[y0.isin(y1)], level=0),
    ytc1,
])
ytc

Unnamed: 0_level_0,Unnamed: 1_level_0,driver,passenger,cyclist,pedestrian,crashes
year,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,Atlantic,17,8,0,6,30
2008,Bergen,10,5,1,7,22
2008,Burlington,23,6,4,12,45
2008,Camden,25,4,0,15,42
2008,Cape May,8,3,0,0,11
...,...,...,...,...,...,...
2024,Middlesex,1,0,0,4,5
2024,Monmouth,1,3,0,2,5
2024,Ocean,2,0,0,4,6
2024,Passaic,0,0,0,1,1


Export:

In [48]:
from njsp.paths import YTC_CSV

In [49]:
ytc.to_csv(YTC_CSV)

## Fatalities per year (by type)

In [50]:
# NOTE(review): this rebinds `ytc` — previously the {year,county} DataFrame that was exported to CSV — to a
# list of colors; re-running the export cell after this one would fail. A distinct name would be safer.
ytc = colors_lengthen(px_colors, 7)
print(' '.join(ytc))
swatches(ytc)

#000004 #320c56 #781c6d #ba3853 #ed6925 #f9b621 #fcffa4


<span style="font-family: monospace">#000004 <span style="color: #000004">██████</span></span> <span style="font-family: monospace">#320c56 <span style="color: #320c56">██████</span></span> <span style="font-family: monospace">#781c6d <span style="color: #781c6d">██████</span></span> <span style="font-family: monospace">#ba3853 <span style="color: #ba3853">██████</span></span> <span style="font-family: monospace">#ed6925 <span style="color: #ed6925">██████</span></span> <span style="font-family: monospace">#f9b621 <span style="color: #f9b621">██████</span></span> <span style="font-family: monospace">#fcffa4 <span style="color: #fcffa4">██████</span></span>

In [51]:
# Columns to stack in the per-year chart; 'projected' is only present when projections were computed
type_cols = ['cyclist', 'driver', 'pedestrian', 'passenger']
if compute_projected:
    type_cols.append('projected')

# Display labels: capitalize the first letter and pluralize everything except 'projected'
type_cols_map = {}
for col in type_cols:
    plural_suffix = '' if col == 'projected' else 's'
    type_cols_map[col] = col[0].upper() + col[1:] + plural_suffix
type_cols_map

{'cyclist': 'Cyclists',
 'driver': 'Drivers',
 'pedestrian': 'Pedestrians',
 'passenger': 'Passengers',
 'projected': 'Projected'}

In [52]:
# Stacked bars: one bar per year, one segment per victim type (plus the projected remainder, when computed)
fig = (
    px.bar(
        # Zeros are replaced with NaN — presumably so that empty segments get no "0" text label
        year_types[type_cols].rename(columns=type_cols_map).replace(0, nan),
        barmode='stack',
        color_discrete_sequence=ytc[1:],
        text_auto='%d',
    )
    .update_yaxes(
        gridcolor=gridcolor,
        dtick=50,
    )
)
# The 'projected total' column only exists when projections were computed; otherwise label each bar with
# its plain total instead of failing on the missing column
annotation_col = 'projected total' if compute_projected else 'total'
for year, annotated_total in year_types[annotation_col].to_dict().items():
    # Place the bar's overall total just above the top of the stack
    fig.add_annotation(
        x=year, y=annotated_total,
        text=annotated_total,
        showarrow=False,
        yshift=10,
    )
save(
    fig,
    title='NJ Traffic Deaths per Year (by victim type)',
    name='fatalities_per_year_by_type',
    hoverx=True,
    w=1600,
    h=800,
);

![](www/public/plots/fatalities_per_year_by_type.png)

## Fatalities per month (by victim type)

In [53]:
# Per-(year, month) victim-type sums from crash records, for 2020 onward (missing values counted as 0)
month_types = (
    sxs(
        crashes.dt,
        crashes.rename(columns=base_type_cols_map)[base_type_cols].fillna(0)
    )
    [ dt.year >= 2020 ]
    .groupby([
        dt.year.rename('year'),
        dt.month.rename('month'),
    ])
    [base_type_cols]
    .sum()
    .astype(int)
)

# Replace the (year, month) index with a single 'YYYY-MM' string index named `dt`
month_types = month_types.reset_index()
month_types['dt'] = (
    month_types
    [['year', 'month']]
    .apply(lambda r: '%04d-%02d' % (r['year'], r['month']), axis=1)
)
month_types = month_types.set_index('dt').drop(columns=['year', 'month'])
month_types

Unnamed: 0_level_0,driver,pedestrian,passenger,cyclist
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01,21,18,8,2
2020-02,15,17,2,0
2020-03,11,16,8,1
2020-04,17,8,2,1
2020-05,28,13,9,2
2020-06,30,8,9,0
2020-07,30,19,8,5
2020-08,31,8,14,1
2020-09,31,21,5,0
2020-10,33,17,5,3


In [54]:
type_colors = colors_lengthen(px_colors, 7)

# One line per victim type; only months before the current month are plotted
fig = px.line(
    month_types.rename(columns=type_cols_map).loc[to_dt(month_types.index) < cur_month],
    labels={'variable': '',},
    color_discrete_sequence=type_colors,
)
fig.update_traces(line=dict(width=3))
save(
    fig,
    title='NJ Traffic Deaths per Month (by victim type)',
    name='fatalities_per_month_by_type',
    hoverx=True,
    xgrid=gridcolor,
    xaxis=dict(
        # Tick labels like "Jan '24"
        tickformat="%b '%y",
    ),
    w=800,
);

![](www/public/plots/fatalities_per_month_by_type.png)

## Fatalities per month

In [55]:
# Monthly fatalities as bars, overlaid with the 12-month rolling mean (rounded to 1 decimal)
fig = go.Figure()
fig.add_trace(go.Bar(x=fatalities_per_month.index, y=fatalities_per_month.values, name='Fatalities', marker_color=red))
fig.add_trace(go.Scatter(x=rolling.index, y=rolling.apply(partial(round, ndigits=1)), name='12mo avg', line={'width': 4, 'color': black, }))
fig.update_yaxes(gridcolor=gridcolor)
save(
    fig,
    title='NJ Traffic Deaths per Month',
    name='fatalities_per_month',
    hoverx=True,
    w=1200, h=600,
);

![](www/public/plots/fatalities_per_month.png)

In [56]:
# Abbreviated month names ("Jan" … "Dec"); used as x-axis tick labels in the by-month charts below
month_names = [ to_dt('2022-%02d' % i).strftime('%b') for i in range(1, 13) ]
print(' '.join(month_names))

Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec


In [57]:
# Grouped bars: x = calendar month, one bar per year within each month
fig = px.bar(
    x = pivoted.month,
    y = pivoted.FATALITIES,
    # Year cast to str — presumably so that plotly treats it as a discrete category; confirm
    color = pivoted.year.astype(str),
    color_discrete_sequence=year_colors,
    labels=dict(color='', x='', y='',),
    barmode='group',
).update_yaxes(
    gridcolor=gridcolor,
)
save(
    fig,
    title='NJ Traffic Deaths, by Month',
    name='fatalities_by_month_bars',
    legend=dict(traceorder='reversed'),
    # Label months 1–12 with their abbreviated names
    xaxis=dict(
        tickmode = 'array',
        tickvals = list(range(1, 13)),
        ticktext = month_names,
    ),
    hoverx=True,
    w=1200, h=700,
);

![](www/public/plots/fatalities_by_month_bars.png)

In [58]:
# Same data as the grouped-bar chart above, drawn as one line per year across calendar months
fig = px.line(
    x = pivoted.month,
    y = pivoted.FATALITIES,
    color = pivoted.year,
    color_discrete_sequence=year_colors,
    labels={ 'color': '', 'x': '', 'y': '' },
).update_yaxes(
    gridcolor=gridcolor,
)
save(
    fig,
    title='NJ Traffic Deaths by Month',
    name='fatalities_by_month_lines',
    # Label months 1–12 with their abbreviated names
    xaxis=dict(
        tickmode = 'array',
        tickvals = list(range(1, 13)),
        ticktext = month_names,
    ),
    legend=dict(traceorder='reversed'),
    hoverx=True,
    w=1200, h=700,
);

![](www/public/plots/fatalities_by_month_lines.png)