In [1]:
from utz import *
from bs4 import BeautifulSoup as bs
import plotly.graph_objects as go
import plotly.express as px

### Load one year of NJSP fatal crash data

In [2]:
def parse_file(path):
    """Parse one NJSP ``FAUQStats<year>.xml`` export into a DataFrame of crashes.

    The parsed document must have three top-level nodes, the last being
    ``html``, with a ``fauqstats`` element under ``html.body``. ``county``
    elements are read from directly under ``fauqstats``; counties without any
    ``municipality`` are skipped.

    Every ``municipality`` found under a county is read, so a county entry
    that lists several municipalities contributes all of their accidents.

    Parameters
    ----------
    path : str or path-like
        Location of the XML file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``accident`` element. Columns are the county,
        municipality and accident tag attributes plus one column per
        accident child element (holding that element's text).

    Raises
    ------
    AssertionError
        If the parsed document does not have the expected top-level layout.
    ValueError
        If a municipality has element children that are not ``accident`` tags.
    """
    with open(path, 'r') as f:
        # NOTE(review): no parser is named here, so BeautifulSoup chooses one from
        # whatever is installed; the layout assertions below will fail if the
        # chosen parser builds a different tree — confirm which parser is intended.
        xml = bs(f)
    children = list(xml.children)
    assert len(children) == 3
    html = children[-1]
    assert html.name == 'html'
    fauqstats = html.body.fauqstats
    assert fauqstats.name == 'fauqstats'
    counties = fauqstats.find_all('county', recursive=False)
    crash_counties = [ county for county in counties if county.municipality ]
    print(f'{len(counties)} "county" entries, {len(crash_counties)} containing "municipality"/crash info')
    records = []
    for county in crash_counties:
        # Visit every municipality under the county, not just the first one,
        # so no accidents are dropped when a county lists several.
        for municipality in county.find_all('municipality'):
            children = get_children(municipality)
            accidents = municipality.find_all('accident', recursive=False)
            # A municipality is expected to contain nothing but accident elements.
            if len(children) != len(accidents):
                raise ValueError(f'Found {len(children)} municipality children, but {len(accidents)} accidents: {county}. {accidents}')
            for accident in accidents:
                fields = { child.name: child.text for child in get_children(accident) }
                # dict(**...) raises TypeError if the same key appears at more than one
                # level, so a name collision is reported instead of silently overwritten.
                records.append(dict(**county.attrs, **municipality.attrs, **accident.attrs, **fields))

    return pd.DataFrame(records)

In [4]:
def get_children(tag):
    """Return the direct children of `tag` that are not `str` instances.

    Anything in `tag.children` that is a string (or a `str` subclass) is
    skipped; the remaining children are returned as a list in their
    original order.
    """
    elements = []
    for node in tag.children:
        if isinstance(node, str):
            continue
        elements.append(node)
    return elements

## Load all years of NJSP data (2008-2022)

In [5]:
# Parse each yearly export (2008 through 2022) and stack the per-year frames.
# Each frame keeps its own row labels, so labels restart at 0 for every year.
df = pd.concat([
    parse_file(f'FAUQStats{yr}.xml')
    for yr in range(2008, 2023)
])
df

540 "county" entries, 519 containing "municipality"/crash info
528 "county" entries, 507 containing "municipality"/crash info
525 "county" entries, 504 containing "municipality"/crash info
575 "county" entries, 554 containing "municipality"/crash info
546 "county" entries, 525 containing "municipality"/crash info
498 "county" entries, 477 containing "municipality"/crash info
517 "county" entries, 496 containing "municipality"/crash info
517 "county" entries, 496 containing "municipality"/crash info
550 "county" entries, 529 containing "municipality"/crash info
580 "county" entries, 559 containing "municipality"/crash info
519 "county" entries, 498 containing "municipality"/crash info
515 "county" entries, 494 containing "municipality"/crash info
538 "county" entries, 517 containing "municipality"/crash info
646 "county" entries, 625 containing "municipality"/crash info
156 "county" entries, 135 containing "municipality"/crash info


Unnamed: 0,ccode,cname,mcode,mname,accid,date,time,highway,location,fatalities,injuries,street,fatal_d,fatal_p,fatal_t,fatal_b
0,15,Ocean,1512,Lacey Twsp,2391,12/31/2008,1340,444,State/Interstate Authority 444 S MP 72.4,1,1,,,,,
1,15,Ocean,1512,Lacey Twsp,2390,12/29/2008,1839,614,County 614 E MP 12.5,1,0,,,,,
2,18,Somerset,1814,North Plainfield Bo,2388,12/29/2008,0610,22,State Highway 22 E MP 45.25 at North Drive,1,0,,,,,
3,03,Burlington,0325,Mount Laurel Twsp,2389,12/29/2008,0337,537,County 537 E MP 12.74,1,0,,,,,
4,13,Monmouth,1319,Howell Twsp,2384,12/26/2008,1752,9,State Highway 9 S MP 105.08 at Friendship Road,1,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,20,Union,2004,Elizabeth City,11465,01/02/2022,1126,1,State Highway 1,1,,,1,0,0,0
131,08,Gloucester,0805,Franklin Twsp,11462,01/02/2022,0754,55,State Highway 55 MP 41.2,1,0,,1,0,0,0
132,12,Middlesex,1202,Cranbury Twsp,11461,01/02/2022,0543,615,County 615,1,,,0,0,1,0
133,03,Burlington,0308,Cinnaminson Twsp,11460,01/01/2022,0727,130,State Highway 130 MP 38.1,1,,,1,0,0,0


In [6]:
# Build one timestamp per crash from the separate `date` and `time` strings.
df['dt'] = df[['date', 'time']].apply(
    lambda row: to_dt(f'{row["date"]} {row["time"]}'),
    axis=1,
)

# Typed copy for analysis: the count columns are cast to float and the raw
# date/time strings are dropped now that `dt` carries the same information.
# NOTE(review): float rather than int presumably because some rows lack these fields — confirm.
d = (
    df
    .astype(dict.fromkeys(
        ['fatalities', 'fatal_d', 'fatal_p', 'fatal_t', 'fatal_b', 'injuries'],
        float,
    ))
    .drop(columns=['date', 'time'])
)
d

Unnamed: 0,ccode,cname,mcode,mname,accid,highway,location,fatalities,injuries,street,fatal_d,fatal_p,fatal_t,fatal_b,dt
0,15,Ocean,1512,Lacey Twsp,2391,444,State/Interstate Authority 444 S MP 72.4,1.0,1.0,,,,,,2008-12-31 13:40:00
1,15,Ocean,1512,Lacey Twsp,2390,614,County 614 E MP 12.5,1.0,0.0,,,,,,2008-12-29 18:39:00
2,18,Somerset,1814,North Plainfield Bo,2388,22,State Highway 22 E MP 45.25 at North Drive,1.0,0.0,,,,,,2008-12-29 06:10:00
3,03,Burlington,0325,Mount Laurel Twsp,2389,537,County 537 E MP 12.74,1.0,0.0,,,,,,2008-12-29 03:37:00
4,13,Monmouth,1319,Howell Twsp,2384,9,State Highway 9 S MP 105.08 at Friendship Road,1.0,0.0,,,,,,2008-12-26 17:52:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,20,Union,2004,Elizabeth City,11465,1,State Highway 1,1.0,,,1.0,0.0,0.0,0.0,2022-01-02 11:26:00
131,08,Gloucester,0805,Franklin Twsp,11462,55,State Highway 55 MP 41.2,1.0,0.0,,1.0,0.0,0.0,0.0,2022-01-02 07:54:00
132,12,Middlesex,1202,Cranbury Twsp,11461,615,County 615,1.0,,,0.0,0.0,1.0,0.0,2022-01-02 05:43:00
133,03,Burlington,0308,Cinnaminson Twsp,11460,130,State Highway 130 MP 38.1,1.0,,,1.0,0.0,0.0,0.0,2022-01-01 07:27:00


#### Save to file

In [7]:
# Write the cleaned crash table `d` to a Parquet file in the working directory.
d.to_parquet('njsp.pqt')

#### Group by year

In [10]:
# Sum fatalities per calendar year, keyed by the year of each crash timestamp.
fatalities_per_year = (
    d['fatalities']
    .groupby(d['dt'].dt.year)
    .sum()
)
fatalities_per_year

dt
2008    558.0
2009    546.0
2010    533.0
2011    594.0
2012    559.0
2013    513.0
2014    532.0
2015    533.0
2016    562.0
2017    595.0
2018    537.0
2019    534.0
2020    556.0
2021    660.0
2022    140.0
Name: fatalities, dtype: float64

#### Group by month

In [16]:
# Label every crash with its "YYYY-MM" month; `.dt.strftime` formats the
# whole timestamp column at once.
ym = d['dt'].dt.strftime('%Y-%m')
fatalities_per_month = d['fatalities'].groupby(ym).sum()
# Keep only months before April 2022. Zero-padded "YYYY-MM" labels sort
# chronologically, so a plain string comparison works as a date cutoff.
# NOTE(review): presumably 2022-04 onward is excluded because that data was
# incomplete when the files were pulled — confirm.
fatalities_per_month = fatalities_per_month.loc[fatalities_per_month.index < '2022-04']
fatalities_per_month

0      2008-12
1      2008-12
2      2008-12
3      2008-12
4      2008-12
        ...   
130    2022-01
131    2022-01
132    2022-01
133    2022-01
134    2022-01
Name: dt, Length: 7489, dtype: object

#### 12-month rolling average

In [20]:
# Mean of each month and the 11 months before it; the first 11 entries are NaN
# because a full 12-month window is not yet available.
rolling = (
    fatalities_per_month
    .rolling(window=12)
    .mean()
)
rolling

dt
2008-01          NaN
2008-02          NaN
2008-03          NaN
2008-04          NaN
2008-05          NaN
             ...    
2021-11    53.583333
2021-12    55.000000
2022-01    55.750000
2022-02    56.916667
2022-03    56.000000
Name: fatalities, Length: 171, dtype: float64

#### Fatalities per month plot

In [36]:
# Monthly totals as bars, overlaid with the 12-month rolling mean as a thick black line.
margin = 40

fig = go.Figure()
fig.add_trace(
    go.Bar(
        name='Fatalities',
        x=fatalities_per_month.index,
        y=fatalities_per_month.values,
    )
)
fig.add_trace(
    go.Scatter(
        name='12mo avg',
        x=rolling.index,
        y=rolling.values,
        line=dict(width=3, color='black'),
    )
)
fig.update_layout(
    title='NJ Traffic Fatalities per Month',
    title_x=0.5,
    # Same margin on the left, right, top and bottom edges
    margin={ side: margin for side in 'lrtb' },
)
# Save a static PNG in the working directory, then render the figure inline
fig.write_image('fatalities_per_month.png', width=1200, height=600)
fig.show()

#### Fatalities per year plot

In [38]:
# Yearly totals as a bar chart, restricted to years before 2022.
# NOTE(review): presumably 2022 is left out because it is a partial year in this data — confirm.
fig = px.bar(
    fatalities_per_year.loc[fatalities_per_year.index < 2022],
    labels=dict(variable='', dt='Year', value='Fatalities'),
)
fig.update_layout(
    showlegend=False,
    title_x=0.5,
    title='NJ Traffic Fatalities per Year',
)
# Save a static PNG in the working directory; the trailing `fig` renders it inline
fig.write_image('fatalities_per_year.png')
fig