# Compute crash stats

Crash records come with some denormalized summary stats:
- "Total killed" (`tk`)
- "Total injured" (`ti`)
- "Pedestrians killed" (`pk`)
- "Pedestrians injured" (`pi`)
- "Total vehicles involved" (`tv`)
- "Crash severity" (`severity`: `f` iff `tk > 0`, else `i` if `ti > 0`, else `p`)

These don't always match the normalized stats we can obtain by joining `crashes` against the `pedestrians`, `occupants`, and `vehicles` tables.

Here we overwrite `crashes.{parquet,db}` to include the normalized stats inferred from the other tables. We also perform a few checks to identify sources of discrepancies. In most cases, it seems like the "totals" listed on the crash record simply mistake pedestrians for occupants, or vice versa.

In [1]:
from utz import *
from njdot import crashes, vehicles, occupants, pedestrians
from nj_crashes.utils import sql

In [2]:
%%time
# Load the crash records (one row per crash, indexed by crash `id`); these carry the
# denormalized summary stats (`tk`, `ti`, `pk`, `pi`, `tv`, `severity`) checked below.
c = crashes.load()
c

Reading njdot/data/crashes.parquet


CPU times: user 5.15 s, sys: 1.5 s, total: 6.65 s
Wall time: 4.55 s


Unnamed: 0_level_0,year,cc,mc,case,dt,mc_dot,pdc,pdn,station,tk,ti,pk,pi,severity,Intersection,alcohol,hazmat,crash_type,tv,road,road_direction,route,Route Suffix,sri,mp,road_system,road_character,road_surface,surface_condition,light_condition,env_condition,road_divided,ttcz,cross_street_distance,Unit Of Measurement,Direction From Cross Street,cross_street,Is Ramp,ramp_route,Ramp To/From Route Direction,speed_limit,speed_limit_cross,olat,olon,cell_phone,Other Property Damage,Reporting Badge No.,occ,omc,reason,ilon,ilat,icc,imc,horizontal_alignment,road_grade,first_harmful_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,2001,1,1,#2001-17846,2001-12-21 18:34:00,1,01,Absecon City,MUNICIPAL COMP?,0,0,0,0,p,B,False,False,3,2,CALDERON AVENUE,,,,,,7,2,2,1,6,1,5,01,100,FE,N,RT 30,,,,25,,,,False,NEVER SAW V-1 MINOR DAMAGE - NO INJURIES REPOR...,830,,,No MP,,,,,,,
1,2001,1,1,01-00029,2001-01-01 09:30:00,1,1,Absecon,,0,0,0,0,p,B,False,False,6,2,RITZ DRIVE,,,,,,7,1,2,3,6,1,5,01,,,,,,,,25,,,,False,,836,,,No MP,,,,,,,
2,2001,1,1,01-004615,2001-04-10 14:44:00,1,1,Absecon,,0,4,0,0,i,I,False,False,3,2,MORTON AVENUE,,,,,,7,1,2,1,1,1,5,01,,AT,,NEW YORK AVENUE,,,,25,,,,False,,836,,,No MP,,,,,,,
3,2001,1,1,01-004880,2001-04-15 13:56:00,1,1,Absecon,,0,1,0,0,i,B,False,False,1,2,RT 30,,30,,00000030__,51.099998,2,1,2,1,1,1,5,01,,,,,,,,45,,,,False,WITNESS-PETRIA GIBSON AND ANGELO HERSH,886,,,,-74.512308,39.432705,1,1,,,
4,2001,1,1,01-004912,2001-04-16 10:29:00,1,1,Absecon,,0,0,0,0,p,I,False,False,1,2,CR 651,,651,,,,5,2,2,2,1,2,5,01,,AT,,CR 630,,,,35,,,,False,*BETH VEHICLES-RELIANCE INSURANCE CO.(215)864-...,836,,,No MP,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319789,2022,21,23,220983099,2022-12-15 15:38:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,True,False,11,1,WARREN COUNTY 620,S,620,,21000620__,0.300000,5,,2,3,1,3,4,01,,,,,,,,50,,40.812200,-75.049400,False,DRIVER HIT AND DAMAGED A TELEPHONE POLE.,8927,21,23,,-75.075743,40.811535,21,23,1,4,52
6319790,2022,21,23,221002451,2022-12-22 17:36:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,12,1,ROUTE 519,N,519,,00000519__,37.599998,5,,2,1,5,1,5,01,,,,,,,,50,,40.793436,-75.084177,False,,8726,21,23,,-75.084237,40.793801,21,23,1,4,24
6319791,2022,21,23,221003933,2022-12-23 17:44:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,11,1,US 46,W,46,,00000046__,11.500000,2,,2,1,5,1,5,01,,,,,,,,50,,40.830500,-74.976274,False,,8704,21,23,,-74.975193,40.830019,21,23,3,4,60
6319792,2022,21,23,221012812,2022-12-28 13:32:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,1,2,NJ 31,E,31,,00000031__,48.930000,2,,2,1,1,1,5,01,,,,,T,US 46,SB,50,,40.832468,-75.001225,False,,8777,21,23,MP didn't geocode,,,,,3,6,26


In [3]:
# Load the pedestrian records; each row references its crash via `crash_id`.
p = pedestrians.load()
p

Reading njdot/data/pedestrians.parquet


Unnamed: 0,id,crash_id,pn,condition,city,state,zip,dob,age,sex,alc_test_given,alc_test_type,alc_test_results,charge1,summons1,traffic_controls,cir1,cir2,dir,act,inj_loc,inj_type,med_refused,safety_used,hospital,status1,cyclist,other,charge2,summons2,charge3,summons3,charge4,summons4,status2
0,0,7,1,3,ABSECON,NJ,08201,07/28/1990,10,M,,,,,,,,,,41,1,5,,,,1,False,False,,,,,,,
1,1,48,31,3,ABSECON,NJ,08201,01/16/1967,34,M,,,,,,,,,,,1,4,,,,,True,False,,,,,,,
2,2,76,1,2,EGG HARBOR TWP.,NJ,08234,11/16/1952,48,F,N,,,,,,,,,46,12,3,,,,1,False,False,,,,,,,
3,3,114,1,4,GALLOWAY TWP.,NJ,08205,03/31/1959,42,F,N,,,SUBPOENA,,,,,,49,7,,,,,1,False,False,,,,,,,
4,4,236,1,4,ABSECON,NJ,08201,04/18/1953,48,M,N,,,,,,,,,43,1,4,,,,1,False,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185444,185444,6319317,1,4,PHILLIPSBURG,NJ,08865,,31,M,N,,,,,11,75,72,1,42,0,0,2,,7102,7,False,False,,,,,,,6
185445,185445,6319399,1,5,BELVIDERE,NJ,07823,,16,M,N,,,,,,78,,,42,8,8,1,,,1,False,False,,,,,,,
185446,185446,6319407,31,2,WASHINGTON,NJ,07882,,12,M,N,,,,,11,25,,3,1,11,7,2,1,7102,1,True,False,,,,,,,
185447,185447,6319412,1,1,WASHINGTON,NJ,07882,,55,M,N,,,,,4,78,,1,46,1,3,2,1,6404,7,False,False,,,,,,,


In [4]:
# Load the vehicle-occupant records; each row references its crash via `crash_id`.
o = occupants.load()
o

Reading njdot/data/occupants.parquet


Unnamed: 0,id,crash_id,vehicle_id,on,condition,pos,eject,age,sex,inj_loc,inj_type,med_refused,safety_avail,safety_used,airbag,hospital
0,0,0,0,1,,1,1,38,M,,,,4,4,,
1,1,0,1,2,,1,1,63,F,,,,4,4,,
2,2,1,2,1,,,,,,,,,,,,
3,3,2,4,1,3,1,1,29,F,6,8,,4,4,,
4,4,2,4,2,3,3,1,7,M,8,5,,4,4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14300194,14300194,6319792,11907736,2,5,3,1,48,F,,,,11,4,,
14300195,14300195,6319792,11907736,3,5,3,1,48,F,,,,11,4,,
14300196,14300196,6319792,11907737,4,5,1,1,24,F,,,,11,4,,
14300197,14300197,6319793,11907738,1,5,1,1,77,M,,,,11,4,,


In [5]:
# Load the vehicle records; each row references its crash via `crash_id`.
v = vehicles.load()
v

Reading njdot/data/vehicles.parquet


Unnamed: 0,id,crash_id,vn,ins_co,owner_state,make,model,color,vy,state,rm_by,impact_loc,damage_loc,type,use,cargo_type,cir1,cir2,dir,act,ev1,ev2,ev3,ev4,oversize,hit_run,departure,damage,ev
0,0,0,1,426,NJ,NISSAN MAXIMA,,BUR,1991,NJ,,8,7,1,,,25,,1,3,26,,,,,False,1,,
1,1,0,2,989,NJ,LINCOLN TOWNCAR,,BK,1996,NJ,2,12,12,6,,0,4,,2,3,26,,,,0,False,1,,
2,2,1,1,962,NJ,TOYOTA 4DR,,GRN,1997,NJ,1,11,,1,,,25,,3,10,28,,,,,False,1,,
3,3,1,2,,,,,,0,,,0,0,5,,0,2,,1,1,26,,,,0,False,0,,
4,4,2,1,85,NJ,CHEVY CORSICA,,PUR,1996,NJ,3,8,15,1,,,25,,3,1,26,1,,,,False,6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11907735,11907735,6319791,1,413,NJ,MAZDA,TRI,GY,2006,NJ,2,12,12,4,01,,52,,4,1,60,,,,,False,0,4,60
11907736,11907736,6319792,1,102,NJ,TOYOTA,TUNDRA,GN,2022,NJ,2,6,6,5,01,,29,,2,12,26,,,,,False,0,2,26
11907737,11907737,6319792,2,100,NJ,HONDA,HRV,GY,2020,NJ,1,12,12,4,01,,29,,2,12,26,,,,,False,0,2,26
11907738,11907738,6319793,1,426,NJ,CHEVROLET,TRA,RD,2012,NJ,3,11,11,4,01,,4,,3,6,26,,,,,False,0,4,26


In [6]:
%%time
cok = (o[o.condition == 1].crash_id.value_counts()).rename('cok')
coi = (o[(o.condition >= 2) & (o.condition <= 4)].crash_id.value_counts()).rename('coi')
coki = sxs(cok, coi).fillna(0).astype(int)
coki

CPU times: user 362 ms, sys: 255 ms, total: 617 ms
Wall time: 738 ms


Unnamed: 0_level_0,cok,coi
crash_id,Unnamed: 1_level_1,Unnamed: 2_level_1
940642,6,0
2553102,5,3
6088880,5,6
1287068,5,0
5677035,5,0
...,...,...
2370679,0,1
2370684,0,1
2370693,0,1
2370703,0,1


In [7]:
%%time
cpk = (p[p.condition == 1].crash_id.value_counts()).rename('cpk')
cpi = (p[(p.condition >= 2) & (p.condition <= 4)].crash_id.value_counts()).rename('cpi')
cpki = sxs(cpk, cpi).fillna(0).astype(int)
cpki

CPU times: user 43.3 ms, sys: 15.5 ms, total: 58.8 ms
Wall time: 64.6 ms


Unnamed: 0_level_0,cpk,cpi
crash_id,Unnamed: 1_level_1,Unnamed: 2_level_1
355837,3,0
3994086,3,0
6121016,3,1
383116,3,0
4379203,2,0
...,...,...
1873542,0,1
1873505,0,1
1873484,0,1
1873466,0,1


In [8]:
%%time
ctv = (v.crash_id.value_counts()).rename('ctv')
ctv

CPU times: user 214 ms, sys: 89.7 ms, total: 304 ms
Wall time: 323 ms


crash_id
6246719    59
6242692    41
5201624    37
844899     26
1426238    25
           ..
5123369     1
3701968     1
5123364     1
3701970     1
3160113     1
Name: ctv, Length: 6319034, dtype: int64

In [9]:
# The crash record has no occupant-only totals, so derive them as
# "total minus pedestrians" for comparison with the counts from `occupants`.
ok = c.tk.sub(c.pk).rename('ok')
oi = c.ti.sub(c.pi).rename('oi')

In [10]:
# Put each recorded stat next to its computed counterpart (`<k>`, `c<k>`),
# aligned on crash id; crashes missing from a series count as 0.
stat_pairs = [
    c.pk, cpk,
    c.pi, cpi,
    ok, cok,
    oi, coi,
    c.tv, ctv,
]
cc = sxs(*stat_pairs).fillna(0).astype(int)
cc

Unnamed: 0,pk,cpk,pi,cpi,ok,cok,oi,coi,tv,ctv
0,0,0,0,0,0,0,0,0,2,2
1,0,0,0,0,0,0,0,0,2,2
2,0,0,0,0,0,0,4,4,2,2
3,0,0,0,0,0,0,1,1,2,2
4,0,0,0,0,0,0,0,0,2,2
...,...,...,...,...,...,...,...,...,...,...
6319789,0,0,0,0,0,0,0,0,1,1
6319790,0,0,0,0,0,0,0,0,1,1
6319791,0,0,0,0,0,0,0,0,1,1
6319792,0,0,0,0,0,0,0,0,2,2


In [11]:
def kct(k):
    """Cross-tabulate the recorded stat `k` against its computed counterpart `c{k}`.

    Both columns are read from the module-level `cc` frame. Rows are values of
    `cc[k]`, columns are values of `cc['c' + k]`, and each cell is the number of
    crashes with that combination. Zero counts are replaced with '' so the
    off-diagonal discrepancies stand out when the table is displayed.
    """
    recorded = cc[k]
    computed = cc[f'c{k}']
    counts = pd.crosstab(recorded, computed)
    return counts.replace(0, '')

In [12]:
# Rows: pedestrians killed as recorded on the crash (`pk`);
# columns: count derived from the `pedestrians` table (`cpk`).
kct('pk')

cpk,0,1,2,3
pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6315937.0,338.0,2.0,
1,7.0,3468.0,4.0,
2,,,34.0,
3,,,,4.0


In [13]:
# Rows: occupants killed implied by the crash record (`ok` = `tk - pk`, which can be negative);
# columns: count derived from the `occupants` table (`cok`).
kct('ok')

cok,0,1,2,3,4,5,6
ok,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,2.0,,,,,,
0,6309931.0,16.0,,,,,
1,360.0,8798.0,1.0,,,,
2,1.0,1.0,569.0,,,,
3,,,,93.0,,,
4,,,,,17.0,,
5,,,,,,4.0,
6,,,,,,,1.0


In [14]:
# Rows: pedestrians injured as recorded on the crash (`pi`);
# columns: count derived from the `pedestrians` table (`cpi`).
kct('pi')

cpi,0,1,2,3,4,5,6,7,8,9,10,16
pi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,6182377.0,38672.0,340.0,14.0,1.0,2.0,,,,,,
1,12.0,95862.0,59.0,1.0,,,1.0,,,,,
2,,5.0,2155.0,3.0,,,,,,,,
3,,,,232.0,,,,,,,,
4,,,,,36.0,,,,,,,
5,,,,,,15.0,,,,,,
6,,,,,,,2.0,,,,,
7,,,,,,,,1.0,,,,
8,,,,,,,,,1.0,,,
9,,,,,,,,,,1.0,,


In [15]:
# Rows: occupants injured implied by the crash record (`oi` = `ti - pi`, which can be negative);
# columns: count derived from the `occupants` table (`coi`).
kct('oi')

coi,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,37,40,42,44,56
oi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
-1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,4974171.0,72.0,7.0,3.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,38389.0,922119.0,56.0,10.0,4.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,367.0,463.0,268785.0,24.0,6.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,23.0,14.0,56.0,75994.0,7.0,1.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4.0,2.0,3.0,17.0,25287.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,3.0,1.0,1.0,2.0,7.0,9539.0,2.0,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,2.0,2.0,,,,2.0,2338.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,,,,,,,2.0,987.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,,1.0,,,,,,,455.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
# Rows: total vehicles as recorded on the crash (`tv`);
# columns: number of rows in the `vehicles` table for that crash (`ctv`).
kct('tv')

ctv,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,25,26,37,41,59
tv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0,705.0,5.0,2.0,,,,,,,,,,,,,,,,,,,,,,,
1,53.0,1214879.0,1853.0,21.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,
2,2.0,2735.0,4700548.0,1331.0,310.0,50.0,13.0,5.0,,1.0,,,,1.0,,1.0,,,,,,,,,,
3,,,,329233.0,3.0,1.0,,,,,,,,,,,,,,,,,,,,
4,,,,,54644.0,7.0,5.0,,1.0,,,,,,,,,,,,,,,,,
5,,,,,,9691.0,,,,,,,,,,,,,,,,,,,,
6,,,,,,,2440.0,,,,1.0,,,1.0,,,,,,,,,,,,
7,,,,,,,,711.0,,,,,,,,,,,,,,,,,,
8,,,,,,,,,264.0,,,,,,,,,,,,,,,,,
9,,,,,,,,,,119.0,,,,,,,,,,,,,,,,


In [17]:
# Check the recorded `severity` flag against the recorded `tk`: if "fatal"
# meant exactly `tk > 0`, the off-diagonal cells would be empty.
marked_fatal = c.severity == 'f'
has_recorded_death = c.tk > 0
pd.crosstab(marked_fatal, has_recorded_death)

tk,False,True
severity,Unnamed: 1_level_1,Unnamed: 2_level_1
False,6306406,26
True,45,13317


In [18]:
# Stats on `crashes` that get overwritten with their computed (`c<k>`) versions.
# 'ok' / 'oi' are left out: they are helper series, not columns of `crashes`.
cols = ['pk', 'pi', 'tv']

In [19]:
# Build the corrected crashes frame: for every stat in `cols`, replace the
# recorded value with the one computed from the normalized tables (`c<k>` in `cc`).
overrides = {k: cc[f'c{k}'].fillna(0).astype(int) for k in cols}
# NOTE(review): `tk` / `ti` are rebuilt from the *recorded* columns
# (`pk + ok`, `pi + oi`), so they equal the original totals rather than the
# computed ones (`cpk + cok`, `cpi + coi`). Confirm this is intended.
overrides['tk'] = cc.pk + cc.ok
overrides['ti'] = cc.pi + cc.oi
c2 = c.assign(**overrides)

# Re-derive severity from the totals: fatal beats injury beats property damage.
severity = pd.Series('p', index=c2.index)
severity[c2.ti > 0] = 'i'
severity[c2.tk > 0] = 'f'
c2['severity'] = severity
c2

Unnamed: 0_level_0,year,cc,mc,case,dt,mc_dot,pdc,pdn,station,tk,ti,pk,pi,severity,Intersection,alcohol,hazmat,crash_type,tv,road,road_direction,route,Route Suffix,sri,mp,road_system,road_character,road_surface,surface_condition,light_condition,env_condition,road_divided,ttcz,cross_street_distance,Unit Of Measurement,Direction From Cross Street,cross_street,Is Ramp,ramp_route,Ramp To/From Route Direction,speed_limit,speed_limit_cross,olat,olon,cell_phone,Other Property Damage,Reporting Badge No.,occ,omc,reason,ilon,ilat,icc,imc,horizontal_alignment,road_grade,first_harmful_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,2001,1,1,#2001-17846,2001-12-21 18:34:00,1,01,Absecon City,MUNICIPAL COMP?,0,0,0,0,p,B,False,False,3,2,CALDERON AVENUE,,,,,,7,2,2,1,6,1,5,01,100,FE,N,RT 30,,,,25,,,,False,NEVER SAW V-1 MINOR DAMAGE - NO INJURIES REPOR...,830,,,No MP,,,,,,,
1,2001,1,1,01-00029,2001-01-01 09:30:00,1,1,Absecon,,0,0,0,0,p,B,False,False,6,2,RITZ DRIVE,,,,,,7,1,2,3,6,1,5,01,,,,,,,,25,,,,False,,836,,,No MP,,,,,,,
2,2001,1,1,01-004615,2001-04-10 14:44:00,1,1,Absecon,,0,4,0,0,i,I,False,False,3,2,MORTON AVENUE,,,,,,7,1,2,1,1,1,5,01,,AT,,NEW YORK AVENUE,,,,25,,,,False,,836,,,No MP,,,,,,,
3,2001,1,1,01-004880,2001-04-15 13:56:00,1,1,Absecon,,0,1,0,0,i,B,False,False,1,2,RT 30,,30,,00000030__,51.099998,2,1,2,1,1,1,5,01,,,,,,,,45,,,,False,WITNESS-PETRIA GIBSON AND ANGELO HERSH,886,,,,-74.512308,39.432705,1,1,,,
4,2001,1,1,01-004912,2001-04-16 10:29:00,1,1,Absecon,,0,0,0,0,p,I,False,False,1,2,CR 651,,651,,,,5,2,2,2,1,2,5,01,,AT,,CR 630,,,,35,,,,False,*BETH VEHICLES-RELIANCE INSURANCE CO.(215)864-...,836,,,No MP,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319789,2022,21,23,220983099,2022-12-15 15:38:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,True,False,11,1,WARREN COUNTY 620,S,620,,21000620__,0.300000,5,,2,3,1,3,4,01,,,,,,,,50,,40.812200,-75.049400,False,DRIVER HIT AND DAMAGED A TELEPHONE POLE.,8927,21,23,,-75.075743,40.811535,21,23,1,4,52
6319790,2022,21,23,221002451,2022-12-22 17:36:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,12,1,ROUTE 519,N,519,,00000519__,37.599998,5,,2,1,5,1,5,01,,,,,,,,50,,40.793436,-75.084177,False,,8726,21,23,,-75.084237,40.793801,21,23,1,4,24
6319791,2022,21,23,221003933,2022-12-23 17:44:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,11,1,US 46,W,46,,00000046__,11.500000,2,,2,1,5,1,5,01,,,,,,,,50,,40.830500,-74.976274,False,,8704,21,23,,-74.975193,40.830019,21,23,3,4,60
6319792,2022,21,23,221012812,2022-12-28 13:32:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,1,2,NJ 31,E,31,,00000031__,48.930000,2,,2,1,1,1,5,01,,,,,T,US 46,SB,50,,40.832468,-75.001225,False,,8777,21,23,MP didn't geocode,,,,,3,6,26


In [20]:
# Per column: True when `c2` is identical to `c` (two missing values count as
# equal), False when at least one row was changed.
same_value = c == c2
both_missing = c.isna() & c2.isna()
(same_value | both_missing).all()

year                             True
cc                               True
mc                               True
case                             True
dt                               True
mc_dot                           True
pdc                              True
pdn                              True
station                          True
tk                               True
ti                               True
pk                              False
pi                              False
severity                        False
Intersection                     True
alcohol                          True
hazmat                           True
crash_type                       True
tv                              False
road                             True
road_direction                   True
route                            True
Route Suffix                     True
sri                              True
mp                               True
road_system                      True
road_charact

In [21]:
from njdot import CRASHES_PQT, CRASHES_DB

In [26]:
# Downcast the index and the small count/code columns to compact integer types
# before writing the frame out.
# Integer downcasts wrap out-of-range values silently, and `tv` and the
# injury/fatality counts come from row counts with no fixed upper bound, so
# check that everything fits before casting.
int8_cols = ['mc', 'tk', 'ti', 'pk', 'pi', 'tv']
too_wide = [k for k in int8_cols if not c2[k].between(-128, 127).all()]
assert not too_wide, f'Values outside the int8 range in columns: {too_wide}'
assert ((c2.index >= -2**31) & (c2.index < 2**31)).all(), 'Crash ids outside the int32 range'

c2.index = c2.index.astype('int32')
c2 = c2.astype({k: 'int8' for k in int8_cols})
c2

Unnamed: 0_level_0,year,cc,mc,case,dt,mc_dot,pdc,pdn,station,tk,ti,pk,pi,severity,Intersection,alcohol,hazmat,crash_type,tv,road,road_direction,route,Route Suffix,sri,mp,road_system,road_character,road_surface,surface_condition,light_condition,env_condition,road_divided,ttcz,cross_street_distance,Unit Of Measurement,Direction From Cross Street,cross_street,Is Ramp,ramp_route,Ramp To/From Route Direction,speed_limit,speed_limit_cross,olat,olon,cell_phone,Other Property Damage,Reporting Badge No.,occ,omc,reason,ilon,ilat,icc,imc,horizontal_alignment,road_grade,first_harmful_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
0,2001,1,1,#2001-17846,2001-12-21 18:34:00,1,01,Absecon City,MUNICIPAL COMP?,0,0,0,0,p,B,False,False,3,2,CALDERON AVENUE,,,,,,7,2,2,1,6,1,5,01,100,FE,N,RT 30,,,,25,,,,False,NEVER SAW V-1 MINOR DAMAGE - NO INJURIES REPOR...,830,,,No MP,,,,,,,
1,2001,1,1,01-00029,2001-01-01 09:30:00,1,1,Absecon,,0,0,0,0,p,B,False,False,6,2,RITZ DRIVE,,,,,,7,1,2,3,6,1,5,01,,,,,,,,25,,,,False,,836,,,No MP,,,,,,,
2,2001,1,1,01-004615,2001-04-10 14:44:00,1,1,Absecon,,0,4,0,0,i,I,False,False,3,2,MORTON AVENUE,,,,,,7,1,2,1,1,1,5,01,,AT,,NEW YORK AVENUE,,,,25,,,,False,,836,,,No MP,,,,,,,
3,2001,1,1,01-004880,2001-04-15 13:56:00,1,1,Absecon,,0,1,0,0,i,B,False,False,1,2,RT 30,,30,,00000030__,51.099998,2,1,2,1,1,1,5,01,,,,,,,,45,,,,False,WITNESS-PETRIA GIBSON AND ANGELO HERSH,886,,,,-74.512308,39.432705,1,1,,,
4,2001,1,1,01-004912,2001-04-16 10:29:00,1,1,Absecon,,0,0,0,0,p,I,False,False,1,2,CR 651,,651,,,,5,2,2,2,1,2,5,01,,AT,,CR 630,,,,35,,,,False,*BETH VEHICLES-RELIANCE INSURANCE CO.(215)864-...,836,,,No MP,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319789,2022,21,23,220983099,2022-12-15 15:38:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,True,False,11,1,WARREN COUNTY 620,S,620,,21000620__,0.300000,5,,2,3,1,3,4,01,,,,,,,,50,,40.812200,-75.049400,False,DRIVER HIT AND DAMAGED A TELEPHONE POLE.,8927,21,23,,-75.075743,40.811535,21,23,1,4,52
6319790,2022,21,23,221002451,2022-12-22 17:36:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,12,1,ROUTE 519,N,519,,00000519__,37.599998,5,,2,1,5,1,5,01,,,,,,,,50,,40.793436,-75.084177,False,,8726,21,23,,-75.084237,40.793801,21,23,1,4,24
6319791,2022,21,23,221003933,2022-12-23 17:44:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,11,1,US 46,W,46,,00000046__,11.500000,2,,2,1,5,1,5,01,,,,,,,,50,,40.830500,-74.976274,False,,8704,21,23,,-74.975193,40.830019,21,23,3,4,60
6319792,2022,21,23,221012812,2022-12-28 13:32:00,23,02,New Jersey State Police,WASHINGTON,0,0,0,0,p,B,False,False,1,2,NJ 31,E,31,,00000031__,48.930000,2,,2,1,1,1,5,01,,,,,T,US 46,SB,50,,40.832468,-75.001225,False,,8777,21,23,MP didn't geocode,,,,,3,6,26


In [28]:
# Overwrite the crashes parquet file with the corrected frame, keeping the crash `id` index.
c2.to_parquet(CRASHES_PQT, index=True)

In [23]:
from njdot.load import CRASH_IDXS

In [24]:
%%time
sql.write(
    c2, 'crashes', CRASHES_DB,
    idxs=crash_idxs,
    rm=True,
    replace=False,
    page_size=2**16,
)

Removing /Users/ryan/c/neighbor-ryan/nj-crashes/www/public/njdot/crashes.db
Writing 6077303 rows to /Users/ryan/c/neighbor-ryan/nj-crashes/www/public/njdot/crashes.db
Wrote DB: 1315594240 bytes
After indices: 1932619776 bytes


CPU times: user 2min 55s, sys: 1min 34s, total: 4min 29s
Wall time: 7min 43s


After setting page_size=65536 and vacuum: 1883701248 bytes


In [25]:
import boto3
# S3 client used to publish the rebuilt database. No credentials are passed
# here, so boto3 must be able to resolve them from its environment/config.
s3 = boto3.client('s3')

In [26]:
# Publish the rebuilt SQLite file to the `nj-crashes` bucket, under
# `njdot/data/` with the same file name it has locally.
db_key = f'njdot/data/{basename(CRASHES_DB)}'
s3.upload_file(CRASHES_DB, Bucket='nj-crashes', Key=db_key)