In [1]:
from utz import *
from njdot.crashes import name_renames
from njdot.data import cc2cn
from njdot.paths import CC2MC2MN
from njdot import YEARS, Data
from njdot.load import pk_astype, pk_renames

In [2]:
%%time
data = Data(types=['Accidents'], columns=['County Code', 'County Name', 'Municipality Code', 'Municipality Name'])
c = data.df(index=False)
c = c.value_counts(c.columns.tolist()).sort_index().rename('num').reset_index()
c = c.rename(columns={
    k: v for k, v in {
        'Year': 'year',
        **pk_renames,
        **name_renames,
    }.items()
    if k in c
}).astype(pk_astype)
c['cn'] = c['cn'].str.title()
c['mn'] = c['mn'].str.title()
c = c[['year'] + [k for k in c if k != 'year']]
c

CPU times: user 895 ms, sys: 172 ms, total: 1.07 s
Wall time: 834 ms


Unnamed: 0,year,cc,cn,mc,mn,num
0,2001,1,Atlantic,1,Absecon City,312
1,2002,1,Atlantic,1,Absecon City,367
2,2003,1,Atlantic,1,Absecon City,371
3,2004,1,Atlantic,1,Absecon City,322
4,2005,1,Atlantic,1,Absecon City,352
...,...,...,...,...,...,...
11742,2017,21,Warren,23,White Twp,140
11743,2018,21,Warren,23,White Twp,156
11744,2019,21,Warren,23,White Twp,141
11745,2020,21,Warren,23,White Twp,121


Helper that returns:
1. unique sets of values found in `cols`
2. rows from 1. that match all columns but the last

In [3]:
def ambiguous_mappings(df, cols):
    uniqs = (
        df
        #.reset_index()
        [cols]
        .drop_duplicates()
        .sort_values(cols)
    )
    hist = uniqs.value_counts(cols[:-1], sort=False)
    conflicts = hist[hist > 1]
    conflicts = uniqs.set_index(cols[:-1]).loc[conflicts.index].reset_index()
    return uniqs, conflicts

## County Code ⟹ County Name

In [4]:
%%time
cn_cols = [ 'cc', 'cn' ]
cc2cn, county_name_conflicts = ambiguous_mappings(c, cn_cols)
assert len(county_name_conflicts) == 0
cc2cn = cc2cn.set_index('cc')['cn']
c.groupby(cn_cols)['num'].sum()

CPU times: user 3.2 ms, sys: 1.7 ms, total: 4.9 ms
Wall time: 4.19 ms


cc  cn        
1   Atlantic      192238
2   Bergen        656350
3   Burlington    264893
4   Camden        324947
5   Cape May       65356
6   Cumberland     98515
7   Essex         610767
8   Gloucester    155854
9   Hudson        424523
10  Hunterdon      83811
11  Mercer        271333
12  Middlesex     606975
13  Monmouth      416092
14  Morris        325609
15  Ocean         340102
16  Passaic       390426
17  Salem          36922
18  Somerset      237074
19  Sussex         78639
20  Union         425445
21  Warren         71432
Name: num, dtype: int64

In [5]:
cc2cn

cc
1       Atlantic
2         Bergen
3     Burlington
4         Camden
5       Cape May
6     Cumberland
7          Essex
8     Gloucester
9         Hudson
10     Hunterdon
11        Mercer
12     Middlesex
13      Monmouth
14        Morris
15         Ocean
16       Passaic
17         Salem
18      Somerset
19        Sussex
20         Union
21        Warren
Name: cn, dtype: string

## { County Code, Muni Code } ⟹ Muni Name conflicts

In [6]:
%%time
mn_keys = [ 'cc', 'mc' ]
mn_val = 'mn'
mn_cols = mn_keys + [ mn_val ]
_, muni_name_conflicts = ambiguous_mappings(c, mn_cols)
muni_name_conflicts.set_index(mn_cols)

CPU times: user 4.56 ms, sys: 597 µs, total: 5.16 ms
Wall time: 4.44 ms


cc,mc,mn
14,30,Long Hill Twp
14,30,Passaic Twp
15,7,Dover Twp
15,7,Toms River Twp
16,16,West Paterson Boro
16,16,Woodland Park Boro


## { County Code, Muni Code, Year } ⟹ Muni Name

In [7]:
%%time
mny_keys = [ 'cc', 'mc', 'year' ]
mny_cols = mny_keys + [ mn_val ]
mny_uniqs, mny_conflicts = ambiguous_mappings(c, mny_cols)
assert len(mny_conflicts) == 0
mny_uniqs.set_index(mn_cols + ['year'])

CPU times: user 7.39 ms, sys: 1.01 ms, total: 8.4 ms
Wall time: 7.62 ms


cc,mc,mn,year
1,1,Absecon City,2001
1,1,Absecon City,2002
1,1,Absecon City,2003
1,1,Absecon City,2004
1,1,Absecon City,2005
...,...,...,...
21,23,White Twp,2017
21,23,White Twp,2018
21,23,White Twp,2019
21,23,White Twp,2020


### Use 2021 muni names as canonical

In [8]:
mn21 = (
    mny_uniqs
    [mny_uniqs.year == 2021]
    .merge(cc2cn, left_on='cc', right_index=True, how='left', validate='m:1')
    [['cc', 'cn', 'mc', 'mn']]
)
mn21

Unnamed: 0,cc,cn,mc,mn
20,1,Atlantic,1,Absecon City
41,1,Atlantic,2,Atlantic City
62,1,Atlantic,3,Brigantine City
83,1,Atlantic,4,Buena Boro
104,1,Atlantic,5,Buena Vista Twp
...,...,...,...,...
11662,21,Warren,19,Phillipsburg Town
11683,21,Warren,20,Pohatcong Twp
11704,21,Warren,21,Washington Boro
11725,21,Warren,22,Washington Twp


Extract name stems, types

In [9]:
mns = sxs(
    mn21,
    mn21.mn.str.extract(r'^(?P<name>.*?)(?: (?P<type>[^ ]+))?$'),
)
mns

Unnamed: 0,cc,cn,mc,mn,name,type
20,1,Atlantic,1,Absecon City,Absecon,City
41,1,Atlantic,2,Atlantic City,Atlantic,City
62,1,Atlantic,3,Brigantine City,Brigantine,City
83,1,Atlantic,4,Buena Boro,Buena,Boro
104,1,Atlantic,5,Buena Vista Twp,Buena Vista,Twp
...,...,...,...,...,...,...
11662,21,Warren,19,Phillipsburg Town,Phillipsburg,Town
11683,21,Warren,20,Pohatcong Twp,Pohatcong,Twp
11704,21,Warren,21,Washington Boro,Washington,Boro
11725,21,Warren,22,Washington Twp,Washington,Twp


In [10]:
mns[mns.mn == 'Princeton']

Unnamed: 0,cc,cn,mc,mn,name,type
6153,11,Mercer,14,Princeton,Princeton,


Preserve these cities' full names

In [11]:
city_stems = [ 'Atlantic', 'Jersey', 'Ocean', 'Union', ]
cities = [ f'{stem} City' for stem in city_stems ]
cities

['Atlantic City', 'Jersey City', 'Ocean City', 'Union City']

In [12]:
city_full_mask = mns.mn.isin(cities)
mns[city_full_mask]

Unnamed: 0,cc,cn,mc,mn,name,type
41,1,Atlantic,2,Atlantic City,Atlantic,City
3674,5,Cape May,8,Ocean City,Ocean,City
5210,9,Hudson,6,Jersey City,Jersey,City
5294,9,Hudson,10,Union City,Union,City


In [13]:
hills_mask = mns.type == 'Hills'
mns[hills_mask]

Unnamed: 0,cc,cn,mc,mn,name,type
8382,14,Morris,29,Parsippany-Troy Hills,Parsippany-Troy,Hills


In [14]:
cnn_dupe_mask = mns.duplicated(['cn', 'name'], keep=False)
cnn_dupes = mns[cnn_dupe_mask]
cnn_dupes

Unnamed: 0,cc,cn,mc,mn,name,type
146,1,Atlantic,7,Egg Harbor City,Egg Harbor,City
167,1,Atlantic,8,Egg Harbor Twp,Egg Harbor,Twp
2004,3,Burlington,3,Bordentown City,Bordentown,City
2025,3,Burlington,4,Bordentown Twp,Bordentown,Twp
2046,3,Burlington,5,Burlington City,Burlington,City
2067,3,Burlington,6,Burlington Twp,Burlington,Twp
2524,3,Burlington,28,Pemberton Boro,Pemberton,Boro
2545,3,Burlington,29,Pemberton Twp,Pemberton,Twp
2875,4,Camden,5,Berlin Boro,Berlin,Boro
2896,4,Camden,6,Berlin Twp,Berlin,Twp


In [15]:
full_name_mask = city_full_mask | hills_mask | cnn_dupe_mask
names = mns.copy()
names.loc[full_name_mask, 'name'] = names.loc[full_name_mask, 'mn']
names = names.drop(columns='mn')
names

Unnamed: 0,cc,cn,mc,name,type
20,1,Atlantic,1,Absecon,City
41,1,Atlantic,2,Atlantic City,City
62,1,Atlantic,3,Brigantine,City
83,1,Atlantic,4,Buena,Boro
104,1,Atlantic,5,Buena Vista,Twp
...,...,...,...,...,...
11662,21,Warren,19,Phillipsburg,Town
11683,21,Warren,20,Pohatcong,Twp
11704,21,Warren,21,Washington Boro,Boro
11725,21,Warren,22,Washington Twp,Twp


## Build cc2mc2mn dict, write as JSON

In [16]:
def county_obj(df):
    return dict(
        cn=singleton(df.cn.tolist()),
        mc2mn=df.set_index('mc')['name'].to_dict(),
    )
    
cc2mc2mn = (
    names
    .groupby('cc')
    .apply(county_obj)
    .to_dict()
)
cc2mc2mn

{1: {'cn': 'Atlantic',
  'mc2mn': {1: 'Absecon',
   2: 'Atlantic City',
   3: 'Brigantine',
   4: 'Buena',
   5: 'Buena Vista',
   6: 'Corbin',
   7: 'Egg Harbor City',
   8: 'Egg Harbor Twp',
   9: 'Estell Manor',
   10: 'Folsom',
   11: 'Galloway',
   12: 'Hamilton',
   13: 'Hammonton',
   14: 'Linwood',
   15: 'Longport',
   16: 'Margate',
   17: 'Mullica',
   18: 'Northfield',
   19: 'Pleasantville',
   20: 'Port Republic',
   21: 'Somers Point',
   22: 'Ventnor',
   23: 'Weymouth'}},
 2: {'cn': 'Bergen',
  'mc2mn': {1: 'Allendale',
   2: 'Alpine',
   3: 'Bergenfield',
   4: 'Bogota',
   5: 'Carlstadt',
   6: 'Cliffside Park',
   7: 'Closter',
   9: 'Demarest',
   10: 'Dumont',
   11: 'Elmwood Park',
   12: 'East Rutherford',
   13: 'Edgewater',
   14: 'Emerson',
   15: 'Englewood',
   16: 'Englewood Cliffs',
   17: 'Fair Lawn',
   18: 'Fairview',
   19: 'Fort Lee',
   20: 'Franklin Lakes',
   21: 'Garfield',
   22: 'Glen Rock',
   23: 'Hackensack',
   24: 'Harrington Park',
   25:

In [17]:
with open(CC2MC2MN, 'w') as f:
    json.dump(cc2mc2mn, f, indent=2)

## Muni Name transitions

In [18]:
%%time
mnys = (
    mny_uniqs
    .merge(muni_name_conflicts, on=mn_cols)
    .set_index(mny_keys)
    .sort_index()
    [mn_val]
)
mnys

CPU times: user 3.69 ms, sys: 1.14 ms, total: 4.82 ms
Wall time: 4.13 ms


cc  mc  year
14  30  2001           Passaic Twp
        2002           Passaic Twp
        2003           Passaic Twp
        2004           Passaic Twp
        2005           Passaic Twp
        2006           Passaic Twp
        2007           Passaic Twp
        2008           Passaic Twp
        2009           Passaic Twp
        2010           Passaic Twp
        2011           Passaic Twp
        2012           Passaic Twp
        2013         Long Hill Twp
        2014         Long Hill Twp
        2015         Long Hill Twp
        2016         Long Hill Twp
        2017           Passaic Twp
        2018           Passaic Twp
        2019           Passaic Twp
        2021           Passaic Twp
15  7   2001             Dover Twp
        2002             Dover Twp
        2003             Dover Twp
        2004             Dover Twp
        2005             Dover Twp
        2006             Dover Twp
        2007             Dover Twp
        2008        Toms River Twp
       

In [19]:
mnys[mnys != mnys.shift(1)]

cc  mc  year
14  30  2013         Long Hill Twp
        2017           Passaic Twp
15  7   2001             Dover Twp
        2008        Toms River Twp
16  16  2001    West Paterson Boro
        2008    Woodland Park Boro
Name: mn, dtype: string

## Verify Muni Codes auto-increment within each County

In [20]:
%%time
codes_hist = c.reset_index()[['cc', 'mc']].astype(int).value_counts(sort=False)
codes_hist

CPU times: user 1.75 ms, sys: 804 µs, total: 2.55 ms
Wall time: 1.86 ms


cc  mc
1   1     21
    2     21
    3     21
    4     21
    5     21
          ..
21  19    21
    20    21
    21    21
    22    21
    23    21
Name: count, Length: 568, dtype: int64

In [21]:
def fsck_codes(s):
    l = s.tolist()
    return l == list(range(1, len(l) + 1)), f"{l}"

assert (
    codes_hist
    .reset_index()
    .groupby('cc')
    ['mc']
    .apply(fsck_codes)
).all()