In [1]:
from utz import *
import geopandas as gpd
from nj_crashes.paths import COUNTY_CITY_CODES_PQT
from njdot.crashes import name_renames
import njdot
import njsp
from njdot.paths import CC2MC2MN
from njdot import YEARS, Data
from njdot.load import pk_astype, pk_renames

## Load DOT county/muni codes

In [2]:
data = Data(types=['Accidents'], columns=['County Code', 'County Name', 'Municipality Code', 'Municipality Name'])
c = data.df(index=False)
c = c.value_counts(c.columns.tolist()).sort_index().rename('num').reset_index()
c = c.rename(columns={
    k: v for k, v in {
        'Year': 'year',
        **pk_renames,
        **name_renames,
    }.items()
    if k in c
}).astype(pk_astype)
c['cn'] = c['cn'].str.title()
c['mn'] = c['mn'].str.title()
c = c[['year'] + [k for k in c if k != 'year']]
c

Unnamed: 0,year,cc,cn,mc,mn,num
0,2001,1,Atlantic,1,Absecon City,312
1,2002,1,Atlantic,1,Absecon City,367
2,2003,1,Atlantic,1,Absecon City,371
3,2004,1,Atlantic,1,Absecon City,322
4,2005,1,Atlantic,1,Absecon City,352
...,...,...,...,...,...,...
12299,2018,21,Warren,23,White Twp,156
12300,2019,21,Warren,23,White Twp,141
12301,2020,21,Warren,23,White Twp,121
12302,2021,21,Warren,23,White Twp,105


Helper that returns:
1. unique sets of values found in `cols`
2. rows from 1. that match all columns but the last

In [3]:
def ambiguous_mappings(df, cols):
    uniqs = (
        df
        #.reset_index()
        [cols]
        .drop_duplicates()
        .sort_values(cols)
    )
    hist = uniqs.value_counts(cols[:-1], sort=False)
    conflicts = hist[hist > 1]
    conflicts = uniqs.set_index(cols[:-1]).loc[conflicts.index].reset_index()
    return uniqs, conflicts

### County Code ⟹ County Name

In [4]:
cn_cols = [ 'cc', 'cn' ]
cc2cn, county_name_conflicts = ambiguous_mappings(c, cn_cols)
assert len(county_name_conflicts) == 0
cc2cn = cc2cn.set_index('cc')['cn']
assert cc2cn.to_dict() == njdot.data.cc2cn
c.groupby(cn_cols)['num'].sum()

cc  cn        
1   Atlantic      199592
2   Bergen        682254
3   Burlington    275382
4   Camden        338799
5   Cape May       67876
6   Cumberland    102238
7   Essex         637442
8   Gloucester    162997
9   Hudson        440379
10  Hunterdon      86997
11  Mercer        280890
12  Middlesex     632026
13  Monmouth      431841
14  Morris        336447
15  Ocean         355144
16  Passaic       406831
17  Salem          38454
18  Somerset      245646
19  Sussex         81456
20  Union         443518
21  Warren         73585
Name: num, dtype: int64

### { County Code, Muni Code, Year } ⟹ Muni Name

In [5]:
mny_keys = [ 'cc', 'mc', 'year' ]
mn_keys = [ 'cc', 'mc' ]
mn_val = 'mn'
mn_cols = mn_keys + [ mn_val ]
mny_cols = mny_keys + [ mn_val ]
mny_uniqs, mny_conflicts = ambiguous_mappings(c, mny_cols)
assert len(mny_conflicts) == 0
mny_uniqs.set_index(mn_cols + ['year'])

cc,mc,mn,year
1,1,Absecon City,2001
1,1,Absecon City,2002
1,1,Absecon City,2003
1,1,Absecon City,2004
1,1,Absecon City,2005
...,...,...,...
21,23,White Twp,2018
21,23,White Twp,2019
21,23,White Twp,2020
21,23,White Twp,2021


### Check NJDOT county/muni codes

#### { County Code, Muni Code } ⟹ Muni Name conflicts

In [6]:
_, muni_name_conflicts = ambiguous_mappings(c, mn_cols)
muni_name_conflicts.set_index(mn_cols)

cc,mc,mn
14,30,Long Hill Twp
14,30,Passaic Twp
15,7,Dover Twp
15,7,Toms River Twp
16,16,West Paterson Boro
16,16,Woodland Park Boro


#### Muni Name transitions

In [7]:
mnys = (
    mny_uniqs
    .merge(muni_name_conflicts, on=mn_cols)
    .set_index(mny_keys)
    .sort_index()
    [mn_val]
)
mnys

cc  mc  year
14  30  2001           Passaic Twp
        2002           Passaic Twp
        2003           Passaic Twp
        2004           Passaic Twp
        2005           Passaic Twp
        2006           Passaic Twp
        2007           Passaic Twp
        2008           Passaic Twp
        2009           Passaic Twp
        2010           Passaic Twp
        2011           Passaic Twp
        2012           Passaic Twp
        2013         Long Hill Twp
        2014         Long Hill Twp
        2015         Long Hill Twp
        2016         Long Hill Twp
        2017           Passaic Twp
        2018           Passaic Twp
        2019           Passaic Twp
        2021           Passaic Twp
        2022           Passaic Twp
15  7   2001             Dover Twp
        2002             Dover Twp
        2003             Dover Twp
        2004             Dover Twp
        2005             Dover Twp
        2006             Dover Twp
        2007             Dover Twp
       

In [8]:
mnys[mnys != mnys.shift(1)]

cc  mc  year
14  30  2013         Long Hill Twp
        2017           Passaic Twp
15  7   2001             Dover Twp
        2008        Toms River Twp
16  16  2001    West Paterson Boro
        2008    Woodland Park Boro
Name: mn, dtype: string

### Use 2021 muni names as canonical

In [9]:
mn21 = (
    mny_uniqs
    [['cc', 'mc', 'mn', 'year']]
    .groupby(['cc', 'mc'])
    .apply(lambda df: df.sort_values('year').iloc[-1].mn)
    .rename('mn')
    .reset_index()
    .merge(cc2cn, left_on='cc', right_index=True, how='left', validate='m:1')
    [['cc', 'cn', 'mc', 'mn']]
)

# Fix typos / normalize muni names
mn_nits = {
    'Mount Ephriam Boro': 'Mount Ephraim Boro',
    'Ho Ho Kus Boro': 'Ho-Ho-Kus Boro',
    'Lower Alloways Crk': 'Lower Alloways Creek',
    'Sandvston Twp': 'Sandyston Twp',
    'Passaic Twp': 'Long Hill Twp',  # renamed in 1992; https://en.wikipedia.org/wiki/Long_Hill_Township,_New_Jersey
    'Orange City': 'Orange Twp',
    'Avon-By-The-Sea Boro': 'Avon-by-the-Sea Boro',
    'South Orange Village Twp': 'South Orange Village',
    'Pt Pleasant Beach Boro': 'Point Pleasant Beach Boro',
    'South Orange Village Twp': 'South Orange Twp',
    'Lower Alloways Crk Twp': 'Lower Alloways Creek Twp',
    # Princeton Twp and Boro merged in 2013: https://en.wikipedia.org/wiki/Princeton,_New_Jersey
    'Princeton Twp': 'Princeton',
    'Princeton Boro': 'Princeton',
    # Dissolved/Merged in 2022: https://en.wikipedia.org/wiki/Pine_Valley,_New_Jersey
    'Pine Valley Boro': 'Pine Hill Boro',
    # Dissolved/Merged in 1997: https://en.wikipedia.org/wiki/Pahaquarry_Township,_New_Jersey
    'Pahaquarry Twp': 'Hardwick Twp',
}
for src, dst in mn_nits.items():
    mn21['mn'] = mn21['mn'].replace(src, dst, regex=False)
mn21

Unnamed: 0,cc,cn,mc,mn
0,1,Atlantic,1,Absecon City
1,1,Atlantic,2,Atlantic City
2,1,Atlantic,3,Brigantine City
3,1,Atlantic,4,Buena Boro
4,1,Atlantic,5,Buena Vista Twp
...,...,...,...,...
563,21,Warren,19,Phillipsburg Town
564,21,Warren,20,Pohatcong Twp
565,21,Warren,21,Washington Boro
566,21,Warren,22,Washington Twp


## Load NJSP county/muni codes

In [10]:
suffixes = [ 'Boro', 'City', 'Village', 'Twp', 'Town', ]

In [11]:
from njsp.cli.update_pqts import get_crashes_df

renames = {
    'CCODE': 'cc',
    'MCODE': 'mc',
    'CNAME': 'cn',
    'MNAME': 'mn',
    'FATALITIES': 'tk',
    'INJURIES': 'ti',
    'FATAL_D': 'dk',
    'FATAL_P': 'ok',
    'FATAL_T': 'pk',
    'FATAL_B': 'bk',
    **{
        c: c.lower()
        for c in ['STREET', 'HIGHWAY', 'LOCATION']
    },
}

def parse_mc(r):
    assert r.mc[:2] == r.cc
    return r.mc[2:]

sp = get_crashes_df()[0].rename(columns=renames)
sp['mc'] = sp.apply(parse_mc, axis=1)
sp = sp.astype({ 'cc': int, 'mc': int })
sp = sp[['dt'] + list(renames.values())]
sp['mn'] = sp.mn.replace(' Twsp?$', ' Twp', regex=True)

for tpe in suffixes:
    full_suffix = f' {tpe}'
    for idx in range(1, len(tpe)):
        suffix = f' {tpe[:idx]}$'
        sp['mn'] = sp.mn.replace(suffix, full_suffix, regex=True)

for src, dst in {
    'Easthampton Twp': 'Eastampton Twp',
    'Hohokus Boro': 'Ho-Ho-Kus Boro',
    'Ridgewood Twp': 'Ridgewood Village',
    'Ridgefield Park Twp': 'Ridgefield Park Village',
    'Parsippany-Troy Hil': 'Parsippany-Troy Hills',
    'Lower Alloways Cree': 'Lower Alloways Creek',
    'Orange City': 'Orange Twp',
    'Avon-By-The-Sea Boro': 'Avon-by-the-Sea Boro',
    'South Orange Village': 'South Orange Twp', 
    'Point Pleasant Beac': 'Point Pleasant Beach',
}.items():
    sp['mn'] = sp.mn.replace(src, dst, regex=False)
sp

      accidents  injuries  fatalities
year                                 
2008        555       414         590
2009        550       352         584
2010        530       366         556
2011        586       517         627
2012        553       382         589
2013        508       393         542
2014        523       345         556
2015        522       374         562
2016        570       398         602
2017        591       368         624
2018        524       358         563
2019        524       340         558
2020        550       263         587
2021        667       397         697
2022        646       463         689
2023        574       371         606
2024        654       220         691
2025         91        14          97
      CCODE     CNAME MCODE               MNAME HIGHWAY  \
ACCID                                                     
1703     01  Atlantic  0102       Atlantic City     446   
1681     09    Hudson  0910          Union City     NaN   
1659

Unnamed: 0_level_0,dt,cc,mc,cn,mn,tk,ti,dk,ok,pk,bk,street,highway,location
ACCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1703,2008-01-01 00:35:00-05:00,1,2,Atlantic,Atlantic City,1.0,1.0,,,,,,446,State/Interstate Authority 446 S MP 1
1681,2008-01-01 04:11:00-05:00,9,10,Hudson,Union City,1.0,,,,,,Bergenline Ave,,Bergenline Ave S MP 0 at 6th St
1659,2008-01-01 06:46:00-05:00,4,15,Camden,Gloucester Twp,1.0,1.0,,,,,,42,State Highway 42 N MP 8.2
1661,2008-01-01 12:29:00-05:00,20,4,Union,Elizabeth City,1.0,1.0,,,,,,624,County 624 W MP 2.2 at Ikea Dr
1811,2008-01-01 18:53:00-05:00,7,16,Essex,Nutley Town,1.0,,,,,,,648,County 648 E MP .87 at Franklin Ave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13799,2025-03-09 18:15:00-04:00,11,3,Mercer,Hamilton Twp,1.0,,1.0,0.0,0.0,0.0,,650,County 650
13797,2025-03-10 07:38:00-04:00,9,6,Hudson,Jersey City,1.0,0.0,0.0,0.0,1.0,0.0,Summit Ave,,Summit Ave
13802,2025-03-10 16:20:00-04:00,16,16,Passaic,Woodland Park Boro,1.0,,0.0,0.0,0.0,1.0,,639,County 639
13801,2025-03-11 00:11:00-04:00,20,9,Union,Linden City,1.0,,1.0,0.0,0.0,0.0,,95,New Jersey Turnpike MP 98


## Load NJGIN muni codes
See [Municipal_Boundaries_of_NJ.geojson]('../www/public/Municipal_Boundaries_of_NJ.geojson'), downloaded from [this NJGIN ArcGIS layer](https://njogis-newjersey.opendata.arcgis.com/datasets/3d5d1db8a1b34b418c331f4ce1fd0fef/explore).

In [12]:
from nj_crashes import load_munis_geojson
mdf = load_munis_geojson().reset_index()
mn2 = mdf.NAME.rename('mn')
for src, dst in {
    'Borough': 'Boro',
    'Township': 'Twp',
}.items():
    mn2 = mn2.replace(f' {src}$', f' {dst}', regex=True)
for src, dst in {
    'South Orange Village Twp': 'South Orange Twp',
    'Boonton': 'Boonton Town',
    'City of Orange Twp': 'Orange Twp',
}.items():
    mn2 = mn2.replace(src, dst, regex=False)

cc2 = mdf.cc
cn2 = mdf.COUNTY.str.title().rename('cn')
mc2 = mdf.mc
mc2

Running: dvc pull www/public/Municipal_Boundaries_of_NJ.geojson


A       www/public/Municipal_Boundaries_of_NJ.geojson
1 file added and 1 file fetched


0       3
1      12
2       2
3      15
4      13
       ..
559    22
560    27
561    12
562     8
563    21
Name: mc, Length: 564, dtype: int64

## Align county and muni codes (NJDOT, NJSP, NJGIN)

In [13]:
def split_stem_suffix(r):
    for suffix in suffixes:
        if r.mn.endswith(f' {suffix}'):
            return Series(dict(
                stem=r.mn[:-(len(suffix) + 1)],
                type=suffix,
            ))
    return dict(stem=r.mn, type=None)

def add_stems(df, id_name):
    df = df[['cc', 'cn', 'mc', 'mn']].drop_duplicates()
    df = sxs(df, df.apply(split_stem_suffix, axis=1)).sort_values(['cc', 'mc']).reset_index(drop=True)
    dupe_mask = df.duplicated(keep='last', subset=['cc', 'mc'])
    dupes = df[dupe_mask]
    if not dupes.empty:
        all_dupes = df[df.duplicated(keep=False, subset=['cc', 'mc'])]
        err(f"Dropping {len(dupes)} non-last duplicate (cc,mc) entries. All dupes:")
        err(str(all_dupes))
    df = df[~dupe_mask]
    assert df[df.duplicated(keep=False, subset=['cc', 'mc'])].empty
    df.index.name = id_name
    return df

In [14]:
df0 = add_stems(mn21, 'dot')
df0

Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
dot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,Atlantic,1,Absecon City,Absecon,City
1,1,Atlantic,2,Atlantic City,Atlantic,City
2,1,Atlantic,3,Brigantine City,Brigantine,City
3,1,Atlantic,4,Buena Boro,Buena,Boro
4,1,Atlantic,5,Buena Vista Twp,Buena Vista,Twp
...,...,...,...,...,...,...
563,21,Warren,19,Phillipsburg Town,Phillipsburg,Town
564,21,Warren,20,Pohatcong Twp,Pohatcong,Twp
565,21,Warren,21,Washington Boro,Washington,Boro
566,21,Warren,22,Washington Twp,Washington,Twp


In [15]:
df1 = add_stems(sp, 'sp')
df1

Dropping 4 non-last duplicate (cc,mc) entries. All dupes:
     cc        cn  mc                mn             stem  type
270  11    Mercer  10     Princeton Twp        Princeton   Twp
271  11    Mercer  10         Princeton        Princeton  None
273  11    Mercer  12    Washington Twp       Washington   Twp
274  11    Mercer  12  Robbinsville Twp     Robbinsville   Twp
306  13  Monmouth   9   Brielle Borowsp  Brielle Borowsp  None
307  13  Monmouth   9      Brielle Boro          Brielle  Boro
351  14    Morris  10  East Honover Twp     East Honover   Twp
352  14    Morris  10  East Hanover Twp     East Hanover   Twp


Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
sp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,Atlantic,1,Absecon City,Absecon,City
1,1,Atlantic,2,Atlantic City,Atlantic,City
2,1,Atlantic,3,Brigantine City,Brigantine,City
3,1,Atlantic,4,Buena Boro,Buena,Boro
4,1,Atlantic,5,Buena Vista Twp,Buena Vista,Twp
...,...,...,...,...,...,...
515,21,Warren,19,Phillipsburg Town,Phillipsburg,Town
516,21,Warren,20,Pohatcong Twp,Pohatcong,Twp
517,21,Warren,21,Washington Boro,Washington,Boro
518,21,Warren,22,Washington Twp,Washington,Twp


In [16]:
df2 = add_stems(sxs(cc2, cn2, mc2, mn2), 'gin')
df2

Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
gin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,Atlantic,1,Absecon,Absecon,
1,1,Atlantic,2,Atlantic City,Atlantic,City
2,1,Atlantic,3,Brigantine,Brigantine,
3,1,Atlantic,4,Buena Boro,Buena,Boro
4,1,Atlantic,5,Buena Vista Twp,Buena Vista,Twp
...,...,...,...,...,...,...
559,21,Warren,19,Phillipsburg,Phillipsburg,
560,21,Warren,20,Pohatcong Twp,Pohatcong,Twp
561,21,Warren,21,Washington Boro,Washington,Boro
562,21,Warren,22,Washington Twp,Washington,Twp


In [17]:
def align(l, r, validate1='1:1'):
    on = [ 'cn', 'mn', ]
    common = [ 'cc', 'mc', 'type', ]
    cols = [ *on, *common ]
    ln = l.index.name
    rn = r.index.name
    lr1 = (
        l
        .reset_index()
        [[ln] + cols].merge(
            r
            .reset_index()
            [[rn] + cols],
            on=on,
            suffixes=[f'_{ln}', f'_{rn}'],
            validate=validate1,
        )
    )

    lcc = f'cc_{ln}'
    rcc = f'cc_{rn}'
    lmc = f'mc_{ln}'
    rmc = f'mc_{rn}'
    ltc = f'type_{ln}'
    rtc = f'type_{rn}'
    lt = lr1[ltc]
    rt = lr1[rtc]
    
    assert (lr1[lcc] == lr1[rcc]).all()
    types_match = (lt == rt) | (lt.isna() & rt.isna())
    assert (types_match).all(), lr1[~types_match]
    mc_match_hist = (lr1[lmc] == lr1[rmc]).value_counts()
    err(f"Found {len(lr1)} exact ({','.join(on)}) matches from {len(l)} {ln} and {len(r)} {rn} entries")

    cc = lr1[lcc].rename('cc')
    tpe = lr1[ltc].rename('type')
    mcl = lr1[lmc]
    mcr = lr1[rmc]
    m1 = sxs(cc, mcl, mcr, tpe)

    # Filter out exact matches, re-match on "stems"
    l2 = l[~l.index.isin(lr1[ln])]
    r2 = r[~r.index.isin(lr1[rn])]
    l2_dupes = l2[l2.duplicated(keep=False, subset=['cn', 'stem'])]
    r2_dupes = r2[r2.duplicated(keep=False, subset=['cn', 'stem'])]
    assert l2_dupes.empty, f"{len(l2_dupes)} (cn,stem) dupes found:\n{l2_dupes}"
    assert r2_dupes.empty, f"{len(r2_dupes)} (cn,stem) dupes found:\n{r2_dupes}"

    on = [ 'cn', 'stem', ]
    cols2 = [ *on, *common, ]
    lr2 = (
        l2
        .reset_index()
        [[ln] + cols2]
        .merge(
            r2
            .reset_index()
            [[rn] + cols2],
            on=on,
            suffixes=[f'_{ln}', f'_{rn}'],
            validate='1:1',
        )
    )

    assert (lr2[lcc] == lr2[rcc]).all()
    mc_match_hist = (lr2[lmc] == lr2[rmc]).value_counts()
    err(f"Found {len(lr2)} ({','.join(on)}) matches from {len(l2)} {ln} and {len(r2)} {rn} entries")

    l3 = l2[~l2.index.isin(lr2[ln])]
    r3 = r2[~r2.index.isin(lr2[rn])]
    assert l3.empty, f"Found {len(l3)} unaligned items from l:\n{l3}"

    err(f'{ln}: {len(l)} entries, {len(lr1)} exact matches, {len(lr2)} stem matches, {len(l3)} unmatched')
    err(f'{rn}: {len(r)} entries, {len(lr1)} exact matches, {len(lr2)} stem matches, {len(r3)} unmatched')

    cc = lr2[lcc].rename('cc')
    mcl = lr2[lmc]
    mcr = lr2[rmc]
    tcl = lr2[ltc]
    tcr = lr2[rtc]

    m2 = sxs(cc, mcl, mcr)
    m2['type'] = tcr  # default to right `type`
    has_tcl = ~tcl.isna()
    has_tcr = ~tcr.isna()
    m2.loc[has_tcl & ~has_tcr, 'type'] = tcl  # fallback to left `type`
    m2t = m2.merge(r[['cc', 'mc', 'stem']], left_on=['cc', rmc], right_on=['cc', 'mc'], how='left').drop(columns='mc')
    type_conflicts = sxs(m2t.drop(columns='type'), tcl, tcr)[(tcl != tcr) & has_tcl & has_tcr]
    if not type_conflicts.empty:
        err(f"{len(type_conflicts)} conflicting types:")
        err(str(type_conflicts))

    m = pd.concat([m1, m2])
    m = m.merge(r[['cc', 'mc', 'stem']], left_on=['cc', rmc], right_on=['cc', 'mc'], how='left').drop(columns='mc')
    m['mn'] = m.apply(lambda r: r.stem + (f' {r["type"]}' if r["type"] else ''), axis=1)
    err(f"{(m[lmc] != m[rmc]).sum()} mc's don't match")
    return m

In [18]:
m12 = align(df1, df2)
m12

Found 446 exact (cn,mn) matches from 516 sp and 564 gin entries
Found 70 (cn,stem) matches from 70 sp and 118 gin entries
sp: 516 entries, 446 exact matches, 70 stem matches, 0 unmatched
gin: 564 entries, 446 exact matches, 70 stem matches, 48 unmatched
10 conflicting types:
    cc  mc_sp  mc_gin           stem type_sp type_gin
21   7      1       1     Belleville    Town      Twp
22   7      2       2     Bloomfield    Boro      Twp
23   7      4       7      Fairfield    Boro      Twp
25   7      9       9      Irvington    Town      Twp
26   7     13      13      Montclair    Town      Twp
28   7     16      16         Nutley    Town      Twp
29   7     20      20         Verona    Boro      Twp
30   7     21      21  West Caldwell    Boro      Twp
31   7     22      22    West Orange    Town      Twp
48  13     39      18         Hazlet    Boro      Twp
105 mc's don't match


Unnamed: 0,cc,mc_sp,mc_gin,type,stem,mn
0,1,2,2,City,Atlantic,Atlantic City
1,1,4,4,Boro,Buena,Buena Boro
2,1,5,5,Twp,Buena Vista,Buena Vista Twp
3,1,7,7,City,Egg Harbor,Egg Harbor City
4,1,8,8,Twp,Egg Harbor,Egg Harbor Twp
...,...,...,...,...,...,...
511,20,18,18,City,Summit,Summit City
512,20,20,20,Town,Westfield,Westfield Town
513,21,3,3,Town,Belvidere,Belvidere Town
514,21,8,8,Town,Hackettstown,Hackettstown Town


In [19]:
sp2gin = m12[['cc', 'mc_sp', 'mc_gin']]
sp2gin.to_parquet(njsp.paths.MC_PQT)
sp2gin

Unnamed: 0,cc,mc_sp,mc_gin
0,1,2,2
1,1,4,4
2,1,5,5
3,1,7,7
4,1,8,8
...,...,...,...
511,20,18,18
512,20,20,20
513,21,3,3
514,21,8,8


In [20]:
m02 = align(df0, df2, validate1='m:1')
m02

Found 509 exact (cn,mn) matches from 568 dot and 564 gin entries
Found 59 (cn,stem) matches from 59 dot and 59 gin entries
dot: 568 entries, 509 exact matches, 59 stem matches, 0 unmatched
gin: 564 entries, 509 exact matches, 59 stem matches, 0 unmatched
2 conflicting types:
    cc  mc_dot  mc_gin       stem type_dot type_gin
23   7       7       7  Fairfield     Boro      Twp
35  10      20      20    Milford      Twp     Boro
103 mc's don't match


Unnamed: 0,cc,mc_dot,mc_gin,type,stem,mn
0,1,2,2,City,Atlantic,Atlantic City
1,1,4,4,Boro,Buena,Buena Boro
2,1,5,5,Twp,Buena Vista,Buena Vista Twp
3,1,6,6,City,Corbin,Corbin City
4,1,7,7,City,Egg Harbor,Egg Harbor City
...,...,...,...,...,...,...
563,20,18,18,City,Summit,Summit City
564,20,20,20,Town,Westfield,Westfield Town
565,21,3,3,Town,Belvidere,Belvidere Town
566,21,8,8,Town,Hackettstown,Hackettstown Town


In [21]:
dot2gin = m02[['cc', 'mc_dot', 'mc_gin']]
dot2gin.to_parquet(njdot.paths.MC_PQT)
dot2gin

Unnamed: 0,cc,mc_dot,mc_gin
0,1,2,2
1,1,4,4
2,1,5,5
3,1,6,6
4,1,7,7
...,...,...,...
563,20,18,18
564,20,20,20
565,21,3,3
566,21,8,8


In [22]:
m = (
    m12
    .merge(
        m02,
        on=['cc', 'mc_gin'],
        how='outer',
        suffixes=['_sp', '_dot']
    )
    .sort_values(['cc', 'mc_gin'])
    .astype({
        'mc_sp': 'Int8',
        'mc_dot': 'Int8',
    })
)
assert ((m.stem_sp == m.stem_dot) | m.stem_sp.isna() | m.stem_dot.isna()).all()
assert ((m.type_sp == m.type_dot) | m.type_sp.isna() | m.type_dot.isna()).all()

m['stem'] = m['stem_sp']
m.loc[m.stem.isna() & ~m.stem_dot.isna(), 'stem'] = m.stem_dot
m['type'] = m['type_sp']
m.loc[m.type.isna() & ~m.type_dot.isna(), 'type'] = m.type_dot
m = m.drop(columns=[
    f'{c}_{t}'
    for c in [ 'stem', 'type', 'mn', ]
    for t in [ 'dot', 'sp', ]
])
m['mn'] = m.apply(lambda r: r.stem + (f' {r["type"]}' if r["type"] else ''), axis=1)
m = m.merge(cc2cn, left_on='cc', right_index=True, how='left', validate='m:1')
m = m[['cc', 'cn', 'mc_gin', 'mc_dot', 'mc_sp', 'mn', 'stem', 'type']]
m

Unnamed: 0,cc,cn,mc_gin,mc_dot,mc_sp,mn,stem,type
450,1,Atlantic,1,1,1,Absecon City,Absecon,City
0,1,Atlantic,2,2,2,Atlantic City,Atlantic,City
451,1,Atlantic,3,3,3,Brigantine City,Brigantine,City
1,1,Atlantic,4,4,4,Buena Boro,Buena,Boro
2,1,Atlantic,5,5,5,Buena Vista Twp,Buena Vista,Twp
...,...,...,...,...,...,...,...,...
519,21,Warren,19,19,19,Phillipsburg Town,Phillipsburg,Town
446,21,Warren,20,20,20,Pohatcong Twp,Pohatcong,Twp
447,21,Warren,21,21,21,Washington Boro,Washington,Boro
448,21,Warren,22,22,22,Washington Twp,Washington,Twp


In [23]:
m[m.type.isna()]

Unnamed: 0,cc,cn,mc_gin,mc_dot,mc_sp,mn,stem,type
229,11,Mercer,14,9,10,Princeton,Princeton,
230,11,Mercer,14,10,10,Princeton,Princeton,
231,11,Mercer,14,14,10,Princeton,Princeton,


In [24]:
m.type.value_counts(dropna=False)

type
Boro       253
Twp        242
City        52
Town        15
Village      3
None         3
Name: count, dtype: int64

In [25]:
pd.crosstab((m.mc_sp == m.mc_gin).rename('sp == gin'), (m.mc_dot == m.mc_gin).rename('dot == gin'))

dot == gin,False,True
sp == gin,Unnamed: 1_level_1,Unnamed: 2_level_1
False,76,31
True,10,403


## Drop "Township"/"City"/etc. suffixes
Except in a few cases, e.g. "Jersey City"

In [26]:
df2[df2.mn == 'Princeton']

Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
gin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
294,11,Mercer,14,Princeton,Princeton,


Preserve these cities' full names

In [27]:
city_stems = [ 'Atlantic', 'Jersey', 'Ocean', 'Union', ]
cities = [ f'{stem} City' for stem in city_stems ]
cities

['Atlantic City', 'Jersey City', 'Ocean City', 'Union City']

In [28]:
city_full_mask = df2.mn.isin(cities)
df2[city_full_mask]

Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
gin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,Atlantic,2,Atlantic City,Atlantic,City
176,5,Cape May,8,Ocean City,Ocean,City
250,9,Hudson,6,Jersey City,Jersey,City
254,9,Hudson,10,Union City,Union,City


In [29]:
cnn_dupe_mask = df2.duplicated(['cc', 'stem'], keep=False)
cnn_dupes = df2[cnn_dupe_mask]
cnn_dupes

Unnamed: 0_level_0,cc,cn,mc,mn,stem,type
gin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,1,Atlantic,7,Egg Harbor City,Egg Harbor,City
7,1,Atlantic,8,Egg Harbor Twp,Egg Harbor,Twp
95,3,Burlington,3,Bordentown,Bordentown,
96,3,Burlington,4,Bordentown Twp,Bordentown,Twp
97,3,Burlington,5,Burlington,Burlington,
98,3,Burlington,6,Burlington Twp,Burlington,Twp
120,3,Burlington,28,Pemberton Boro,Pemberton,Boro
121,3,Burlington,29,Pemberton Twp,Pemberton,Twp
137,4,Camden,5,Berlin Boro,Berlin,Boro
138,4,Camden,6,Berlin Twp,Berlin,Twp


In [30]:
full_name_mask = city_full_mask | cnn_dupe_mask
names = df2.copy()
names['name'] = names.stem
names.loc[full_name_mask, 'name'] = names.loc[full_name_mask, 'mn']
names = names.drop(columns='mn')
names

Unnamed: 0_level_0,cc,cn,mc,stem,type,name
gin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,Atlantic,1,Absecon,,Absecon
1,1,Atlantic,2,Atlantic,City,Atlantic City
2,1,Atlantic,3,Brigantine,,Brigantine
3,1,Atlantic,4,Buena,Boro,Buena
4,1,Atlantic,5,Buena Vista,Twp,Buena Vista
...,...,...,...,...,...,...
559,21,Warren,19,Phillipsburg,,Phillipsburg
560,21,Warren,20,Pohatcong,Twp,Pohatcong
561,21,Warren,21,Washington,Boro,Washington Boro
562,21,Warren,22,Washington,Twp,Washington Twp


In [31]:
names.to_parquet(COUNTY_CITY_CODES_PQT)

## Build cc2mc2mn dict, write as JSON

In [32]:
def county_obj(df):
    return dict(
        cn=singleton(df.cn.tolist()),
        mc2mn=df.set_index('mc')['name'].to_dict(),
    )
    
cc2mc2mn = (
    names
    .groupby('cc')
    .apply(county_obj)
    .to_dict()
)
cc2mc2mn

{1: {'cn': 'Atlantic',
  'mc2mn': {1: 'Absecon',
   2: 'Atlantic City',
   3: 'Brigantine',
   4: 'Buena',
   5: 'Buena Vista',
   6: 'Corbin',
   7: 'Egg Harbor City',
   8: 'Egg Harbor Twp',
   9: 'Estell Manor',
   10: 'Folsom',
   11: 'Galloway',
   12: 'Hamilton',
   13: 'Hammonton',
   14: 'Linwood',
   15: 'Longport',
   16: 'Margate',
   17: 'Mullica',
   18: 'Northfield',
   19: 'Pleasantville',
   20: 'Port Republic',
   21: 'Somers Point',
   22: 'Ventnor',
   23: 'Weymouth'}},
 2: {'cn': 'Bergen',
  'mc2mn': {1: 'Allendale',
   2: 'Alpine',
   3: 'Bergenfield',
   4: 'Bogota',
   5: 'Carlstadt',
   6: 'Cliffside Park',
   7: 'Closter',
   8: 'Cresskill',
   9: 'Demarest',
   10: 'Dumont',
   11: 'Elmwood Park',
   12: 'East Rutherford',
   13: 'Edgewater',
   14: 'Emerson',
   15: 'Englewood',
   16: 'Englewood Cliffs',
   17: 'Fair Lawn',
   18: 'Fairview',
   19: 'Fort Lee',
   20: 'Franklin Lakes',
   21: 'Garfield',
   22: 'Glen Rock',
   23: 'Hackensack',
   24: 'Harri

In [33]:
with open(CC2MC2MN, 'w') as f:
    json.dump(cc2mc2mn, f, indent=2)