Clean all the addresses

In [1]:
import numpy as np
import pandas as pd
import re
import usaddress

from address_normalize.model import Model
from address_normalize.paths import DATA_DIR
from address_normalize.utils import jaccard_similarity, load_corpus


In [2]:
# Build the project's address model with its default settings. Later cells call
# this object directly on a list of address strings.
model = Model()

In [3]:
# Locate the raw workbook. More than one matching export may exist, so the one
# whose path sorts last is used.
raw_dir = DATA_DIR / 'raw'

files = sorted(raw_dir.glob('Master Project Address List*.xlsx'))
if not files:
    # Fail with an actionable message rather than a bare IndexError on files[-1].
    raise FileNotFoundError(
        f"No 'Master Project Address List*.xlsx' workbook found in {raw_dir}"
    )

file_path = files[-1]

# A list of sheet names makes read_excel return a dict of DataFrames keyed by sheet name.
data = pd.read_excel(file_path, sheet_name=['Phase_I', 'Phase_II', 'Phase_III'])

# NOTE(review): the corpus is stored as a .pkl file; presumably load_corpus
# unpickles it, so only load corpus files from a trusted source.
corpus_dict = load_corpus(DATA_DIR / 'corpus_dict.pkl')
print(len(corpus_dict))

3802


In [4]:
# Phase I is kept as its own frame.
phase1_df = data['Phase_I']

# Phase II and Phase III are stacked into a single "pending" frame.
# ignore_index=True renumbers the rows 0..n-1; without it both sheets keep their
# own 0-based labels, so index labels repeat and no longer line up with
# positionally indexed results built from this frame's columns.
pending_df = pd.concat([data['Phase_II'], data['Phase_III']], ignore_index=True)

In [5]:
# Sanity check: print (rows, columns), then display the first rows of the
# Phase I sheet as the cell's output.
print(phase1_df.shape)
phase1_df.head()

(2024, 34)


Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [6]:
# Sanity check: print (rows, columns), then display the first rows of the
# combined Phase II / Phase III frame as the cell's output.
print(pending_df.shape)
pending_df.head()

(7465, 12)


Unnamed: 0,REF,ESITEID,Address,Town,ZIP,PON Boundary,Status,Service Speed,Eligible,LAT,LONG,Build Phase
0,1,264978,160 GREEN ST,VERGENNES,5491,LCC VERGENNES-3,Served,Served 100/20,,44.161107,-73.245427,Phase II
1,2,265443,26 THOMAS CIR,VERGENNES,5491,LCC VERGENNES-2,Served,Served 100/20,,44.156948,-73.245608,Phase II
2,3,265008,22 SUNSET DR,VERGENNES,5491,LCC VERGENNES-3,Served,Served 100/20,,44.160163,-73.245543,Phase II
3,4,184727,99 GRISWOLD LN,ORWELL,5760,LCC ORWELL-5,Underserved,Served 4/1,Eligible,43.815731,-73.247273,Phase II
4,5,139558,411 MOUNTAINVIEW DR,LEICESTER,5733,LCC LEICESTER-5,Served,Served 100/20,,43.878902,-73.070688,Phase II


In [7]:
# Pull the address column out of each frame as a plain Python list.
# Note the column name is capitalised differently in the two frames.
phase1_sents = phase1_df.loc[:, 'ADDRESS'].to_list()
pending_phase_sents = pending_df.loc[:, 'Address'].to_list()

# Peek at the first ten entries of each list, one list per line.
print(phase1_sents[:10], pending_phase_sents[:10], sep='\n')

['114 S BINGHAM ST', '1896 OLD JERUSALEM RD', '14 CHURCH ST', '24 PARK LN', '156 MIDDLE RD', '82 OLIVER HOWE CT', '112 SCHOOL RD', '289 CROSS RD', '466 N ORWELL RD', '79 LEMON FAIR RD']
['160 GREEN ST', '26 THOMAS CIR', '22 SUNSET DR', '99 GRISWOLD LN', '411 MOUNTAINVIEW DR', '204 WALKER RD', '4215 SAND RD', '0 SWALLOW DR', '5592 US ROUTE 7', '178 HORTON RD']


In [8]:
# Run the model over every Phase I address. The result is loaded into a
# three-column frame: the input address, a match score, and the matched
# e911 address.
phase1_matches = model(phase1_sents)

phase1_matches_df = pd.DataFrame(phase1_matches, columns=['phase_1_address', 'score', 'e911_address'])

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (matters on a fresh checkout).
processed_dir = DATA_DIR / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)
phase1_matches_df.to_csv(processed_dir / 'phase1_matches_df.csv', index=False)

# Display the 25 highest-scoring matches.
phase1_matches_df.sort_values(by='score', ascending=False).head(25)


No address number match for 6335 vt route 22a
No address number match for 6326 vt route 22a
No address number match for 4823 vt route 74 w
No address number match for 4821 vt route 74 w
No address number match for 4831 vt route 74 w
No address number match for 4718 vt route 74 w
No address number match for 5866 vt route 22a
No address number match for 3597 vt route 74 w
No address number match for 3593 vt route 74 w
No address number match for 5834 vt route 22a
No address number match for 3591 vt route 74 w
No address number match for 5625 vt route 22a
No address number match for 2973 hemenway hill rd
No address number match for 3474 richville rd
No address number match for 3539 richville rd
No address number match for 1358 n orwell rd
No address number match for 1990 richville rd
No address number match for 1142 school st
No address number match for 1300 smith st
No address number match for 3121 watch point rd
No address number match for 2390 leland rd
No address number match for 3327

Unnamed: 0,phase_1_address,score,e911_address
0,114 s bingham st,1.0,114 S BINGHAM ST
756,410 delong ln,1.0,410 DELONG LN
762,19 n bingham st,1.0,19 N BINGHAM ST
761,125 morse rd,1.0,125 MORSE RD
760,1402 sperry rd,1.0,1402 SPERRY RD
759,1027 halladay rd,1.0,1027 HALLADAY RD
758,732 n cream hill rd,1.0,732 N CREAM HILL RD
757,685 lapham bay rd,1.0,685 LAPHAM BAY RD
755,241 school st,1.0,241 SCHOOL ST
764,620 n cream hill rd,1.0,620 N CREAM HILL RD


In [9]:
# Run the model over every Phase II / Phase III address. The result is loaded
# into a three-column frame: the input address, a match score, and the matched
# e911 address.
pending_phase = model(pending_phase_sents)

pending_phase_df = pd.DataFrame(pending_phase, columns=['address', 'score', 'e911_address'])

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (matters on a fresh checkout).
processed_dir = DATA_DIR / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)
pending_phase_df.to_csv(processed_dir / 'pending_phase_df.csv', index=False)

# Display the 25 highest-scoring matches.
pending_phase_df.sort_values(by='score', ascending=False).head(25)


No address number match for 0 swallow dr
No address number match for 1314 little chicago rd
No address number match for 2680 maple st
No address number match for 5340 sand rd
No address number match for 0 keewaydin way
No address number match for 3675 monkton rd
No address number match for 3014 hawkins rd
No address number match for 3064 plank rd
No address number match for 5075 us route 7
No address number match for 0 lake dunmore rd
No address number match for 1389 hooker rd
No address number match for 0 lake dunmore rd
No address number match for 2819 maple st
No address number match for 0 lake dunmore rd
No address number match for 1424 hooker rd
No address number match for 787 kingsland bay state park rd
No address number match for 1522 maple st
No address number match for 3208 us route 7
No address number match for 0 lake dunmore rd
No address number match for 3778 monkton rd
No address number match for 4067 monkton rd
No address number match for 7569 ethan allen hwy
No address n

Unnamed: 0,address,score,e911_address
0,160 green st,1.0,160 GREEN St
6231,44 birchard park,1.0,44 BIRCHARD PARK
6225,527 buttolph dr,1.0,527 BUTTOLPH Dr
6226,2776 upper plains rd,1.0,2776 UPPER PLAINS RD
3491,1646 hooker rd,1.0,1646 HOOKER RD
3490,404 underwood ln,1.0,404 UNDERWOOD LN
6227,194 billings farm rd,1.0,194 BILLINGS FARM RD
3488,329 mountainview dr,1.0,329 MOUNTAINVIEW DR
6228,106 s main st,1.0,106 S MAIN St
3484,145 lake dunmore rd,1.0,145 LAKE DUNMORE RD
