Parse the addresses, then have a sequential matching mechanism:
1. First match on number
2. Then out of that set match on closes street

In [14]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search


In [1]:
import numpy as np
import pandas as pd

from address_normalize.paths import DATA_DIR

raw_dir = DATA_DIR / 'raw'

files = list(raw_dir.glob('Master Project Address List*.xlsx'))
files.sort()

file_path = files[-1]

data = pd.read_excel(file_path, sheet_name=['E911 Data', 'Phase_I', 'Phase_II', 'Phase_III'])

df_e911 = data['E911 Data']
main_df = data['Phase_I']

In [2]:
main_df.head()

Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [3]:
df_e911['E911 ADDRESS']

0                32 BOOTH WOODS
1                        8 E St
2                  73 SCHOOL St
3                   206 MAIN St
4                   224 MAIN St
                  ...          
17430    574 ROBEROUTE YOUNG RD
17431                          
17432          151 VT R0UTE 116
17433         71 SPRING HILL Ln
17434               14 TATRO RD
Name: E911 ADDRESS, Length: 17435, dtype: object

In [4]:
sents = df_e911['E911 ADDRESS'].tolist()

In [5]:
df_e911.head()

Unnamed: 0,OBJECTID_1,E911 ADDRESS,OBJECTID,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST CITY,GRAND LIST STATE,GRAND LIST ZIPCODE,SPAN,...,GLVAL_HS,GLVAL_NR,CRHOUSPCT,MUNGL1PCT,AOEGL_HS,AOEGL_NR,SHAPESTAre,SHAPESTLen,Shape_Length,Shape_Area
0,1,32 BOOTH WOODS,18621475,STAPLES WAYNE,MCENTEE SHEILA,32 BOOTH WOODS,VERGENNES,VT,5491,663-210-10001,...,1837,0,0,1837,1837,0,1231.173523,145.763504,145.763504,1231.173529
1,2,8 E St,18621476,LINCOLN GERONIMO LLC,,1341 ARNOLD BAY ROAD,PANTON,VT,5491,663-210-10002,...,0,1755,0,1755,0,1755,532.200386,92.414705,92.414705,532.200394
2,3,73 SCHOOL St,18621477,WHITNEY GRACE,,PO BOX 541,CROTON FALLS,NY,10519,663-210-10003,...,0,2213,0,2213,0,2213,880.616867,118.855189,118.855189,880.616861
3,4,206 MAIN St,18621478,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10004,...,0,2988,0,3284,0,2988,709.60244,109.360601,109.360601,709.602436
4,5,224 MAIN St,18621479,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10005,...,0,3314,0,3682,0,3314,1242.662354,167.708389,167.708389,1242.662357


In [11]:
corpus_sents = df_e911['E911 ADDRESS'].tolist()
query_sents = main_df['ADDRESS'].tolist()
print(corpus_sents[:10])
print(query_sents[:10])

['32 BOOTH WOODS', '8 E St', '73 SCHOOL St', '206 MAIN St', '224 MAIN St', '38 W St', '23 SUNSET Dr', 'PANTON RD', '67 NEW HAVEN RD', 'PANTON RD']
['114 S BINGHAM ST', '1896 OLD JERUSALEM RD', '14 CHURCH ST', '24 PARK LN', '156 MIDDLE RD', '82 OLIVER HOWE CT', '112 SCHOOL RD', '289 CROSS RD', '466 N ORWELL RD', '79 LEMON FAIR RD']


In [12]:
model = SentenceTransformer('arinze/address-match-abp-v2')

In [13]:
corpus_embeddings = model.encode(corpus_sents)
query_embeddings = model.encode(query_sents)


In [31]:
matches = semantic_search(query_embeddings, corpus_embeddings, top_k=3)

In [37]:
matches_df = (
    pd.DataFrame(zip(query_sents, matches), columns=['query_sent', 'matches'])
    .explode('matches')
)

matches_df = matches_df.join(pd.json_normalize(matches_df.matches))
matches_df['corpus_sent'] = matches_df.corpus_id.apply(lambda x: corpus_sents[x])


In [47]:
del matches_df['matches']

In [48]:
matches_df.head()

Unnamed: 0,query_sent,corpus_id,score,corpus_sent
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
1,1896 OLD JERUSALEM RD,4940,0.905154,101 S BINGHAM ST
1,1896 OLD JERUSALEM RD,4940,0.905154,101 S BINGHAM ST


In [49]:
(
    matches_df
    .groupby(by='query_sent')
    .agg({'corpus_sent': lambda x: x.tolist(), 'score': lambda x: x.tolist(), 'corpus_id': lambda x: x.tolist()})
)

Unnamed: 0_level_0,corpus_sent,score,corpus_id
query_sent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 MORSE RD,"[36 MAIN St, 36 MAIN St, 36 MAIN St]","[0.8627626299858093, 0.8627626299858093, 0.862...","[3266, 3266, 3266]"
0 N BINGHAM ST,"[426 CUTTING HILL RD, 426 CUTTING HILL RD, 426...","[0.9999999403953552, 0.9999999403953552, 0.999...","[16210, 16210, 16210]"
1 COOK RD,"[37 SHACKSBORO RD, 37 SHACKSBORO RD, 37 SHACKS...","[0.9999997615814209, 0.9999997615814209, 0.999...","[16147, 16147, 16147]"
10 DESROCHER LN,"[2719 LAKE ST, 2719 LAKE ST, 2719 LAKE ST]","[1.0, 1.0, 1.0]","[16318, 16318, 16318]"
10 HIBBARD HILL RD,"[730 TULLEY RD, 730 TULLEY RD, 730 TULLEY RD]","[0.7145191431045532, 0.7145191431045532, 0.714...","[5249, 5249, 5249]"
...,...,...,...
99 VT ROUTE 73,"[187 BARNES RD, 187 BARNES RD, 187 BARNES RD]","[1.0000001192092896, 1.0000001192092896, 1.000...","[1580, 1580, 1580]"
995 VT ROUTE 22A,"[85 CIDER MILL RD, 85 CIDER MILL RD, 85 CIDER ...","[0.8551252484321594, 0.8551252484321594, 0.855...","[5070, 5070, 5070]"
997 SHOREHAM DEPOT RD,"[1148 CIDER MILL RD, 1148 CIDER MILL RD, 1148 ...","[0.8836921453475952, 0.8836921453475952, 0.883...","[5032, 5032, 5032]"
998 N CREAM HILL RD,"[288 SCHOOL ST, 288 SCHOOL ST, 288 SCHOOL ST]","[0.8492026329040527, 0.8492026329040527, 0.849...","[16057, 16057, 16057]"


In [44]:
matches_df.to_csv(DATA_DIR / 'processed' / 'match_df.csv')

Unnamed: 0,query_sent,matches,corpus_id,score,corpus_sent
0,114 S BINGHAM ST,"{'corpus_id': 4895, 'score': 0.9999999403953552}",4895,1.000000,114 S BINGHAM ST
0,114 S BINGHAM ST,"{'corpus_id': 4940, 'score': 0.9051543474197388}",4895,1.000000,114 S BINGHAM ST
0,114 S BINGHAM ST,"{'corpus_id': 5181, 'score': 0.8740324378013611}",4895,1.000000,114 S BINGHAM ST
1,1896 OLD JERUSALEM RD,"{'corpus_id': 7790, 'score': 1.0}",4940,0.905154,101 S BINGHAM ST
1,1896 OLD JERUSALEM RD,"{'corpus_id': 7914, 'score': 0.8958316445350647}",4940,0.905154,101 S BINGHAM ST
...,...,...,...,...,...
2022,152 VT ROUTE 125,"{'corpus_id': 2844, 'score': 0.8332019448280334}",14014,1.000000,25 SAMPSON RD
2022,152 VT ROUTE 125,"{'corpus_id': 1035, 'score': 0.8295897245407104}",14014,1.000000,25 SAMPSON RD
2023,229 COW HILL RD,"{'corpus_id': 5129, 'score': 0.8957770466804504}",14249,0.849456,23 SAMPSON RD
2023,229 COW HILL RD,"{'corpus_id': 5177, 'score': 0.817196786403656}",14249,0.849456,23 SAMPSON RD
