Parse the addresses, then have a sequential matching mechanism:
1. First match on number
2. Then out of that set match on closes street

In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search


In [27]:
import numpy as np
import pandas as pd
import re
import usaddress


from address_normalize.paths import DATA_DIR

raw_dir = DATA_DIR / 'raw'

files = list(raw_dir.glob('Master Project Address List*.xlsx'))
files.sort()

file_path = files[-1]

data = pd.read_excel(file_path, sheet_name=['E911 Data', 'Phase_I', 'Phase_II', 'Phase_III'])

df_e911 = data['E911 Data']
main_df = data['Phase_I']

In [3]:
main_df.head()

Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [4]:
df_e911['E911 ADDRESS']

0                32 BOOTH WOODS
1                        8 E St
2                  73 SCHOOL St
3                   206 MAIN St
4                   224 MAIN St
                  ...          
17430    574 ROBEROUTE YOUNG RD
17431                          
17432          151 VT R0UTE 116
17433         71 SPRING HILL Ln
17434               14 TATRO RD
Name: E911 ADDRESS, Length: 17435, dtype: object

In [5]:
sents = df_e911['E911 ADDRESS'].tolist()

In [6]:
df_e911.head()

Unnamed: 0,OBJECTID_1,E911 ADDRESS,OBJECTID,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST CITY,GRAND LIST STATE,GRAND LIST ZIPCODE,SPAN,...,GLVAL_HS,GLVAL_NR,CRHOUSPCT,MUNGL1PCT,AOEGL_HS,AOEGL_NR,SHAPESTAre,SHAPESTLen,Shape_Length,Shape_Area
0,1,32 BOOTH WOODS,18621475,STAPLES WAYNE,MCENTEE SHEILA,32 BOOTH WOODS,VERGENNES,VT,5491,663-210-10001,...,1837,0,0,1837,1837,0,1231.173523,145.763504,145.763504,1231.173529
1,2,8 E St,18621476,LINCOLN GERONIMO LLC,,1341 ARNOLD BAY ROAD,PANTON,VT,5491,663-210-10002,...,0,1755,0,1755,0,1755,532.200386,92.414705,92.414705,532.200394
2,3,73 SCHOOL St,18621477,WHITNEY GRACE,,PO BOX 541,CROTON FALLS,NY,10519,663-210-10003,...,0,2213,0,2213,0,2213,880.616867,118.855189,118.855189,880.616861
3,4,206 MAIN St,18621478,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10004,...,0,2988,0,3284,0,2988,709.60244,109.360601,109.360601,709.602436
4,5,224 MAIN St,18621479,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10005,...,0,3314,0,3682,0,3314,1242.662354,167.708389,167.708389,1242.662357


In [7]:
corpus_sents = df_e911['E911 ADDRESS'].tolist()
query_sents = main_df['ADDRESS'].tolist()
print(corpus_sents[:10])
print(query_sents[:10])

['32 BOOTH WOODS', '8 E St', '73 SCHOOL St', '206 MAIN St', '224 MAIN St', '38 W St', '23 SUNSET Dr', 'PANTON RD', '67 NEW HAVEN RD', 'PANTON RD']
['114 S BINGHAM ST', '1896 OLD JERUSALEM RD', '14 CHURCH ST', '24 PARK LN', '156 MIDDLE RD', '82 OLIVER HOWE CT', '112 SCHOOL RD', '289 CROSS RD', '466 N ORWELL RD', '79 LEMON FAIR RD']


In [16]:
usaddress.tag(corpus_sents[0])[0]

OrderedDict([('AddressNumber', '32'), ('StreetName', 'BOOTH WOODS')])

In [28]:
test = '3407A&B N 116 RD'
re.match(r'(\d+)(.*)', test).groups()

('3407', 'A&B N 116 RD')

In [50]:
def address_parser(address):
    address = address.strip().lower()
    try:
        parsed = usaddress.tag(address)[0]
        address_number = parsed['AddressNumber']
        if 'AddressNumber' not in parsed:
            address_number = ''
    except:
        # hack for now, doesn't really work, but will for my purpose
        try:
            address_number, street_name = re.match(r'(\d+)(.*)', address).groups()
        except:
            address_number = ''
            street_name = address

    street_name = address.replace(address_number, '')
    return address_number.strip(), street_name.strip()

In [51]:
corpus_dict = {}

for address in corpus_sents:
    if address.strip() != '':
        an, street_name = address_parser(address)
        if an in corpus_dict:
            if street_name not in corpus_dict:
                corpus_dict[an]['street_names'].append(street_name)
        else:
            corpus_dict[an] = {}
            corpus_dict[an]['street_names'] = [street_name]

In [52]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [53]:
for address_number in corpus_dict:
    # encoding dupes, but not big enough to matter
    corpus_dict[address_number]['embeddings'] = model.encode(corpus_dict[address_number]['street_names'])

In [54]:
print(len(corpus_dict))

3802


In [56]:
corpus_dict['8']['embeddings']

array([[-5.2162834e-02, -6.8031766e-02, -9.5437237e-05, ...,
         2.8231055e-02, -2.5067197e-02, -4.1093454e-02],
       [-4.5955621e-02,  5.6968041e-02,  2.8331166e-02, ...,
         1.6828598e-02,  1.1414270e-02, -1.3194392e-02],
       [ 5.6897841e-02, -8.1873097e-02,  1.5928879e-02, ...,
         5.7965655e-02,  1.6379334e-02, -9.5717851e-03],
       ...,
       [-5.7484242e-03, -2.5470851e-02,  2.7082184e-02, ...,
         2.6107650e-02, -6.6857100e-02, -2.9040694e-02],
       [-5.2660540e-02,  3.8511250e-02, -3.4262054e-03, ...,
         1.5467446e-02, -4.5267146e-02, -2.3198834e-02],
       [ 5.4433313e-03,  6.6981941e-02, -2.0635496e-03, ...,
         2.4285305e-02, -6.1613005e-03, -7.0992005e-03]], dtype=float32)

In [58]:
queries = []
for address in query_sents:
    address_number, street_name = address_parser(address)
    # another ineffiecinet encoding, but find for how small the dataset is
    queries.append([address, address_number, street_name, model.encode(street_name)])

print(len(queries))

2024


In [74]:
data = []
for query_address, query_address_number, query_street_name, query_embedding in queries:
    filtered_corpus = corpus_dict[address_number]
    filtered_corpus_embeddings = filtered_corpus['embeddings']
    raw_matches = semantic_search(query_embedding, filtered_corpus_embeddings, top_k=3)
    row = [query_address]
    for val in raw_matches[0]:
        # try:
        row.extend([filtered_corpus['street_names'][val['corpus_id']], val['score']])
        # except KeyError:
        #     import pdb; pdb.set_trace()

    data.append(row)

In [76]:
matches_df = pd.DataFrame(data)

In [85]:
matches_df.sort_values(by=2, ascending=False).head(25)

Unnamed: 0,0,1,2,3,4,5,6
1012,531 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1374,498 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
981,75 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
161,600 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1088,399 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
137,602 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1252,516 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1257,567 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1260,582 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755
1341,413 MAIN ST,main st,1.0,jones dock rd,0.615593,bear pond rd,0.608755


In [86]:
matches_df.to_csv(DATA_DIR / 'processed' / 'match_df.csv')

In [88]:
corpus_dict['5322']['street_names']

['route 30']

In [89]:
queries[1749]

['5322 VT ROUTE 30',
 '5322',
 'vt route 30',
 array([-2.19753925e-02, -3.74619812e-02, -1.78094872e-03,  3.73787880e-02,
        -1.11582074e-02, -1.52974641e-02, -2.75730472e-02,  1.95965283e-02,
        -1.11596137e-02, -2.26446446e-02,  8.63526464e-02,  1.19229276e-02,
         5.60440086e-02,  8.43168870e-02,  3.83202098e-02, -3.26166004e-02,
        -5.39110936e-02, -1.72099136e-02, -1.14022516e-01,  8.83972831e-03,
        -4.47025597e-02,  2.70614456e-02, -1.46039594e-02,  2.74316743e-02,
         3.92971225e-02,  1.42779099e-02, -3.00658960e-02,  2.63781883e-02,
         3.41782831e-02,  2.15785555e-03, -1.55621432e-02, -1.79004949e-02,
         5.23353368e-02,  3.60587128e-02,  1.69016425e-06, -1.02989366e-02,
        -7.02778762e-03, -1.29236151e-02, -2.90516280e-02, -3.69190648e-02,
         1.97313428e-02, -3.17007452e-02, -4.45101671e-02,  8.77045933e-03,
         3.45900888e-04,  2.25802846e-02,  5.40513359e-03, -8.61479491e-02,
        -3.28337885e-02,  4.32045013e-02, 