Parse the addresses, then apply a sequential matching mechanism:
1. First, match on the address number.
2. Then, within that set of candidates, match on the words of the street name (word overlap).

In [1]:
import numpy as np
import pandas as pd
import re
import usaddress


from address_normalize.paths import DATA_DIR

raw_dir = DATA_DIR / 'raw'

# Sort the matching workbooks by path; the last one in sort order is loaded.
files = sorted(raw_dir.glob('Master Project Address List*.xlsx'))
if not files:
    # Fail with an actionable message instead of a bare IndexError on files[-1].
    raise FileNotFoundError(
        f"No 'Master Project Address List*.xlsx' workbook found in {raw_dir}"
    )

file_path = files[-1]

# Passing a list of sheet names returns a dict keyed by sheet name.
data = pd.read_excel(file_path, sheet_name=['E911 Data', 'Phase_I', 'Phase_II', 'Phase_III'])

# E911 sheet is the corpus to search; Phase_I holds the addresses to look up.
df_e911 = data['E911 Data']
main_df = data['Phase_I']

In [2]:
# Preview the first rows of the Phase_I sheet.
main_df.head()

Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [3]:
# Look at the raw 'E911 ADDRESS' column (mixed casing and blank entries are visible in the output).
df_e911['E911 ADDRESS']

0                32 BOOTH WOODS
1                        8 E St
2                  73 SCHOOL St
3                   206 MAIN St
4                   224 MAIN St
                  ...          
17430    574 ROBEROUTE YOUNG RD
17431                          
17432          151 VT R0UTE 116
17433         71 SPRING HILL Ln
17434               14 TATRO RD
Name: E911 ADDRESS, Length: 17435, dtype: object

In [4]:
# Plain Python list of every value in the 'E911 ADDRESS' column.
# NOTE(review): `sents` is not referenced by any other cell shown here, and
# `corpus_sents` is built from the same column - confirm before removing.
sents = df_e911['E911 ADDRESS'].tolist()

In [5]:
# Preview the first rows of the 'E911 Data' sheet.
df_e911.head()

Unnamed: 0,OBJECTID_1,E911 ADDRESS,OBJECTID,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST CITY,GRAND LIST STATE,GRAND LIST ZIPCODE,SPAN,...,GLVAL_HS,GLVAL_NR,CRHOUSPCT,MUNGL1PCT,AOEGL_HS,AOEGL_NR,SHAPESTAre,SHAPESTLen,Shape_Length,Shape_Area
0,1,32 BOOTH WOODS,18621475,STAPLES WAYNE,MCENTEE SHEILA,32 BOOTH WOODS,VERGENNES,VT,5491,663-210-10001,...,1837,0,0,1837,1837,0,1231.173523,145.763504,145.763504,1231.173529
1,2,8 E St,18621476,LINCOLN GERONIMO LLC,,1341 ARNOLD BAY ROAD,PANTON,VT,5491,663-210-10002,...,0,1755,0,1755,0,1755,532.200386,92.414705,92.414705,532.200394
2,3,73 SCHOOL St,18621477,WHITNEY GRACE,,PO BOX 541,CROTON FALLS,NY,10519,663-210-10003,...,0,2213,0,2213,0,2213,880.616867,118.855189,118.855189,880.616861
3,4,206 MAIN St,18621478,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10004,...,0,2988,0,3284,0,2988,709.60244,109.360601,109.360601,709.602436
4,5,224 MAIN St,18621479,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10005,...,0,3314,0,3682,0,3314,1242.662354,167.708389,167.708389,1242.662357


In [6]:
# Queries are the Phase I addresses to look up; the corpus is the E911
# address list they are matched against.
query_sents = main_df['ADDRESS'].tolist()
corpus_sents = df_e911['E911 ADDRESS'].tolist()

# Print the first ten entries of each list (corpus first) as a sanity check.
for preview in (corpus_sents, query_sents):
    print(preview[:10])

['32 BOOTH WOODS', '8 E St', '73 SCHOOL St', '206 MAIN St', '224 MAIN St', '38 W St', '23 SUNSET Dr', 'PANTON RD', '67 NEW HAVEN RD', 'PANTON RD']
['114 S BINGHAM ST', '1896 OLD JERUSALEM RD', '14 CHURCH ST', '24 PARK LN', '156 MIDDLE RD', '82 OLIVER HOWE CT', '112 SCHOOL RD', '289 CROSS RD', '466 N ORWELL RD', '79 LEMON FAIR RD']


In [7]:
# Check what usaddress.tag produces for one corpus address; element [0] of the
# result is the mapping of tagged components shown in the output.
usaddress.tag(corpus_sents[0])[0]

OrderedDict([('AddressNumber', '32'), ('StreetName', 'BOOTH WOODS')])

In [8]:
# Try the regex fallback on an address whose number carries a letter suffix:
# the leading digits are captured as group 1, everything after them as group 2.
test = '3407A&B N 116 RD'
re.match(r'(\d+)(.*)', test).groups()

('3407', 'A&B N 116 RD')

In [9]:
def address_parser(address):
    """Split a raw address string into ``(address_number, street_name)``.

    The address is stripped and lower-cased, then tagged with
    ``usaddress.tag``. When tagging raises, or the tagged result has no
    ``'AddressNumber'`` entry, the leading run of digits (if any) is used
    as the address number instead.

    Parameters
    ----------
    address : str
        Raw address text, e.g. ``'114 S BINGHAM ST'``.

    Returns
    -------
    tuple of (str, str)
        ``(address_number, street_name)``, both lower-cased and stripped.
        ``address_number`` is ``''`` when no number could be found, in which
        case ``street_name`` is the whole normalized address.
    """
    address = address.strip().lower()
    try:
        # A missing 'AddressNumber' key raises KeyError, handled below together
        # with any error raised by the tagger itself.
        address_number = usaddress.tag(address)[0]['AddressNumber']
    except Exception:
        # Crude fallback: treat the leading digits as the address number.
        leading_digits = re.match(r'(\d+)', address)
        address_number = leading_digits.group(1) if leading_digits else ''

    # Remove only the FIRST occurrence of the number; replacing every occurrence
    # would also eat digits in the street name (e.g. '30 route 30' -> 'route').
    street_name = address.replace(address_number, '', 1)
    return address_number.strip(), street_name.strip()

In [10]:
# Index the corpus by address number:
#   corpus_dict[number] = {'street_names': [...], 'full_address': [...]}
# The two lists are parallel: street_names[i] was parsed from full_address[i].
corpus_dict = {}

for address in corpus_sents:
    if address.strip() == '':
        # Skip blank corpus entries.
        continue

    an, street_name = address_parser(address)
    entry = corpus_dict.setdefault(an, {'street_names': [], 'full_address': []})

    # De-duplicate against the street names already stored for THIS number.
    # (Checking the outer dict, whose keys are address numbers, never filtered
    # duplicates.)
    if street_name not in entry['street_names']:
        entry['street_names'].append(street_name)
        entry['full_address'].append(address)
        

In [11]:
# Number of distinct address-number keys in the corpus index.
print(len(corpus_dict))

3802


In [12]:
# Parse every query address into [original address, address number, street name].
# NOTE: the loop variables (address, address_number, street_name) stay bound in
# the notebook namespace after the loop finishes.
queries = []
for address in query_sents:
    address_number, street_name = address_parser(address)
    # A list of lists is an inefficient encoding, but fine for a dataset this small.
    queries.append([address, address_number, street_name])

print(len(queries))

2024


In [13]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are reduced to sets, so duplicates and order are ignored.
    The result is ``len(intersection) / len(union)``.

    Parameters
    ----------
    list1, list2 : iterable
        Items to compare, e.g. the whitespace-split tokens of two street names.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. Two empty inputs are treated as identical
        and score ``1.0`` rather than raising ``ZeroDivisionError``.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        # Both inputs are empty: nothing differs, so treat them as a full match.
        return 1.0
    return len(set1 & set2) / len(union)

In [14]:
# Spot-check the scoring on the first query.
query_address, query_address_number, query_street_names = queries[0]
# Look up candidates by THIS query's number; `address_number` is a leftover
# loop variable holding the number of the last parsed query.
filtered_corpus = corpus_dict[query_address_number]
jaccard_similarity(query_street_names.split(), filtered_corpus['street_names'][0].split())



0.25

In [32]:

# For every query, restrict the corpus to entries sharing its address number,
# score each candidate street name by token overlap, and keep the best one as
# [query address, best score, matching corpus address].
data = []
for query_address, query_address_number, query_street_names in queries:
    if query_address_number not in corpus_dict:
        print(f"No address number match for {query_address}")
        continue

    filtered_corpus = corpus_dict[query_address_number]
    query_tokens = query_street_names.split()

    scores = np.array([
        jaccard_similarity(query_tokens, street_name.split())
        for street_name in filtered_corpus['street_names']
    ])

    # argmax returns the first index of the maximum, so ties go to the
    # earliest stored candidate.
    top_arg = scores.argmax()
    row = [query_address, scores[top_arg], filtered_corpus['full_address'][top_arg]]
    data.append(row)

No address number match for 6335 VT ROUTE 22A
No address number match for 6326 VT ROUTE 22A
No address number match for 4823 VT ROUTE 74 W
No address number match for 4821 VT ROUTE 74 W
No address number match for 4831 VT ROUTE 74 W
No address number match for 4718 VT ROUTE 74 W
No address number match for 5866 VT ROUTE 22A
No address number match for 3597 VT ROUTE 74 W
No address number match for 3593 VT ROUTE 74 W
No address number match for 5834 VT ROUTE 22A
No address number match for 3591 VT ROUTE 74 W
No address number match for 5625 VT ROUTE 22A
No address number match for 2973 HEMENWAY HILL RD
No address number match for 3474 RICHVILLE RD
No address number match for 3539 RICHVILLE RD
No address number match for 1358 N ORWELL RD
No address number match for 1990 RICHVILLE RD
No address number match for 1142 SCHOOL ST
No address number match for 1300 SMITH ST
No address number match for 3121 WATCH POINT RD
No address number match for 2390 LELAND RD
No address number match for 3327

In [33]:
# Tabulate the best match per query and save it under the processed data folder.
match_columns = ['phase_1_address', 'score', 'e911_address']
matches_df = pd.DataFrame(data, columns=match_columns)

match_csv_path = DATA_DIR / 'processed' / 'match_df.csv'
matches_df.to_csv(match_csv_path, index=False)

In [35]:
# Preview the first rows of the match table.
matches_df.head()

Unnamed: 0,phase_1_address,score,e911_address
0,114 S BINGHAM ST,1.0,114 S BINGHAM ST
1,1896 OLD JERUSALEM RD,1.0,1896 OLD JERUSALEM RD
2,14 CHURCH ST,1.0,14 CHURCH ST
3,24 PARK LN,1.0,24 PARK Ln
4,156 MIDDLE RD,1.0,156 MIDDLE RD


In [36]:
# Show the 25 highest-scoring matches.
matches_df.sort_values(by='score', ascending=False).head(25)

Unnamed: 0,phase_1_address,score,e911_address
0,114 S BINGHAM ST,1.0,114 S BINGHAM ST
756,410 DELONG LN,1.0,410 DELONG LN
762,19 N BINGHAM ST,1.0,19 N BINGHAM ST
761,125 MORSE RD,1.0,125 MORSE RD
760,1402 SPERRY RD,1.0,1402 SPERRY RD
759,1027 HALLADAY RD,1.0,1027 HALLADAY RD
758,732 N CREAM HILL RD,1.0,732 N CREAM HILL RD
757,685 LAPHAM BAY RD,1.0,685 LAPHAM BAY RD
755,241 SCHOOL ST,1.0,241 SCHOOL ST
764,620 N CREAM HILL RD,1.0,620 N CREAM HILL RD


In [18]:
# Ad-hoc inspection: fetch the corpus entry stored under address number '5322'.
a=corpus_dict['5322']

In [19]:
# Display the corpus entry held in `a`.
a

{'street_names': ['route 30'], 'full_address': ['5322 ROUTE 30']}

In [20]:
# Street names stored in the corpus index under address number '5322'.
corpus_dict['5322']['street_names']

['route 30']

In [21]:
# Score the street name of query 1749 against the first street stored under '5322'.
# NOTE(review): 1749 is a hard-coded row index from ad-hoc debugging - confirm it
# still points at the intended query if the input workbook changes.
jaccard_similarity(queries[1749][2].split(), corpus_dict['5322']['street_names'][0].split())

0.6666666666666666