Use elasticsearch to search and filter by streetnumber

In [30]:
import eland as ed
import pandas as pd
import numpy as np

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer

from address_normalize.es_client import INDEX_NAME
from address_normalize.paths import DATA_DIR

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

es = Elasticsearch(
    hosts='http://localhost:9200',
    verify_certs=False,
)

if not es.indices.exists(index=INDEX_NAME).__bool__():
    es.indices.create(index=INDEX_NAME, body=INDEX_BODY)
print(es.info())

raw_dir = DATA_DIR / 'raw'
raw_dir.mkdir(parents=True, exist_ok=True)

files = list(raw_dir.glob('Master Project Address List*.xlsx'))
files.sort()

file_path = files[-1]

{'name': '93c9d49368c8', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Z52hyINnR-CgBAYxzCxpNA', 'version': {'number': '8.5.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '4ed5ee9afac63de92ec98f404ccbed7d3ba9584e', 'build_date': '2022-12-05T18:22:22.226119656Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [6]:
data = pd.read_excel(file_path, sheet_name=['E911 Data', 'Phase_I', 'Phase_II', 'Phase_III'])

df_e911 = data['E911 Data']
main_df = data['Phase_I']

In [2]:
main_df.head()

Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [4]:
df_e911.head()

Unnamed: 0,OBJECTID_1,E911 ADDRESS,OBJECTID,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST CITY,GRAND LIST STATE,GRAND LIST ZIPCODE,SPAN,...,GLVAL_HS,GLVAL_NR,CRHOUSPCT,MUNGL1PCT,AOEGL_HS,AOEGL_NR,SHAPESTAre,SHAPESTLen,Shape_Length,Shape_Area
0,1,32 BOOTH WOODS,18621475,STAPLES WAYNE,MCENTEE SHEILA,32 BOOTH WOODS,VERGENNES,VT,5491,663-210-10001,...,1837,0,0,1837,1837,0,1231.173523,145.763504,145.763504,1231.173529
1,2,8 E St,18621476,LINCOLN GERONIMO LLC,,1341 ARNOLD BAY ROAD,PANTON,VT,5491,663-210-10002,...,0,1755,0,1755,0,1755,532.200386,92.414705,92.414705,532.200394
2,3,73 SCHOOL St,18621477,WHITNEY GRACE,,PO BOX 541,CROTON FALLS,NY,10519,663-210-10003,...,0,2213,0,2213,0,2213,880.616867,118.855189,118.855189,880.616861
3,4,206 MAIN St,18621478,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10004,...,0,2988,0,3284,0,2988,709.60244,109.360601,109.360601,709.602436
4,5,224 MAIN St,18621479,SMALLEST CITY HOUSING LP,,PO BOX 156,VERGENNES,VT,5491,663-210-10005,...,0,3314,0,3682,0,3314,1242.662354,167.708389,167.708389,1242.662357


In [32]:
corpus_df = pd.DataFrame(df_e911['E911 ADDRESS'])
print(len(corpus_df))
corpus_df['street_number'] = corpus_df['E911 ADDRESS'].str.extract(r'(\d+[A-Z]*)\s*')
corpus_df['street_number'] = corpus_df['street_number'].str.strip()
corpus_df['street_name'] = corpus_df['E911 ADDRESS'].str.extract(r'\d*\s*(.*)')
corpus_df['street_name'] = corpus_df['street_name'].str.strip().replace('', np.nan)
corpus_df = corpus_df.dropna()
corpus_df = corpus_df.drop_duplicates()
print(len(corpus_df))

corpus_embeddings = model.encode(corpus_df.street_name.tolist())



17435
14742


In [33]:
corpus_embeddings.shape

(14742, 768)

In [21]:
from elasticsearch.helpers import bulk

Unnamed: 0,E911 ADDRESS,street_number,street_name


In [9]:
query_embeddings = model.encode(query_sents)


In [10]:
matches = semantic_search(query_embeddings, corpus_embeddings, top_k=3)

In [11]:
matches_df = (
    pd.DataFrame(zip(query_sents, matches), columns=['query_sent', 'matches'])
    .explode('matches')
)

matches_df = matches_df.join(pd.json_normalize(matches_df.matches))
matches_df['corpus_sent'] = matches_df.corpus_id.apply(lambda x: corpus_sents[x])


In [12]:
del matches_df['matches']

In [13]:
matches_df.head()

Unnamed: 0,query_sent,corpus_id,score,corpus_sent
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
0,114 S BINGHAM ST,4895,1.0,114 S BINGHAM ST
1,1896 OLD JERUSALEM RD,5181,0.985555,135 S BINGHAM ST
1,1896 OLD JERUSALEM RD,5181,0.985555,135 S BINGHAM ST


In [14]:
(
    matches_df
    .groupby(by='query_sent')
    .agg({'corpus_sent': lambda x: x.tolist(), 'score': lambda x: x.tolist(), 'corpus_id': lambda x: x.tolist()})
)

Unnamed: 0_level_0,corpus_sent,score,corpus_id
query_sent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 MORSE RD,"[3533 MAIN St, 3533 MAIN St, 3533 MAIN St]","[0.9662905931472778, 0.9662905931472778, 0.966...","[12891, 12891, 12891]"
0 N BINGHAM ST,"[426 CUTTING HILL RD, 426 CUTTING HILL RD, 426...","[1.0000001192092896, 1.0000001192092896, 1.000...","[16210, 16210, 16210]"
1 COOK RD,"[37 SHACKSBORO RD, 37 SHACKSBORO RD, 37 SHACKS...","[1.0000001192092896, 1.0000001192092896, 1.000...","[16147, 16147, 16147]"
10 DESROCHER LN,"[2719 LAKE ST, 2719 LAKE ST, 2719 LAKE ST]","[1.0000001192092896, 1.0000001192092896, 1.000...","[16318, 16318, 16318]"
10 HIBBARD HILL RD,"[510 SPERRY RD, 510 SPERRY RD, 510 SPERRY RD]","[0.9661015272140503, 0.9661015272140503, 0.966...","[5014, 5014, 5014]"
...,...,...,...
99 VT ROUTE 73,"[187 BARNES RD, 187 BARNES RD, 187 BARNES RD]","[1.000000238418579, 1.000000238418579, 1.00000...","[1581, 1581, 1581]"
995 VT ROUTE 22A,"[655 CIDER MILL RD, 655 CIDER MILL RD, 655 CID...","[0.9764605164527893, 0.9764605164527893, 0.976...","[4912, 4912, 4912]"
997 SHOREHAM DEPOT RD,"[1148 CIDER MILL RD, 1148 CIDER MILL RD, 1148 ...","[0.9766618013381958, 0.9766618013381958, 0.976...","[5032, 5032, 5032]"
998 N CREAM HILL RD,"[288 SCHOOL ST, 288 SCHOOL ST, 288 SCHOOL ST]","[0.9604057669639587, 0.9604057669639587, 0.960...","[16057, 16057, 16057]"


In [15]:
matches_df.to_csv(DATA_DIR / 'processed' / 'match_df.csv')