In [1]:
import numpy as np
import pandas as pd

from address_normalize.paths import DATA_DIR

raw_dir = DATA_DIR / 'raw'

In [2]:
files = list(raw_dir.glob('Master Project Address List*.xlsx'))
files.sort()

file_path = files[-1]

In [3]:
data = pd.read_excel(file_path, sheet_name=['E911 Data', 'Phase_I', 'Phase_II', 'Phase_III'])

In [4]:
df_e911 = data['E911 Data']
main_df = data['Phase_I']

In [5]:
main_df.head()

Unnamed: 0,OBJECTID,REF,ESITEID,Build Phase,ADDRESS,E911 ADDRESS,OWNER 1,OWNER 2,GRAND LIST ADDRESS,GRAND LIST = ADDRESS,...,COL_R,COL_S,COL_T,COL_U,COL_V,COL_W,COL_X,COL_Y,GNS Area,Construction Status
0,1547,1547,78274,Phase 1,114 S BINGHAM ST,114 S BINGHAM ST,114 S BINGHAM LLC,(QUINTTUS),15 ASHLAND AVE,N,...,,,,,,,,,,Included
1,1316,1316,139583,Phase 1,1896 OLD JERUSALEM RD,1896 OLD JERUSALEM RD,80 GOATS INC,,2001 OLD JERUSALEM RD,N,...,,,,,,,,1.0,,Included
2,189,189,184321,Phase 1,14 CHURCH ST,14 CHURCH ST,ABBOTT JACOB,ABBOTT KAREN LYNN,14 CHURCH ST,Y,...,,,,,,,,,,Included
3,1382,1382,78171,Phase 1,24 PARK LN,24 PARK Ln,ABEL WILLA & SIMON,,24 PARK LN,Y,...,,,,,,,,,GoNetSpeed,Excluded
4,328,328,216370,Phase 1,156 MIDDLE RD,156 MIDDLE RD,ACCIAVIATTI BRUCE,SMITH COLLEEN,156 MIDDLE RD,Y,...,1.0,,,,,,,1.0,,Included


In [6]:
df_e911.columns

Index(['OBJECTID_1', 'E911 ADDRESS', 'OBJECTID', 'OWNER 1', 'OWNER 2',
       'GRAND LIST ADDRESS ', 'GRAND LIST CITY', 'GRAND LIST STATE',
       'GRAND LIST ZIPCODE', 'SPAN', 'GLIST_SPAN', 'MAPID', 'PARCID',
       'PROPTYPE', 'YEAR', 'GLYEAR', 'TOWN', 'TNAME', 'SOURCENAME',
       'SOURCETYPE', 'SOURCEDATE', 'EDITMETHOD', 'EDITOR', 'EDITDATE',
       'MATCHSTAT', 'EDITNOTE', 'ADDRGL2', 'DESCPROP', 'LOCAPROP', 'CAT',
       'RESCODE', 'ACRESGL', 'REAL_FLV', 'HSTED_FLV', 'NRES_FLV', 'LAND_LV',
       'IMPRV_LV', 'EQUIPVAL', 'EQUIPCODE', 'INVENVAL', 'HSDECL', 'HSITEVAL',
       'VETEXAMT', 'EXPDESC', 'ENDDATE', 'STATUTE', 'EXAMT_HS', 'EXAMT_NR',
       'UVREDUC_HS', 'UVREDUC_NR', 'GLVAL_HS', 'GLVAL_NR', 'CRHOUSPCT',
       'MUNGL1PCT', 'AOEGL_HS', 'AOEGL_NR', 'SHAPESTAre', 'SHAPESTLen',
       'Shape_Length', 'Shape_Area'],
      dtype='object')

In [7]:
df_e911['E911 ADDRESS']

0                32 BOOTH WOODS
1                        8 E St
2                  73 SCHOOL St
3                   206 MAIN St
4                   224 MAIN St
                  ...          
17430    574 ROBEROUTE YOUNG RD
17431                          
17432          151 VT R0UTE 116
17433         71 SPRING HILL Ln
17434               14 TATRO RD
Name: E911 ADDRESS, Length: 17435, dtype: object

In [8]:
sents = df_e911['E911 ADDRESS'].tolist()

In [63]:
from gensim.utils import tokenize

cleaned_sents = []
for address in sents:
    temp = ''.join([char.lower() for char in address if not char.isdigit()]).strip()
    if len(temp) > 0:
        cleaned_sents.append(temp)

class GenerateSentences:
    def __iter__(self):
        for wrds in cleaned_sents:
            yield list(tokenize(wrds))

In [64]:
from gensim.models.fasttext import FastText

VEC_SIZE = 4

model = FastText(vector_size=VEC_SIZE, window=3, min_count=1)

# build the vocabulary
model.build_vocab(corpus_iterable=GenerateSentences())

# train the model
model.train(
    corpus_iterable=GenerateSentences(), epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words,
)

print(model)

FastText<vocab=1055, vector_size=4, alpha=0.025>


In [65]:
wv = model.wv
print(wv)

FastTextKeyedVectors<vector_size=4, 1055 keys>


In [66]:
any([len(b.strip())==0 for b in cleaned_sents])

False

In [67]:
from gensim.models.keyedvectors import KeyedVectors

address_vectors = KeyedVectors(vector_size=VEC_SIZE, count=len(cleaned_sents))

emb_vector = np.empty((len(cleaned_sents), VEC_SIZE))
for idx, address in enumerate(cleaned_sents):
    emb_vector[idx, :] = wv.get_sentence_vector(address)

address_vectors.add_vectors(cleaned_sents, emb_vector, replace=True)

In [68]:
wv.get_sentence_vector('114 S BINGHAM ST')

array([ 0.11017409, -0.0866404 ,  0.07866299, -0.2764223 ], dtype=float32)

In [75]:
address_vectors.most_similar(wv.get_sentence_vector('fisher hill rd'))

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


[('fisher hill rd', 1.0),
 ('off churchviille rd', 0.940300464630127),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681),
 ('bishop hill rd', 0.9382767081260681)]

In [74]:
address_vectors.get_index('fisher hill road')

KeyError: "Key 'fisher hill road' not present"