# NLP 08: Parse with Fuzzy Matches

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
import sys
import os
import io

import string
import re
# import itertools
# import nltk
# nltk.download('stopwords')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz as rfuzz
import jaro



In [2]:
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 0
        freq_dict[ngram] +=1
    return freq_dict

def calc_fuzz_df(df, column):
    row_list = []
    
    for o_i, o_v in enumerate(df[column].sort_index()):
        for m_i, m_v in enumerate(df[column].sort_index()):
            if o_i != m_i:
                dict1 = {
                    'original_index': o_i,
                    'original_value': o_v,
                    'match_index': m_i,
                    'match_value': m_v,
                    'ratio_score': rfuzz.ratio(o_v, m_v),
                    'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
                    'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
                    'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
                    'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
                }
                if (dict1['ratio_score']>60) | (dict1['partial_ratio_score']>60) | (dict1['token_sort_score']>60) | (dict1['token_set_score']>60) | (dict1['jaro_winkler_score']>0.6):
                    row_list.append(dict1)
    score_df = pd.DataFrame(row_list)
        
    return score_df

In [3]:
df = pd.read_csv('data/parsed_bahamas_addresses.csv')

df['address_wordlist'] = df['working_address'].fillna('').str.split()

freq_df = pd.DataFrame.from_dict(
    frequency_ct(df['address_wordlist'].sum()
                ), orient='index').reset_index().rename(
    columns={'index':'word', 0:'count'}).sort_values('count', ascending=False)

In [4]:
freq_df.head(10)

Unnamed: 0,word,count
9,bahamas,2324
8,nassau,2043
6,box,1484
5,po,1430
4,street,1128
2,and,627
3,shirley,489
10,suite,447
34,bay,431
14,building,329


In [5]:
fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
fuzzy_words_df['jaro_winkler_score'] = fuzzy_words_df['jaro_winkler_score']*100

#### Cities

- Nassau
- Freeport
- Marsh Harbour
- Governor's Harbour
- Windermere island
- Harbour Island
- Elbow Cay
- Treasure Cay
- Gregory Town
- Spanish Wells

#### Islands

- New Providence
- Paradise Island
- Grand Bahama
- Abaco
- Eleuthera
- Grand Island
- South Andros

In [7]:
df['address_wordlist'].str[-1].value_counts()[:20]

bahamas       2043
bahama          66
nassau          54
bahmas           5
freeport         5
street           5
bahams           5
i                4
providence       3
centre           3
lane             3
343              3
2423527291       2
bhs              2
kln              2
bahamas1         2
abaco            2
kelty            2
23               1
8188             1
Name: address_wordlist, dtype: int64

## Bahamas

Remove "bahamas" and the misspellings from the last word in the word list. I'm removing the last word only so that I don't accidentally take "bahamas" out of phrases like "Bahamas Financial Centre." I want to try to separate out such building names later.

This lists the "bahamas" matches from the fuzzy matching

In [8]:
bah_fuzz_list = fuzzy_words_df[(fuzzy_words_df['original_value']=='bahamas') & (fuzzy_words_df['ratio_score']>75)].sort_values(['ratio_score', 'original_value'], ascending=False)['match_value'].to_list()
bah_fuzz_list

['bahamasc',
 'bahamas1',
 'bahamaas',
 'bahamas6',
 'bahamasa',
 'abahamas',
 'bahama',
 'bahams',
 'bhamas',
 'bahmas',
 'ahamas',
 'bahamaspo',
 'bahaams',
 'bahanas',
 'brahmas',
 'baham',
 'hamas']

These are the matches I can get with a regular expression on the last word of the address word list.

In [55]:
bah_reg_list = df.loc[df['working_address'].str.contains(r'ba?h\w+s\w?\d?$', regex=True), 'address_wordlist'].str[-1].unique().tolist()
bah_reg_list

['bahamas',
 'bahams',
 'bhamas',
 'bahmas',
 'bahamasc',
 'bahamas1',
 'bahaams',
 'bahamaas',
 'bahanas',
 'bahamas6',
 'bahamasa',
 'abahamas']

One thing to keep in mind is that the fuzzy matches are from all unique words, which means not all of them are necessarily going to be present in the last word.

In [50]:
set(bah_fuzz_list) - set(bah_reg_list)

{'ahamas', 'baham', 'bahama', 'bahamaspo', 'brahmas', 'hamas'}

In [51]:
set(bah_reg_list) - set(bah_fuzz_list) 

{'bahamas'}

In [46]:
pd.set_option('display.max_colwidth', 1000)

In [53]:
df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in bah_reg_list else x)

0                                                                                    [annex, frederick, and, shirley, street, po, box, n4805, nassau]
1                                                                                         [suite, e2, union, court, building, po, box, n8188, nassau]
2                                                                                           [lyford, cay, house, lyford, cay, po, box, n7785, nassau]
3                                               [po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]
4                                                                             [lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]
                                                                            ...                                                                      
2253                                                                            [j, p, morgan, trust

In [56]:
df['address_country'] = df['address_wordlist'].apply(lambda x:'bahamas' if x[-1] in bah_reg_list else np.nan)
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in bah_reg_list else x)

In [62]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805, nassau]",bahamas
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188, nassau]",bahamas
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785, nassau]",bahamas
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]",bahamas
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]",bahamas


## The

"the" shows up a surprising amount. It probably isn't relevant, but I prefer to double check.

In [61]:
df['address_wordlist'].str[-1].value_counts()

nassau        1815
providence      78
bahama          76
the             69
freeport        16
              ... 
cb11323          1
44311            1
nassaus          1
north            1
massa            1
Name: address_wordlist, Length: 110, dtype: int64

I can see that when "the" is the second to last word it follows "bahamas" as in "the bahamas." As such, I can safely remove it.

In [75]:
pd.set_option('display.max_rows', 70)

In [81]:
df['working_address'].str.split().apply(lambda x: x[-2:]).value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[nassau, bahamas]         1743
[providence, bahamas]       74
[the, bahamas]              68
[grand, bahama]             54
[abaco, bahamas]            13
                          ... 
[proividence, bahamas]       1
[floor, bahamas]             1
[lane, nassau]               1
[limited, bahamas]           1
[lyford, cay]                1
Name: working_address, Length: 176, dtype: int64

In [85]:
df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in ['the'] else x)

IndexError: list index out of range

## Nassau

In [83]:
df['address_wordlist'].str[-1].value_counts()

nassau        1815
providence      78
bahama          76
the             69
freeport        16
              ... 
cb11323          1
44311            1
nassaus          1
north            1
massa            1
Name: address_wordlist, Length: 110, dtype: int64

In [70]:
fuzzy_words_df[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>70)].sort_values(['ratio_score', 'match_value'], ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
1236,8,nassau,1593,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
1188,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
1220,8,nassau,1224,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
1198,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
1214,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
1216,8,nassau,1194,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
1196,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
1212,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
1215,8,nassau,1185,nassao,83.333333,90.909091,83.333333,83.333333,93.333333
1217,8,nassau,1201,nassan,83.333333,90.909091,83.333333,83.333333,93.333333


In [67]:
nas_ratio_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>70), 'match_value'].to_list()
nas_jaro_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['jaro_winkler_score']>85), 'match_value'].to_list()
pd.DataFrame([nas_ratio_list, nas_jaro_list]).T

Unnamed: 0,0,1
0,nasssau,nasssau
1,nasau,nasau
2,nassaub,nassaub
3,343nassau,343nassau
4,massau,massau
5,nassua,nassua
6,nassaau,nassaau
7,nassao,nassao
8,nassu,nassu
9,nassan,nassan


### Setting the threshold

The options above 79 look good for the `ratio` look good. In examining the values in the data, I see that "massa" is actually an address in Italy and I'll need to separate "343nassau" into two parts to retain the rest of the address.

In [68]:
for word in nas_ratio_list:
    print(word)
    display(df[df['address_wordlist'].apply(lambda x: word in x)])
    print('\n')

nasssau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
420,24000421,"3RD FLOOR, GEORGE HOUSE, GEORGE STREET, P.O. BOX N-8159 NASSSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,third floor george house george street po box n8159 nasssau bahamas,"[third, floor, george, house, george, street, po, box, n8159, nasssau]",bahamas




nasau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
536,14000678,"101 East Hill Street, Nasau Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,101 east hill street nasau bahamas,"[101, east, hill, street, nasau]",bahamas
612,14030188,Bahamas Financial Centre; Shirley & Charlotte Street; Fourth Flor Nasau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahamas financial centre shirley and charlotte street fourth flor nasau bahamas,"[bahamas, financial, centre, shirley, and, charlotte, street, fourth, flor, nasau]",bahamas
682,14035228,"CB 11-343 Nasau, Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343 nasau bahamas,"[cb, 11, 343, nasau]",bahamas
724,14038327,Elizabeth Avenue and Shirley Street; Union Court Building; Suite E-2; N-8188; Nasau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,elizabeth avenue and shirley street union court building suite e 2 n 8188 nasau bahamas,"[elizabeth, avenue, and, shirley, street, union, court, building, suite, e, 2, n, 8188, nasau]",bahamas
965,14078960,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nasau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nasau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nasau, the]",bahamas
1440,239867,"UBS Trustees (Bahamas) Ltd, UBS House, East Bay Street, P. O. Box N-7757, Nasau, Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,ubs trustees bahamas ltd ubs house east bay street p o box n7757 nasau bahamas,"[ubs, trustees, bahamas, ltd, ubs, house, east, bay, street, p, o, box, n7757, nasau]",bahamas




nassaub


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
583,14026897,"ANSBACHER (BAHAMAS) LIMITED P.O. BOX N 7768 ANSBACHER HOUSE BANK LANE NASSAUB, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,ansbacher bahamas limited po box n 7768 ansbacher house bank lane nassaub bahamas,"[ansbacher, bahamas, limited, po, box, n, 7768, ansbacher, house, bank, lane, nassaub]",bahamas




343nassau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
681,14035227,CB 11.343/Nassau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343nassau bahamas,"[cb, 11, 343nassau]",bahamas




massau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
810,14049672,"MASSAU, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,massau bahamas,[massau],bahamas




nassua


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
933,14077075,SAFFREY SQUARE; SUITE 205; BANK LANE; P.O. BOX N-8188; NASSUA; BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,saffrey square suite 205 bank lane po box n8188 nassua bahamas,"[saffrey, square, suite, 205, bank, lane, po, box, n8188, nassua]",bahamas
969,14078964,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nassua; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassua the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassua, the]",bahamas




nassaau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
966,14078961,Suite 102; Saffrey Square; Bay Street and Bank Lane Nassaau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassaau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassaau, the]",bahamas




nassao


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
999,14079956,Suite E-2; Union Court Buiding; Elizabeth Avenue and Shirley Streer; P.O. Box N-8188; Nassao; Bahamas.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e2 union court buiding elizabeth avenue and shirley streer po box n8188 nassao bahamas,"[suite, e2, union, court, buiding, elizabeth, avenue, and, shirley, streer, po, box, n8188, nassao]",bahamas




nassu


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1021,14079979,Suite E-2; Union Court Building; Elizabeth Avenue & Shirley Street; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street nassu bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, nassu]",bahamas
1117,14080667,The Bahamas Financial Centre; Shirley and Charlotte Streets; P.O. Box N-3023; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassu bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassu]",bahamas




nassan


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1043,14080003,Suite E-2; Union Court Building; Elizabeth Avenue and Shirley Street; PO Box 8188; Nassan; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street po box 8188 nassan bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, po, box, 8188, nassan]",bahamas
1050,14080011,Suite E-2; Union Court Building; Elizabeth Avenue and Shirly Street; Nassan; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirly street nassan the bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirly, street, nassan, the]",bahamas




nassaus


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1106,14080656,The Bahamas Financial Centre; Shirley & Charlotte Streets; PO Box N-3023; Nassaus; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassaus bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassaus]",bahamas




massa


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1150,14083823,VIA BIGINI; 43; I-54100 MASSA,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,via bigini 43 i 54100 massa,"[via, bigini, 43, i, 54100, massa]",




naussau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1428,252371,"43 Elizabeth Avenue, P.O.Box CB-13022 Naussau Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,43 elizabeth avenue po box cb13022 naussau bahamas,"[43, elizabeth, avenue, po, box, cb13022, naussau]",bahamas






I went back and updated the original standardization. I left the example where I found it.

In [86]:
df.loc[df['working_address'].str.contains("343nass"), 'working_address'] = 'cb 11 343 nassau bahamas'

In [89]:
nas_fuzz_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>80), 'match_value'].to_list()

['nasssau',
 'nasau',
 'nassaub',
 'massau',
 'nassua',
 'nassaau',
 'nassao',
 'nassu',
 'nassan',
 'nassaus',
 'naussau']

In [None]:
df['address_city'] = df['address_wordlist'].apply(lambda x: 'nassau' if x[-1] in nas_fuzz_list else np.nan)
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in nas_fuzz_list else x)