# NLP 08: Parse with Fuzzy Matches

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
import sys
import os
import io

import string
import re
# import itertools
# import nltk
# nltk.download('stopwords')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz as rfuzz
import jaro

In [11]:
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 0
        freq_dict[ngram] +=1
    return freq_dict

def calc_fuzz_df(df, column):
    row_list = []
    
    for o_i, o_v in enumerate(df[column].sort_index()):
        for m_i, m_v in enumerate(df[column].sort_index()):
            if o_i != m_i:
                dict1 = {
                    'original_index': o_i,
                    'original_value': o_v,
                    'match_index': m_i,
                    'match_value': m_v,
                    'ratio_score': rfuzz.ratio(o_v, m_v),
                    'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
                    'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
                    'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
                    'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
                }
                if (dict1['ratio_score']>60) | (dict1['partial_ratio_score']>60) | (dict1['token_sort_score']>60) | (dict1['token_set_score']>60) | (dict1['jaro_winkler_score']>0.6):
                    row_list.append(dict1)
    score_df = pd.DataFrame(row_list)
        
    return score_df

In [88]:
df = pd.read_csv('data/parsed_bahamas_addresses.csv')

df['address_wordlist'] = df['working_address'].fillna('').str.split()

freq_df = pd.DataFrame.from_dict(
    frequency_ct(df['address_wordlist'].sum()
                ), orient='index').reset_index().rename(
    columns={'index':'word', 0:'count'}).sort_values('count', ascending=False)

In [5]:
freq_df.head(10)

Unnamed: 0,word,count
9,bahamas,2324
8,nassau,2043
6,box,1484
5,po,1430
4,street,1128
2,and,627
3,shirley,489
10,suite,447
34,bay,431
14,building,329


In [12]:
fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
fuzzy_words_df['jaro_winkler_score'] = fuzzy_words_df['jaro_winkler_score']*100

#### Cities

- Nassau
- Freeport
- Marsh Harbour
- Governor's Harbour
- Windermere island
- Harbour Island
- Elbow Cay
- Treasure Cay
- Gregory Town
- Spanish Wells

#### Islands

- New Providence
- Paradise Island
- Grand Bahama
- Abaco
- Eleuthera
- Grand Island
- South Andros

In [77]:
df['address_wordlist'].apply(lambda x: x[-1]).value_counts()[:20]

bahamas       2043
bahama          66
nassau          54
freeport         5
street           5
bahmas           5
bahams           5
i                4
providence       3
centre           3
343              3
lane             3
kelty            2
bahamas1         2
kln              2
2423527291       2
bhs              2
abaco            2
isle             1
esquare          1
Name: address_wordlist, dtype: int64

In [96]:
df['address_wordlist'].str[-1].value_counts()[:20]

bahamas       2043
bahama          66
nassau          54
freeport         5
street           5
bahmas           5
bahams           5
i                4
providence       3
centre           3
343              3
lane             3
kelty            2
bahamas1         2
kln              2
2423527291       2
bhs              2
abaco            2
isle             1
esquare          1
Name: address_wordlist, dtype: int64

In [25]:
bah_fuzz_list = fuzzy_words_df[(fuzzy_words_df['original_value']=='bahamas') & (fuzzy_words_df['ratio_score']>75)].sort_values(['ratio_score', 'original_value'], ascending=False)['match_value'].to_list()
bah_fuzz_list

['bahamasc',
 'bahamas1',
 'bahamaas',
 'bahamas6',
 'bahamasa',
 'abahamas',
 'bahama',
 'bahams',
 'bhamas',
 'bahmas',
 'ahamas',
 'bahamaspo',
 'bahaams',
 'bahanas',
 'brahmas',
 'baham',
 'hamas']

In [52]:
bah_reg_list = pd.DataFrame(df.loc[df['working_address'].str.contains('br?a?h\w+s\w*\d?$', regex=True), 'address_wordlist'].apply(lambda x: x[-1]).value_counts()).reset_index().sort_values('index')['index'].to_list()
bah_reg_list

['abahamas',
 'bahaams',
 'bahamaas',
 'bahamas',
 'bahamas1',
 'bahamas6',
 'bahamasa',
 'bahamasc',
 'bahams',
 'bahanas',
 'bahmas',
 'bhamas']

In [39]:
set(bah_fuzz_list) - set(bah_reg_list)

{'ahamas', 'baham', 'bahamaspo', 'brahmas', 'hamas'}

In [53]:
set(bah_fuzz_list) - set(bah_reg_list)

{'ahamas', 'baham', 'bahama', 'bahamaspo', 'brahmas', 'hamas'}

In [54]:
set(bah_reg_list) - set(bah_fuzz_list) 

{'bahamas'}

In [59]:
pd.set_option('display.max_colwidth', 1000)

In [65]:
df.loc[df['address_wordlist'].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']), 'address_wordlist']

0                                                                                    [annex, frederick, and, shirley, street, po, box, n4805, nassau, bahamas]
1                                                                                         [suite, e2, union, court, building, po, box, n8188, nassau, bahamas]
2                                                                                           [lyford, cay, house, lyford, cay, po, box, n7785, nassau, bahamas]
3                                               [po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau, bahamas]
4                                                                             [lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau, bahamas]
                                                                                 ...                                                                          
2253                                          

In [64]:
df.loc[df['address_wordlist'].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']), 'address_wordlist'].apply(lambda x: x[:-1])

0                                                                                    [annex, frederick, and, shirley, street, po, box, n4805, nassau]
1                                                                                         [suite, e2, union, court, building, po, box, n8188, nassau]
2                                                                                           [lyford, cay, house, lyford, cay, po, box, n7785, nassau]
3                                               [po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]
4                                                                             [lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]
                                                                            ...                                                                      
2253                                                                            [j, p, morgan, trust

In [78]:
df.loc[df['address_wordlist'].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']), 'address_wordlist'].apply(lambda x: x.pop())

0       bahamas
1       bahamas
2       bahamas
3       bahamas
4       bahamas
         ...   
2253    bahamas
2254    bahamas
2255    bahamas
2256    bahamas
2257    bahamas
Name: address_wordlist, Length: 2063, dtype: object

In [79]:
df

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805, nassau]"
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188, nassau]"
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785, nassau]"
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]"
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]"
...,...,...,...,...,...,...,...,...,...,...
2253,240492525,"J.P.MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, ZH, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 zh bahamas,"[j, p, morgan, trust, company, bahamas, limited, nassau, n, 4899, zh]"
2254,240492536,"MONTAGNE STERLINE CENTRE. EAST BAV STREET, NASSAU, COUNTRY BAHAMAS, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montagne sterline centre east bav street nassau country bahamas bahamas,"[montagne, sterline, centre, east, bav, street, nassau, country, bahamas]"
2255,240491733,"DELTEC HOUSE, LYFORD CAY, PO BOX N-3229, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,deltec house lyford cay po box n3229 nassau bahamas,"[deltec, house, lyford, cay, po, box, n3229, nassau]"
2256,240491778,"PROVIDENCE HOUSE, HAST WING, EAST HILL STREET, P.O. BOX CB-12399, NASSAU, CB-12399, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,providence house hast wing east hill street po box cb12399 nassau cb12399 bahamas,"[providence, house, hast, wing, east, hill, street, po, box, cb12399, nassau, cb12399]"


In [93]:
# Doesn't return the right count for "bahamas"
df.loc[df['working_address'].str.endswith(tuple(bah_reg_list + ['brahmas', 'hamas'])), 'address_wordlist'].apply(lambda x: x[-1]).value_counts()

bahamas     1957
bahams         5
bahmas         5
bahamas1       2
bhamas         1
bahamasc       1
bahaams        1
bahamaas       1
bahanas        1
bahamas6       1
bahamasa       1
abahamas       1
Name: address_wordlist, dtype: int64

In [97]:
df.loc[df['address_wordlist'].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']), 'address_wordlist'].str[:-1]

0                                                                                    [annex, frederick, and, shirley, street, po, box, n4805, nassau]
1                                                                                         [suite, e2, union, court, building, po, box, n8188, nassau]
2                                                                                           [lyford, cay, house, lyford, cay, po, box, n7785, nassau]
3                                               [po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]
4                                                                             [lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]
                                                                            ...                                                                      
2253                                                                            [j, p, morgan, trust

In [98]:
df.iloc[0, -1]

['annex',
 'frederick',
 'and',
 'shirley',
 'street',
 'po',
 'box',
 'n4805',
 'nassau',
 'bahamas']

In [99]:
df.loc[df['address_wordlist'
         ].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']
                ), ['address_wordlist', 'address_country']
      ] = [df.loc[df['address_wordlist'].apply(lambda x: x[-1] in bah_reg_list + ['brahmas', 'hamas']), 'address_wordlist'
                ].apply(lambda x: x[:-1]), 'bahamas']

  return array(a, dtype, copy=False, order=order)


In [102]:
df.iloc[0, -2]

['annex',
 'frederick',
 'and',
 'shirley',
 'street',
 'po',
 'box',
 'n4805',
 'nassau']

In [100]:
df

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805, nassau]",bahamas
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188, nassau]",bahamas
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785, nassau]",bahamas
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]",bahamas
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]",bahamas
...,...,...,...,...,...,...,...,...,...,...,...
2253,240492525,"J.P.MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, ZH, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 zh bahamas,"[j, p, morgan, trust, company, bahamas, limited, nassau, n, 4899, zh]",bahamas
2254,240492536,"MONTAGNE STERLINE CENTRE. EAST BAV STREET, NASSAU, COUNTRY BAHAMAS, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montagne sterline centre east bav street nassau country bahamas bahamas,"[montagne, sterline, centre, east, bav, street, nassau, country, bahamas]",bahamas
2255,240491733,"DELTEC HOUSE, LYFORD CAY, PO BOX N-3229, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,deltec house lyford cay po box n3229 nassau bahamas,"[deltec, house, lyford, cay, po, box, n3229, nassau]",bahamas
2256,240491778,"PROVIDENCE HOUSE, HAST WING, EAST HILL STREET, P.O. BOX CB-12399, NASSAU, CB-12399, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,providence house hast wing east hill street po box cb12399 nassau cb12399 bahamas,"[providence, house, hast, wing, east, hill, street, po, box, cb12399, nassau, cb12399]",bahamas


In [80]:
df['address_wordlist'].apply(lambda x: x[-1]).value_counts()

IndexError: list index out of range

In [82]:
test_list = bah_reg_list + ['brahmas', 'hamas']
test_list

['abahamas',
 'bahaams',
 'bahamaas',
 'bahamas',
 'bahamas1',
 'bahamas6',
 'bahamasa',
 'bahamasc',
 'bahams',
 'bahanas',
 'bahmas',
 'bhamas',
 'brahmas',
 'hamas']

In [84]:
test_list[-1]

'hamas'

In [85]:
test_list.pop()

'hamas'

In [86]:
test_list

['abahamas',
 'bahaams',
 'bahamaas',
 'bahamas',
 'bahamas1',
 'bahamas6',
 'bahamasa',
 'bahamasc',
 'bahams',
 'bahanas',
 'bahmas',
 'bhamas',
 'brahmas']

In [87]:
test_list[-1]

'brahmas'