In [1]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein
# !pip install rapidfuzz
# !pip install jaro-winkler

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
import sys
import os
import io

import string
import re
# import itertools
# import nltk
# nltk.download('stopwords')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz as rfuzz
import jaro

In [74]:
# Widen columns so full address strings are shown, and cap long frames at 50 rows.
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 50

In [3]:
def frequency_ct(ngram_list):
    """Tally how often each item occurs in ``ngram_list``.

    Parameters
    ----------
    ngram_list : iterable of hashable
        Items to count (words, n-grams, ...).

    Returns
    -------
    dict
        Maps each distinct item to its number of occurrences; keys appear in
        the order the items were first seen.
    """
    tally = {}
    for item in ngram_list:
        # An unseen item starts from 0, so its first sighting stores 1.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [4]:
# Load the address table; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests the addresses were already parsed upstream - confirm.
df = pd.read_csv('data/parsed_bahamas_addresses.csv')

In [5]:
df.shape

(2258, 9)

In [6]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas


In [7]:
# Split each working address on whitespace. Missing addresses are replaced by ''
# first, so they become empty lists rather than NaN.
df['address_wordlist'] = df['working_address'].fillna('').str.split()

In [8]:
# Word-frequency table over all addresses, most frequent word first.
# The per-address word lists are flattened lazily with a generator: calling
# ``.sum()`` on a column of lists concatenates them one by one and copies the
# growing list each time, which is quadratic in the total number of words.
# The counts, the first-seen index order and the final sort are unchanged.
freq_df = (
    pd.DataFrame.from_dict(
        frequency_ct(word for words in df['address_wordlist'] for word in words),
        orient='index',
    )
    .reset_index()
    .rename(columns={'index': 'word', 0: 'count'})
    .sort_values('count', ascending=False)
)

In [9]:
freq_df.shape

(2040, 2)

In [10]:
freq_df.head(60)

Unnamed: 0,word,count
9,bahamas,2321
8,nassau,2040
6,box,1484
5,po,1430
4,street,1128
2,and,627
3,shirley,489
10,suite,447
34,bay,431
14,building,329


## Fuzzy matching

In [11]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805, nassau, bahamas]"
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188, nassau, bahamas]"
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785, nassau, bahamas]"
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau, bahamas]"
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau, bahamas]"


In [12]:
# The two addresses used as the running example in the metric comparisons.
print('Sample 1:', df.loc[0, 'working_address'])
print('Sample 2:', df.loc[1, 'working_address'])

Sample 1: annex frederick and shirley street po box n4805 nassau bahamas
Sample 2: suite e2 union court building po box n8188 nassau bahamas


### fuzzywuzzy implementation

In [13]:
# fuzzywuzzy scores for sample 1 vs. sample 2. ``.loc[[0, 1], ...]`` selects the
# two addresses in that order and ``*`` passes them as the two arguments.
print('Ratio:', fuzz.ratio(*df.loc[[0, 1], 'working_address']))
print('Partial ratio:', fuzz.partial_ratio(*df.loc[[0, 1], 'working_address']))
print('Token sort ratio:', fuzz.token_sort_ratio(*df.loc[[0, 1], 'working_address']))
print('Token set ratio:', fuzz.token_set_ratio(*df.loc[[0, 1], 'working_address']))

Ratio: 57
Partial ratio: 58
Token sort ratio: 55
Token set ratio: 59


In [14]:
# Same fuzzywuzzy metrics with the arguments swapped (sample 2 vs. sample 1),
# to check whether argument order changes the scores.
print('Ratio:', fuzz.ratio(*df.loc[[1, 0], 'working_address']))
print('Partial ratio:', fuzz.partial_ratio(*df.loc[[1, 0], 'working_address']))
print('Token sort ratio:', fuzz.token_sort_ratio(*df.loc[[1, 0], 'working_address']))
print('Token set ratio:', fuzz.token_set_ratio(*df.loc[[1, 0], 'working_address']))

Ratio: 57
Partial ratio: 58
Token sort ratio: 55
Token set ratio: 59


### rapidfuzz implementation

In [15]:
# rapidfuzz scores for sample 1 vs. sample 2. ``.loc[[0, 1], ...]`` selects the
# two addresses in that order and ``*`` passes them as the two arguments.
print('Ratio:', rfuzz.ratio(*df.loc[[0, 1], 'working_address']))
print('Partial ratio:', rfuzz.partial_ratio(*df.loc[[0, 1], 'working_address']))
print('Token sort ratio:', rfuzz.token_sort_ratio(*df.loc[[0, 1], 'working_address']))
print('Token set ratio:', rfuzz.token_set_ratio(*df.loc[[0, 1], 'working_address']))

Ratio: 57.14285714285714
Partial ratio: 65.93406593406593
Token sort ratio: 55.46218487394958
Token set ratio: 58.8235294117647


In [16]:
# Same rapidfuzz metrics with the arguments swapped (sample 2 vs. sample 1),
# to check whether argument order changes the scores.
print('Ratio:', rfuzz.ratio(*df.loc[[1, 0], 'working_address']))
print('Partial ratio:', rfuzz.partial_ratio(*df.loc[[1, 0], 'working_address']))
print('Token sort ratio:', rfuzz.token_sort_ratio(*df.loc[[1, 0], 'working_address']))
print('Token set ratio:', rfuzz.token_set_ratio(*df.loc[[1, 0], 'working_address']))

Ratio: 57.14285714285714
Partial ratio: 65.93406593406593
Token sort ratio: 55.46218487394958
Token set ratio: 58.8235294117647


### Jaro Winkler

In [17]:
# Jaro-Winkler similarity in both argument orders (sample 1 vs. 2, then 2 vs. 1).
print('Jaro winkler 1:', jaro.jaro_winkler_metric(*df.loc[[0, 1], 'working_address']))
print('Jaro winkler 2:', jaro.jaro_winkler_metric(*df.loc[[1, 0], 'working_address']))

Jaro winkler 1: 0.6764432336154215
Jaro winkler 2: 0.6764432336154215


In [18]:
# Working addresses that contain the substring 'bay corporate'.
# regex=False treats the pattern as a plain substring. na=False makes rows
# with a missing working_address count as non-matches; without it the mask
# would contain NaN for those rows and .loc would refuse to use it.
goodmans_series = df.loc[
    df['working_address'].str.contains('bay corporate', regex=False, na=False),
    'working_address',
]
goodmans_series

9                                               ground floor goodmans bay corporate ce po box n 3933 nassau bahamas
63                                               goodmans bay corporate centre west bay po box n3015 nassau bahamas
100                                     goodmans bay corporate centre po box cb10976 west bay street nassau bahamas
116                                           goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas
248                            goodmans bay corporate centre po box ss5498 suite 261 west bay street nassau bahamas
268                                                     goodmans bay corporate centre po box cb12407 nassau bahamas
548                              second floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas
756                                      goodman0s bay corporate centre west bay street po box n4938 nassau bahamas
758                                       goodmans bay corporate center 

In [19]:
goodmans_series.shape

(37,)

In [20]:
goodmans_series.iloc[0]

'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'

### process.extract

In [20]:
process.extract(goodmans_series.iloc[0], goodmans_series, limit=10)

[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('second floor goodmans bay corporate centre', 86, 1197),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  86,
  1800),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  86,
  2133),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 81, 268),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  81,
  548),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 81, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  79,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  79,
  2120)]

In [21]:
# Rank every 'bay corporate' address against the first one under each
# fuzzywuzzy scorer, plus Jaro-Winkler, to compare how the metrics order them.
for scorer in (
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio,
    fuzz.token_set_ratio,
    jaro.jaro_winkler_metric,
):
    print(scorer)  # print() applies str() itself
    display(
        process.extract(
            goodmans_series.iloc[0],
            goodmans_series,
            scorer=scorer,
            limit=40,
        )
    )
    print('\n')

<function ratio at 0x7fbebd75d5e0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 78, 2068),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  77,
  548),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 76, 268),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1610),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  76,
  1707),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75,
  2120),
 ('goodmans bay corporate cen



<function partial_ratio at 0x7fbebd75d820>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 87, 2068),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 85, 268),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  63),
 ('second floor goodmans bay corporate centre', 83, 1197),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  2120),
 ('goodmans bay corporate centre west bay street  nassau  bahamas', 78, 765),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  77,
  1707),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  76,
  2133),
 ('goodmans bay corporate centre west bay street nassau the bahamas', 75, 766),
 ('co third fl goodmans bay corporate centre west bay street nassau b



<function token_sort_ratio at 0x7fbebd75da60>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 75, 268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 75, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  74,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  74,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  73,
  1707),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  72,
  548),
 ('goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  71,
  116),
 ('goodmans bay corporate centre third floor west bay street nassau bahamas',
  71,
  762),
 ('goodmans bay corporate centre first floor west bay street nassau bahamas',
  71,
  1667),
 ('goodman0s bay corporate centre wes



<function token_set_ratio at 0x7fbebd75dca0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 85, 268),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  85,
  548),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  85,
  1800),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 85, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  81,
  1707),
 ('goodmans bay corporate centre west bay street  nassau  bahamas', 80, 765),
 ('second floor goodmans bay corporate centre', 80, 1197),
 ('goodmans bay corporate centre po box cb10976 west



<function jaro_winkler_metric at 0x7fbebd767310>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  0.7761178851505186,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  0.775915536971176,
  2068),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1610),
 ('second floor goodmans bay corporate centre', 0.7660186839291317, 1197),
 ('goodmans bay corporate center west bay street po box n4938 nassau bahamas',
  0.7631546664657964,
  1524),
 ('goodman0s bay corporate centre west bay street po box n4938 nassau bahamas',
  0.7599891722044119,
  756),
 ('goodmans b





In [22]:
# Same ranking as the fuzzywuzzy loop, using the rapidfuzz scorers (plus
# Jaro-Winkler) through the same process.extract call.
for scorer in (
    rfuzz.ratio,
    rfuzz.partial_ratio,
    rfuzz.token_sort_ratio,
    rfuzz.token_set_ratio,
    jaro.jaro_winkler_metric,
):
    print(scorer)  # print() applies str() itself
    display(
        process.extract(
            goodmans_series.iloc[0],
            goodmans_series,
            scorer=scorer,
            limit=40,
        )
    )
    print('\n')

<cyfunction ratio at 0x7fbebd70e860>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  77.77777777777779,
  2068),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  77.33333333333333,
  548),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  76.19047619047619,
  268),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  76.11940298507463,
  1707),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1610),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75.1879699



<cyfunction partial_ratio at 0x7fbebd70e930>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('second floor goodmans bay corporate centre', 87.5, 1197),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  86.72566371681415,
  2068),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  84.95575221238938,
  268),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.33333333333334,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.33333333333334,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  78.125,
  1707),
 ('goodmans bay corporate centre west bay street  nassau  bahamas',
  77.58620689655173,
  765),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  76.11940298507463,
  2133),
 ('co third fl goodmans bay corporate



<cyfunction token_sort_ratio at 0x7fbebd70ead0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  74.60317460317461,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  74.60317460317461,
  2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  73.6842105263158,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  73.6842105263158,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  73.13432835820896,
  1707),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  72.48322147651007,
  548),
 ('goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  70.58823529411764,
  116),
 ('goodmans bay corporate centre third floor west bay street nassau bahamas',
  70.50359712230217,
  762),




<cyfunction token_set_ratio at 0x7fbebd70eba0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  85.47008547008546,
  548),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  85.47008547008546,
  1800),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  85.4368932038835,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  85.4368932038835,
  2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.01886792452831,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.01886792452831,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  81.13207547169812,
  1707),
 ('second floor goodmans bay corporate centre', 80.0, 1197),
 ('goodmans b



<function jaro_winkler_metric at 0x7fbebd767310>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  0.7761178851505186,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  0.775915536971176,
  2068),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1610),
 ('second floor goodmans bay corporate centre', 0.7660186839291317, 1197),
 ('goodmans bay corporate center west bay street po box n4938 nassau bahamas',
  0.7631546664657964,
  1524),
 ('goodman0s bay corporate centre west bay street po box n4938 nassau bahamas',
  0.7599891722044119,
  756),
 ('goodmans b





## Solutioning

I'm trying to use fuzzy matching to identify and resolve duplicates. Having not worked with this before, I want to see the match values so that I can determine an appropriate threshold. To do this, I need to come up with a way to process the values, store the data, and analyze it.

### Metrics

The Goodman's Bay Corporate Centre example above returned the following score ranges (highest - lowest) for each metric, which makes me think that some metrics are better suited to certain use cases than others:

**Fuzzywuzzy**

- Ratio: 100 - 58
- Partial ratio: 100 - 51
- Token sort: 100 - 60
- Token set: 100 - 66
- Jaro-Winkler: 1.0 - 0.65

**Rapidfuzz**

- Ratio: 100 - 57.8
- Partial ratio: 100 - 58.6
- Token sort: 100 - 59.8
- Token set: 100 - 65.7
- Jaro-Winkler: 1.0 - 0.65

### Storage format

From the Goodman's Bay Corporate Centre results, the relevant information I think I'll need includes:

<table>
    <tr>
        <td>address_index</td>
        <td>address</td>
        <td>match_index</td>
        <td>match</td>
        <td>ratio_score</td>
        <td>partial_ratio_score</td>
        <td>token_sort_score</td>
        <td>token_set_score</td>
        <td>jaro_winkler_score</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1975</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>100</td>
        <td>100</td>
        <td>100</td>
        <td>100</td>
        <td>1.0</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>2068</td>
        <td>'goodmans bay corporate centre po box cb10976 nassau bahamas'</td>
        <td>78</td>
        <td>87</td>
        <td>75</td>
        <td>85</td>
        <td>0.78</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>548</td>
        <td>'second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas'</td>
        <td>77</td>
        <td>69</td>
        <td>72</td>
        <td>85</td>
        <td>0.74</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>268</td>
        <td>'goodmans bay corporate centre po box cb12407 nassau bahamas'</td>
        <td>76</td>
        <td>85</td>
        <td>75</td>
        <td>85</td>
        <td>0.78</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1511</td>
        <td>'goodmans bay corporate centre west bay street po box n3933 nassau bahamas'</td>
        <td>76</td>
        <td>70</td>
        <td>70</td>
        <td>79</td>
        <td>0.77</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1707</td>
        <td>'first floor goodmans bay corporate centre bay street nassau bahamas'</td>
        <td>76</td>
        <td>77</td>
        <td>73</td>
        <td>81</td>
        <td>0.73</td>
    </tr>
</table>

The next question is: how do I get all of this information into a dataframe?

The `process.extract` function I tried earlier limits the number of results returned, not their quality: it returns the top `limit` matches regardless of how low their scores are. With the address datasets I'll be working with, there are potentially large groups of related data (especially when I get into countries with large representation in the original dataset). Because of this, I won't be using the `process.extract` function. I'm going to prototype using a smaller, easier dataset: the word frequency dataset.

In [21]:
freq_df.sort_index()

Unnamed: 0,word,count
0,annex,8
1,frederick,30
2,and,627
3,shirley,489
4,street,1128
...,...,...
2035,sterline,1
2036,bav,1
2037,hast,1
2038,coj,1


In [24]:
freq_df['word'].sort_index()[:5]

0        annex
1    frederick
2          and
3      shirley
4       street
Name: word, dtype: object

In [25]:
# One hand-built record for a single pair. The 0 and 1 are row positions in
# freq_df as it is currently ordered (sorted by count), not index labels.
# The trailing '' keeps the blank line that ended the original output.
print('\n'.join([
    f'Original index: {0}',
    f'Original value: {freq_df.iloc[0, 0]}',
    f'Match index: {1}',
    f'Match value: {freq_df.iloc[1, 0]}',
    f'Metric output: {rfuzz.ratio(freq_df.iloc[0, 0], freq_df.iloc[1, 0])}',
    '',
]))

Original index: 0
Original value: bahamas
Match index: 1
Match value: nassau
Metric output: 30.76923076923077



In [26]:
# Prototype of the pairwise record on the first five words (in index order).
# The sample is built once up front; it was previously re-sorted and re-sliced
# on every pass of the outer loop. ``.iloc[:5]`` makes the positional slice explicit.
sample_words = freq_df['word'].sort_index().iloc[:5]

for o_i, o_v in enumerate(sample_words):
    for m_i, m_v in enumerate(sample_words):
        if o_i != m_i:  # skip comparing a word with itself
            print([
                o_i, o_v, m_i, m_v,
                rfuzz.ratio(o_v, m_v),
                rfuzz.partial_ratio(o_v, m_v),
                rfuzz.token_sort_ratio(o_v, m_v),
                rfuzz.token_set_ratio(o_v, m_v),
                jaro.jaro_winkler_metric(o_v, m_v),
            ])
            

[0, 'annex', 1, 'frederick', 14.28571428571429, 25.0, 14.28571428571429, 14.285714285714292, 0.43703703703703706]
[0, 'annex', 2, 'and', 50.0, 80.0, 50.0, 50.0, 0.6888888888888888]
[0, 'annex', 3, 'shirley', 16.666666666666664, 28.57142857142857, 16.666666666666664, 16.66666666666667, 0.44761904761904764]
[0, 'annex', 4, 'street', 18.181818181818176, 28.57142857142857, 18.181818181818176, 18.181818181818187, 0.45555555555555555]
[1, 'frederick', 0, 'annex', 14.28571428571429, 25.0, 14.28571428571429, 14.285714285714292, 0.43703703703703706]
[1, 'frederick', 2, 'and', 16.666666666666664, 33.333333333333336, 16.666666666666664, 16.66666666666667, 0.48148148148148145]
[1, 'frederick', 3, 'shirley', 25.0, 40.0, 25.0, 25.0, 0.5026455026455027]
[1, 'frederick', 4, 'street', 40.0, 54.54545454545454, 40.0, 40.0, 0.611111111111111]
[2, 'and', 0, 'annex', 50.0, 80.0, 50.0, 50.0, 0.6888888888888888]
[2, 'and', 1, 'frederick', 16.666666666666664, 33.333333333333336, 16.666666666666664, 16.66666666

In [93]:
# NOTE: superseded first version of calc_fuzz_df, kept commented out for reference.
# It appends every pair of distinct rows, including pairs where all five scores
# are 0. The active definition later in the notebook has the same body plus a
# filter that drops those all-zero pairs.
# def calc_fuzz_df(df, column):
#     row_list = []
    
#     for o_i, o_v in enumerate(df[column].sort_index()):
#         for m_i, m_v in enumerate(df[column].sort_index()):
#             if o_i != m_i:
#                 dict1 = {
#                     'original_index': o_i,
#                     'original_value': o_v,
#                     'match_index': m_i,
#                     'match_value': m_v,
#                     'ratio_score': rfuzz.ratio(o_v, m_v),
#                     'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
#                     'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
#                     'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
#                     'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
#                 }
#                 row_list.append(dict1)
#     score_df = pd.DataFrame(row_list)
        
#     return score_df

In [94]:
# fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
# fuzzy_words_df

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
0,0,annex,1,frederick,14.285714,25.000000,14.285714,14.285714,0.437037
1,0,annex,2,and,50.000000,80.000000,50.000000,50.000000,0.688889
2,0,annex,3,shirley,16.666667,28.571429,16.666667,16.666667,0.447619
3,0,annex,4,street,18.181818,28.571429,18.181818,18.181818,0.455556
4,0,annex,5,po,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
4370185,2090,2ntl,2085,montagne,33.333333,50.000000,33.333333,33.333333,0.583333
4370186,2090,2ntl,2086,sterline,33.333333,50.000000,33.333333,33.333333,0.583333
4370187,2090,2ntl,2087,bav,0.000000,0.000000,0.000000,0.000000,0.000000
4370188,2090,2ntl,2088,hast,25.000000,40.000000,25.000000,25.000000,0.500000


In [95]:
# fuzzy_words_df.describe()

Unnamed: 0,original_index,match_index,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
count,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0
mean,1045.0,1045.0,14.00943,22.4697,14.00803,14.00794,0.2240348
std,603.6197,603.6197,15.02418,22.66403,15.02212,15.02198,0.2557097
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,522.0,522.0,0.0,0.0,0.0,0.0,0.0
50%,1045.0,1045.0,14.28571,25.0,14.28571,14.28571,0.0
75%,1568.0,1568.0,23.52941,40.0,23.52941,23.52941,0.4666667
max,2090.0,2090.0,96.2963,100.0,100.0,100.0,0.9857143


In [97]:
# fuzzy_words_df[(fuzzy_words_df['ratio_score']==0) & (fuzzy_words_df['partial_ratio_score']==0) & (fuzzy_words_df['token_sort_score']==0) & (fuzzy_words_df['token_set_score']==0) & (fuzzy_words_df['jaro_winkler_score']==0)]

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
4,0,annex,5,po,0.0,0.0,0.0,0.0,0.0
11,0,annex,12,court,0.0,0.0,0.0,0.0,0.0
14,0,annex,15,lyford,0.0,0.0,0.0,0.0,0.0
22,0,annex,23,third,0.0,0.0,0.0,0.0,0.0
23,0,annex,24,floor,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4370177,2090,2ntl,2077,rbc,0.0,0.0,0.0,0.0,0.0
4370182,2090,2ntl,2082,hamas,0.0,0.0,0.0,0.0,0.0
4370184,2090,2ntl,2084,zh,0.0,0.0,0.0,0.0,0.0
4370187,2090,2ntl,2087,bav,0.0,0.0,0.0,0.0,0.0


In [22]:
def calc_fuzz_df(df, column):
    """Score every ordered pair of distinct entries in ``df[column]``.

    The column is read once in ``sort_index()`` order. Each entry is compared
    with every other entry using four rapidfuzz scorers (``ratio``,
    ``partial_ratio``, ``token_sort_ratio``, ``token_set_ratio``) and
    ``jaro.jaro_winkler_metric``. Pairs for which all five scores are zero
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the values to compare.
    column : str
        Name of the column in ``df`` whose values are compared.

    Returns
    -------
    pandas.DataFrame
        One row per retained ordered pair, with the columns
        ``original_index``, ``original_value``, ``match_index``,
        ``match_value`` and the five ``*_score`` columns.
        ``original_index`` / ``match_index`` are enumeration positions in
        the index-sorted column, not index labels. The Jaro-Winkler score is
        returned exactly as ``jaro.jaro_winkler_metric`` produces it; no
        rescaling happens here. The columns are present even when no pair is
        retained, so downstream column lookups keep working.
    """
    result_columns = [
        'original_index',
        'original_value',
        'match_index',
        'match_value',
        'ratio_score',
        'partial_ratio_score',
        'token_sort_score',
        'token_set_score',
        'jaro_winkler_score',
    ]

    # Sort and materialise the column once; it is invariant across both loops.
    values = df[column].sort_index().tolist()

    row_list = []
    for o_i, o_v in enumerate(values):
        for m_i, m_v in enumerate(values):
            # Never score an entry against itself.
            if o_i == m_i:
                continue
            scores = {
                'ratio_score': rfuzz.ratio(o_v, m_v),
                'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
                'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
                'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
                'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v),
            }
            # Keep the pair only if at least one metric sees some similarity.
            if any(score > 0 for score in scores.values()):
                row_list.append({
                    'original_index': o_i,
                    'original_value': o_v,
                    'match_index': m_i,
                    'match_value': m_v,
                    **scores,
                })

    # Explicit columns keep the schema stable when row_list is empty.
    return pd.DataFrame(row_list, columns=result_columns)

In [23]:
# Score every pair of distinct words in freq_df, then multiply the
# Jaro-Winkler column by 100 so it can be thresholded like the other scores.
fuzzy_words_df = calc_fuzz_df(freq_df, 'word').assign(
    jaro_winkler_score=lambda scores: scores['jaro_winkler_score'] * 100
)
fuzzy_words_df

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
0,0,annex,1,frederick,14.285714,25.000000,14.285714,14.285714,43.703704
1,0,annex,2,and,50.000000,80.000000,50.000000,50.000000,68.888889
2,0,annex,3,shirley,16.666667,28.571429,16.666667,16.666667,44.761905
3,0,annex,4,street,18.181818,28.571429,18.181818,18.181818,45.555556
4,0,annex,6,box,25.000000,50.000000,25.000000,25.000000,0.000000
...,...,...,...,...,...,...,...,...,...
2275457,2039,2ntl,2030,tortola,36.363636,50.000000,36.363636,36.363636,59.523810
2275458,2039,2ntl,2032,switzerland,26.666667,33.333333,26.666667,26.666667,56.060606
2275459,2039,2ntl,2034,montagne,33.333333,50.000000,33.333333,33.333333,58.333333
2275460,2039,2ntl,2035,sterline,33.333333,50.000000,33.333333,33.333333,58.333333


In [55]:
# For each word (identified by original_index), count how many candidate
# matches score above 60 under each similarity metric.
index_col = 'original_index'
metric_cts = pd.DataFrame({index_col: fuzzy_words_df[index_col].unique()})

for metric in (
    'ratio_score',
    'partial_ratio_score',
    'token_sort_score',
    'token_set_score',
    'jaro_winkler_score',
):
    above_threshold = fuzzy_words_df[metric] > 60
    met_df = (
        fuzzy_words_df.loc[above_threshold, [index_col, metric]]
        .groupby(index_col)
        .count()
        .reset_index()
    )
    # Outer merge: words with no match above the threshold end up with NaN.
    metric_cts = pd.merge(metric_cts, met_df, on=index_col, how='outer')

In [56]:
# Attach each word's text to its per-metric counts, then rename the count
# columns from "<metric>_score" to "<metric>_match_ct".
# NOTE(review): this cell reassigns metric_cts from itself, so it is meant to
# run exactly once after the counting cell.
metric_cts = pd.merge(
    fuzzy_words_df[[index_col, 'original_value']].drop_duplicates(),
    metric_cts,
    on=index_col,
    how='outer',
)
metric_cts.columns = [
    'original_index',
    'original_value',
    'ratio_match_ct',
    'partial_ratio_match_ct',
    'token_sort_match_ct',
    'token_set_match_ct',
    'jaro_winkler_match_ct',
]
metric_cts

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct,jaro_winkler_match_ct
0,0,annex,5.0,58,5.0,5.0,55
1,1,frederick,3.0,63,3.0,3.0,67
2,2,and,14.0,239,14.0,14.0,133
3,3,shirley,19.0,60,19.0,19.0,122
4,4,street,18.0,81,18.0,18.0,179
...,...,...,...,...,...,...,...
2035,2035,sterline,21.0,127,21.0,21.0,238
2036,2036,bav,7.0,98,7.0,7.0,91
2037,2037,hast,6.0,110,6.0,6.0,114
2038,2038,coj,1.0,100,1.0,1.0,84


In [223]:
# metric_cts[metric_cts['original_value'].str.contains('^n\d+|cb\d+|no\d+|\d+$')]

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct,jaro_winkler_match_ct
7,7,n4805,97.0,238.0,97.0,97.0,355.0
14,14,n8188,66.0,225.0,66.0,66.0,314.0
18,18,n7785,81.0,198.0,81.0,81.0,274.0
19,19,n3708,89.0,271.0,89.0,89.0,374.0
25,25,n3024,100.0,267.0,100.0,100.0,385.0
26,26,303,31.0,122.0,31.0,31.0,248.0
27,27,n492,75.0,237.0,75.0,75.0,335.0
31,31,ss19084,25.0,124.0,25.0,25.0,342.0
37,37,cb12399,52.0,143.0,52.0,52.0,311.0
43,43,n4875,115.0,268.0,115.0,115.0,341.0


In [224]:
# metric_cts[metric_cts['original_value'].str.contains('^n\d+|cb\d+|no\d+|\d+$')].shape

(785, 7)

In [228]:
# metric_cts[metric_cts['original_value'].str.contains('^n\d+|cb\d+|no\d+|^\d+$')].shape

(629, 7)

In [225]:
# metric_cts[metric_cts['original_value'].str.contains('\d+$')].shape

(784, 7)

In [227]:
# metric_cts[metric_cts['original_value'].str.contains('^\d+$')].shape

(247, 7)

In [59]:
# Drop code-like tokens: "n" followed by digits, any two word characters
# followed by digits, and purely numeric tokens.
# The pattern is a raw string so that \d and \w reach the regex engine
# without relying on Python's deprecated handling of invalid string escapes.
metric_cts = metric_cts[~metric_cts['original_value'].str.contains(r'^n\d+|^\w\w\d+|^\d+$')]
metric_cts

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct,jaro_winkler_match_ct
0,0,annex,5.0,58,5.0,5.0,55
1,1,frederick,3.0,63,3.0,3.0,67
2,2,and,14.0,239,14.0,14.0,133
3,3,shirley,19.0,60,19.0,19.0,122
4,4,street,18.0,81,18.0,18.0,179
...,...,...,...,...,...,...,...
2035,2035,sterline,21.0,127,21.0,21.0,238
2036,2036,bav,7.0,98,7.0,7.0,91
2037,2037,hast,6.0,110,6.0,6.0,114
2038,2038,coj,1.0,100,1.0,1.0,84


In [60]:
# Words with the most ratio-score matches first; keep at most the top 1000 rows.
metric_cts.sort_values(by='ratio_match_ct', ascending=False).iloc[:1000]

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct,jaro_winkler_match_ct
1137,1137,ste,24.0,243,24.0,24.0,149
516,516,bahams,24.0,71,24.0,24.0,83
1136,1136,slite,23.0,94,23.0,23.0,156
1580,1580,steret,23.0,113,23.0,23.0,198
662,662,center,23.0,100,23.0,23.0,189
659,659,bhamas,23.0,76,23.0,23.0,94
1226,1226,charlote,23.0,92,23.0,23.0,230
904,904,bahmas,22.0,68,22.0,22.0,94
1969,1969,stre,22.0,205,22.0,22.0,148
464,464,suites,22.0,80,22.0,22.0,169


In [28]:
# Same descending ranking by ratio-score matches, showing the last 400 rows.
metric_cts.sort_values(by='ratio_match_ct', ascending=False).iloc[-400:]

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct,jaro_winkler_match_ct
971,971,bitco,13.0,155,13.0,13.0,359
765,765,ee15269,13.0,114,13.0,13.0,333
581,581,ab20763,13.0,103,13.0,13.0,285
73,73,marlborough,13.0,99,13.0,13.0,363
970,970,bit,13.0,136,13.0,13.0,232
705,705,halsbury,13.0,89,13.0,13.0,390
74,74,queen,13.0,92,13.0,13.0,314
1181,1181,2union,13.0,122,13.0,13.0,332
1259,1259,wueen,13.0,107,13.0,13.0,342
1460,1460,olimpia,13.0,98,13.0,13.0,327


In [51]:
# Addresses whose word list contains 'st'.
df.loc[df['address_wordlist'].map(lambda tokens: 'st' in tokens)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
450,24000451,"ST. MALCOLM BUILDING, VICTORIA & BAY STS, P.O. BOX SS-6738, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,st malcolm building victoria and bay street po box ss6738 nassau bahamas,"[st, malcolm, building, victoria, and, bay, street, po, box, ss6738, nassau, bahamas]"
960,14078480,St. Andrew's Court; Frederick Street Steps; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,st andrews court frederick street steps nassau bahamas,"[st, andrews, court, frederick, street, steps, nassau, bahamas]"
961,14078481,St. Andrew's Court; Frederick Street Steps; P. O. Box N-4805; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,st andrews court frederick street steps p o box n4805 nassau bahamas,"[st, andrews, court, frederick, street, steps, p, o, box, n4805, nassau, bahamas]"
1935,33000132,"ST ANDREW'S COURT FREDERICK ST STEPS PO BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,st andrews court frederick street steps po box n4805 nassau bahamas,"[st, andrews, court, frederick, street, steps, po, box, n4805, nassau, bahamas]"
2161,120015591,"ST. ANDREW'S COURT FREDERICK STREET, STEPS, NASSAU BAHAMAS","ST. ANDREW'S COURT FREDERICK STREET, STEPS, NASSAU BAHAMAS",Bahamas,BHS,Paradise Papers - Barbados corporate registry,Barbados corporate registry data is current through 2016,,st andrews court frederick street steps nassau bahamas,"[st, andrews, court, frederick, street, steps, nassau, bahamas]"


In [54]:
# Addresses whose word list contains 'corporate'.
df.loc[df['address_wordlist'].map(lambda tokens: 'corporate' in tokens)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
9,24000010,"GROUND FLOOR, GOODMAN'S BAY CORPORATE CE, P.O. BOX N 3933, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,ground floor goodmans bay corporate ce po box n 3933 nassau bahamas,"[ground, floor, goodmans, bay, corporate, ce, po, box, n, 3933, nassau, bahamas]"
63,24000064,"GOODMAN'S BAY CORPORATE CENTRE WEST BAY, P.O. BOX N-3015, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans bay corporate centre west bay po box n3015 nassau bahamas,"[goodmans, bay, corporate, centre, west, bay, po, box, n3015, nassau, bahamas]"
100,24000101,"GOODMAN'S BAY CORPORATE CENTRE, P.O. BOX CB-10976 WEST BAY STREET, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans bay corporate centre po box cb10976 west bay street nassau bahamas,"[goodmans, bay, corporate, centre, po, box, cb10976, west, bay, street, nassau, bahamas]"
116,24000117,"GOODMANS BAY CORPORATE CENTRE SUITE 261, P.O. BOX CB-12762, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas,"[goodmans, bay, corporate, centre, suite, 261, po, box, cb12762, nassau, bahamas]"
227,24000228,"C/O H & J CORPORATE SERVICES LTD., P.O. BOX CB-13278, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,co h and j corporate services ltd po box cb13278 nassau bahamas,"[co, h, and, j, corporate, services, ltd, po, box, cb13278, nassau, bahamas]"
248,24000249,"GOODMAN'S BAY CORPORATE CENTRE, P.O. BOX SS-5498 SUITE# 261 WEST BAY ST, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans bay corporate centre po box ss5498 suite 261 west bay street nassau bahamas,"[goodmans, bay, corporate, centre, po, box, ss5498, suite, 261, west, bay, street, nassau, bahamas]"
268,24000269,"GOODMANS BAY CORPORATE CENTRE, P.O. BOX CB-12407, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans bay corporate centre po box cb12407 nassau bahamas,"[goodmans, bay, corporate, centre, po, box, cb12407, nassau, bahamas]"
301,24000302,"P.O. BOX SS-5800, PRIDEROCK CORPORATE CENTRE, SUITE 200, BAY & EAST STS., NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box ss5800 priderock corporate centre suite 200 bay and east street nassau bahamas,"[po, box, ss5800, priderock, corporate, centre, suite, 200, bay, and, east, street, nassau, bahamas]"
476,24000477,"GOODMAN'S CORPORATE CENTER, WEST BAY ST., P.O. BOX SP-61567, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,goodmans corporate center west bay street po box sp61567 nassau bahamas,"[goodmans, corporate, center, west, bay, street, po, box, sp61567, nassau, bahamas]"
483,24000484,"SUITE 6A JASMINE CORPORATE CENTRE, P.O. BOX F-402823 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite 6a jasmine corporate centre po box f402823 freeport grand bahama,"[suite, 6a, jasmine, corporate, centre, po, box, f402823, freeport, grand, bahama]"


In [61]:
# Addresses whose word list contains the misspelling 'bahams'.
df.loc[df['address_wordlist'].map(lambda tokens: 'bahams' in tokens)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
274,24000275,"P.O. BOX N 8680, NASSAU, BAHAMS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n 8680 nassau bahams,"[po, box, n, 8680, nassau, bahams]"
559,14018044,4TH FLOOR THE BAHAMAS FINANCIAL CENTRE SHIRLEY & CHARLOTTE STREET NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,fourth floor the bahamas financial centre shirley and charlotte street nassau bahams,"[fourth, floor, the, bahamas, financial, centre, shirley, and, charlotte, street, nassau, bahams]"
631,14030207,BAHAMS FINANCILA CENTRE PO BOX N-3023 SHIRLEY & CHARLOTTE STREETSNASSAU BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahams financila centre po box n3023 shirley and charlotte street nassau bahamas,"[bahams, financila, centre, po, box, n3023, shirley, and, charlotte, street, nassau, bahamas]"
826,14050608,MOSSACK FONSECA & CO (BAHAMS) LIMITED SAFFREY SQUARE; SUITE 205; BANK LANE; P.O.BOX N-8188; NASSAU; COMMONWEALTH OF THE BAHAMAS.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,mossack fonseca and co bahams limited saffrey square suite 205 bank lane po box n8188 nassau commonwealth of the bahamas,"[mossack, fonseca, and, co, bahams, limited, saffrey, square, suite, 205, bank, lane, po, box, n8188, nassau, commonwealth, of, the, bahamas]"
867,14064246,P.O.BOX N-3944; PROVIDENCE HOUSE; EAST HILL STREET; NASSAU; BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n3944 providence house east hill street nassau bahams,"[po, box, n3944, providence, house, east, hill, street, nassau, bahams]"
889,14064268,P O BOX N8188 NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n8188 nassau bahams,"[po, box, n8188, nassau, bahams]"
1910,33000104,"NASSAU, BAHAMS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,nassau bahams,"[nassau, bahams]"


In [75]:
# Addresses whose word list contains 'bhs'.
df.loc[df['address_wordlist'].map(lambda tokens: 'bhs' in tokens)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
594,14030170,Bahamas Financial Center; Charlotte & Shirley Streets; Nassau; Bahamas; BHS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahamas financial center charlotte and shirley street nassau bahamas bhs,"[bahamas, financial, center, charlotte, and, shirley, street, nassau, bahamas, bhs]"
1102,14080652,The Bahamas Financial Centre; Shirley & Charlotte Streets; Nassau; BHS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street nassau bhs,"[the, bahamas, financial, centre, shirley, and, charlotte, street, nassau, bhs]"


In [82]:
# Candidate matches for 'street' with ratio_score above 50, best first.
fuzzy_words_df.loc[
    fuzzy_words_df['original_value'].eq('street')
    & fuzzy_words_df['ratio_score'].gt(50)
].sort_values(by='ratio_score', ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
5572,4,street,1199,streeet,92.307692,90.909091,92.307692,92.307692,97.142857
5174,4,street,322,stret,90.909091,88.888889,90.909091,90.909091,96.666667
5278,4,street,607,stree,90.909091,100.0,90.909091,90.909091,96.666667
5997,4,street,1791,streeets,85.714286,90.909091,85.714286,85.714286,95.0
5906,4,street,1630,strets,83.333333,90.909091,83.333333,83.333333,93.333333
5559,4,street,1179,streer,83.333333,90.909091,83.333333,83.333333,93.333333
5874,4,street,1580,steret,83.333333,83.333333,83.333333,83.333333,95.555556
5431,4,street,947,strees,83.333333,90.909091,83.333333,83.333333,93.333333
6088,4,street,1969,stre,80.0,100.0,80.0,80.0,93.333333
5975,4,street,1744,treetops,71.428571,90.909091,71.428571,71.428571,81.944444


In [83]:
# Candidate matches for 'street' with partial_ratio_score above 50, best first.
fuzzy_words_df.loc[
    fuzzy_words_df['original_value'].eq('street')
    & fuzzy_words_df['partial_ratio_score'].gt(50)
].sort_values(by='partial_ratio_score', ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
5310,4,street,713,ee,50.000000,100.000000,50.000000,50.000000,55.555556
5979,4,street,1752,t,28.571429,100.000000,28.571429,28.571429,72.222222
5215,4,street,433,r,28.571429,100.000000,28.571429,28.571429,72.222222
5278,4,street,607,stree,90.909091,100.000000,90.909091,90.909091,96.666667
5104,4,street,123,s,28.571429,100.000000,28.571429,28.571429,75.000000
...,...,...,...,...,...,...,...,...,...
5764,4,street,1443,systems,46.153846,54.545455,46.153846,46.153846,64.285714
5763,4,street,1442,netware,46.153846,54.545455,46.153846,46.153846,53.174603
5752,4,street,1430,microsystems,33.333333,54.545455,33.333333,33.333333,50.000000
5713,4,street,1381,ventures,42.857143,54.545455,42.857143,42.857143,63.888889


In [84]:
# Candidate matches for 'street' with token_sort_score above 50, best first.
fuzzy_words_df.loc[
    fuzzy_words_df['original_value'].eq('street')
    & fuzzy_words_df['token_sort_score'].gt(50)
].sort_values(by='token_sort_score', ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
5572,4,street,1199,streeet,92.307692,90.909091,92.307692,92.307692,97.142857
5174,4,street,322,stret,90.909091,88.888889,90.909091,90.909091,96.666667
5278,4,street,607,stree,90.909091,100.0,90.909091,90.909091,96.666667
5997,4,street,1791,streeets,85.714286,90.909091,85.714286,85.714286,95.0
5906,4,street,1630,strets,83.333333,90.909091,83.333333,83.333333,93.333333
5559,4,street,1179,streer,83.333333,90.909091,83.333333,83.333333,93.333333
5874,4,street,1580,steret,83.333333,83.333333,83.333333,83.333333,95.555556
5431,4,street,947,strees,83.333333,90.909091,83.333333,83.333333,93.333333
6088,4,street,1969,stre,80.0,100.0,80.0,80.0,93.333333
5975,4,street,1744,treetops,71.428571,90.909091,71.428571,71.428571,81.944444


In [85]:
# Candidate matches for 'street' with token_set_score above 50, best first.
fuzzy_words_df.loc[
    fuzzy_words_df['original_value'].eq('street')
    & fuzzy_words_df['token_set_score'].gt(50)
].sort_values(by='token_set_score', ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
5572,4,street,1199,streeet,92.307692,90.909091,92.307692,92.307692,97.142857
5174,4,street,322,stret,90.909091,88.888889,90.909091,90.909091,96.666667
5278,4,street,607,stree,90.909091,100.0,90.909091,90.909091,96.666667
5997,4,street,1791,streeets,85.714286,90.909091,85.714286,85.714286,95.0
5906,4,street,1630,strets,83.333333,90.909091,83.333333,83.333333,93.333333
5559,4,street,1179,streer,83.333333,90.909091,83.333333,83.333333,93.333333
5874,4,street,1580,steret,83.333333,83.333333,83.333333,83.333333,95.555556
5431,4,street,947,strees,83.333333,90.909091,83.333333,83.333333,93.333333
6088,4,street,1969,stre,80.0,100.0,80.0,80.0,93.333333
5975,4,street,1744,treetops,71.428571,90.909091,71.428571,71.428571,81.944444


In [86]:
# Candidate matches for 'street' with jaro_winkler_score above 50, best first.
fuzzy_words_df.loc[
    fuzzy_words_df['original_value'].eq('street')
    & fuzzy_words_df['jaro_winkler_score'].gt(50)
].sort_values(by='jaro_winkler_score', ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
5572,4,street,1199,streeet,92.307692,90.909091,92.307692,92.307692,97.142857
5174,4,street,322,stret,90.909091,88.888889,90.909091,90.909091,96.666667
5278,4,street,607,stree,90.909091,100.000000,90.909091,90.909091,96.666667
5874,4,street,1580,steret,83.333333,83.333333,83.333333,83.333333,95.555556
5997,4,street,1791,streeets,85.714286,90.909091,85.714286,85.714286,95.000000
...,...,...,...,...,...,...,...,...,...
5792,4,street,1479,consultores,47.058824,54.545455,47.058824,47.058824,50.505051
5651,4,street,1305,consultants,35.294118,36.363636,35.294118,35.294118,50.505051
5556,4,street,1171,redomiciled,35.294118,50.000000,35.294118,35.294118,50.505051
5695,4,street,1357,association,23.529412,33.333333,23.529412,23.529412,50.505051


In [92]:
# For every candidate whose Jaro-Winkler score against 'street' exceeds 75
# (best first), print the candidate and show the addresses containing it.
# NOTE(review): the saved output of this cell lists 'bahamas' variants, so it
# appears to come from a run with a different target word - re-run to confirm.
word_list = (
    fuzzy_words_df.loc[
        fuzzy_words_df['original_value'].eq('street')
        & fuzzy_words_df['jaro_winkler_score'].gt(75)
    ]
    .sort_values(by='jaro_winkler_score', ascending=False)['match_value']
)
for word in word_list:
    print(word)
    display(df.loc[df['address_wordlist'].map(lambda tokens: word in tokens)])
    print('\n')

bahamasc


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
570,14018521,51 Frederick Street; P.O. Box N-1136; Nassau; BahamasC,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,51 frederick street po box n1136 nassau bahamasc,"[51, frederick, street, po, box, n1136, nassau, bahamasc]"




bahamasa


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1442,240054,"Winterbotham Place Marlborough & Queen Streets PO Box CB 11343 Nassau, Bahamasa",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,winterbotham place marlborough and queen street po box cb 11343 nassau bahamasa,"[winterbotham, place, marlborough, and, queen, street, po, box, cb, 11343, nassau, bahamasa]"




bahamas6


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1041,14080001,SUITE E-2; UNION COURT BUILDING; ELIZABETH AVENUE AND SHIRLEY STREET; NASSAU; THE BAHAMAS6,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street nassau the bahamas6,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, nassau, the, bahamas6]"




bahamaas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
834,14051201,"NASSAU, BAHAMAAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,nassau bahamaas,"[nassau, bahamaas]"




bahamas1


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
587,14028501,Atlantic House; 3rd Floor; Collins Avenue & 2nd Terrace; Nassau; Bahamas1,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,atlantic house third floor collins avenue and second terrace nassau bahamas1,"[atlantic, house, third, floor, collins, avenue, and, second, terrace, nassau, bahamas1]"
1177,14085238,WINTERBOTHAM PLACE; MARLBOROUGH & QUEEN STREETS; P.O. BOX N-7523; NASSAU; BAHAMAS1,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,winterbotham place marlborough and queen street po box n7523 nassau bahamas1,"[winterbotham, place, marlborough, and, queen, street, po, box, n7523, nassau, bahamas1]"




bahama


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
76,24000077,"P.O. BOX F-40773, FREEPORT, GR. BAHAMA 242-352-7291",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box f40773 freeport gr bahama 2423527291,"[po, box, f40773, freeport, gr, bahama, 2423527291]"
79,24000080,"REGENT CENTRE, P.O. BOX F-40132 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,regent centre po box f40132 freeport grand bahama,"[regent, centre, po, box, f40132, freeport, grand, bahama]"
83,24000084,"CHANCERY HOUSE, P.O. BOX F-42578 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,chancery house po box f42578 freeport grand bahama,"[chancery, house, po, box, f42578, freeport, grand, bahama]"
87,24000088,"CHANCERY COURT THE MALL, P.O. BOX F-42643 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,chancery court the mall po box f42643 freeport grand bahama,"[chancery, court, the, mall, po, box, f42643, freeport, grand, bahama]"
104,24000105,"SUITE A, REGENT CENTRE, P.O. BOX F-42682 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite a regent centre po box f42682 freeport grand bahama,"[suite, a, regent, centre, po, box, f42682, freeport, grand, bahama]"
...,...,...,...,...,...,...,...,...,...,...
2083,33000290,"REGENT CENTRE PO BOX F-40132 FREEPORT, GR BAHAMA, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,regent centre po box f40132 freeport gr bahama bahamas,"[regent, centre, po, box, f40132, freeport, gr, bahama, bahamas]"
2084,33000291,"REGENT CENTRE PO BOX F-40132 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,regent centre po box f40132 freeport grand bahama,"[regent, centre, po, box, f40132, freeport, grand, bahama]"
2085,33000293,"SUITE 10 SEVENTEEN CENTRE, BANK LANE PO BOX F-43018 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,suite 10 seventeen centre bank lane po box f43018 freeport grand bahama,"[suite, 10, seventeen, centre, bank, lane, po, box, f43018, freeport, grand, bahama]"
2091,33000299,"FIRST COMMERCIAL CENTRE SUITE 1, 2ND FL PO BOX F-42411 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,first commercial centre suite 1 second fl po box f42411 freeport grand bahama,"[first, commercial, centre, suite, 1, second, fl, po, box, f42411, freeport, grand, bahama]"




bahaams


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
725,14038328,Elizabeth Avenue and Shirley Street; Union Court Building; Suite E-2; N-8188; Nassau; Bahaams,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,elizabeth avenue and shirley street union court building suite e 2 n 8188 nassau bahaams,"[elizabeth, avenue, and, shirley, street, union, court, building, suite, e, 2, n, 8188, nassau, bahaams]"




bahams


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
274,24000275,"P.O. BOX N 8680, NASSAU, BAHAMS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n 8680 nassau bahams,"[po, box, n, 8680, nassau, bahams]"
559,14018044,4TH FLOOR THE BAHAMAS FINANCIAL CENTRE SHIRLEY & CHARLOTTE STREET NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,fourth floor the bahamas financial centre shirley and charlotte street nassau bahams,"[fourth, floor, the, bahamas, financial, centre, shirley, and, charlotte, street, nassau, bahams]"
631,14030207,BAHAMS FINANCILA CENTRE PO BOX N-3023 SHIRLEY & CHARLOTTE STREETSNASSAU BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahams financila centre po box n3023 shirley and charlotte street nassau bahamas,"[bahams, financila, centre, po, box, n3023, shirley, and, charlotte, street, nassau, bahamas]"
826,14050608,MOSSACK FONSECA & CO (BAHAMS) LIMITED SAFFREY SQUARE; SUITE 205; BANK LANE; P.O.BOX N-8188; NASSAU; COMMONWEALTH OF THE BAHAMAS.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,mossack fonseca and co bahams limited saffrey square suite 205 bank lane po box n8188 nassau commonwealth of the bahamas,"[mossack, fonseca, and, co, bahams, limited, saffrey, square, suite, 205, bank, lane, po, box, n8188, nassau, commonwealth, of, the, bahamas]"
867,14064246,P.O.BOX N-3944; PROVIDENCE HOUSE; EAST HILL STREET; NASSAU; BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n3944 providence house east hill street nassau bahams,"[po, box, n3944, providence, house, east, hill, street, nassau, bahams]"
889,14064268,P O BOX N8188 NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n8188 nassau bahams,"[po, box, n8188, nassau, bahams]"
1910,33000104,"NASSAU, BAHAMS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,nassau bahams,"[nassau, bahams]"




bahamaspo


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
878,14064257,P.O. Box N-7768; Nassau; BahamasP.O. Box N-7768; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n7768 nassau bahamaspo box n7768 nassau bahamas,"[po, box, n7768, nassau, bahamaspo, box, n7768, nassau, bahamas]"




ahamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
576,14025414,ahamas Financial Centre; 4th Floor; Shirley & Charlotte Street; Nassau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,ahamas financial centre fourth floor shirley and charlotte street nassau bahamas,"[ahamas, financial, centre, fourth, floor, shirley, and, charlotte, street, nassau, bahamas]"




bahanas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
875,14064254,P.O. Box N-7757; East Bay Street; Nassau; Bahanas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n7757 east bay street nassau bahanas,"[po, box, n7757, east, bay, street, nassau, bahanas]"




baham


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1930,33000127,"CHANCERY COURT, THE MALL PO BOX F-42519 FREEPORT, GRAND BAHAM BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,chancery court the mall po box f42519 freeport grand baham bahamas,"[chancery, court, the, mall, po, box, f42519, freeport, grand, baham, bahamas]"




bahmas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
563,14018385,50 Shirley Street; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,50 shirley street nassau bahmas,"[50, shirley, street, nassau, bahmas]"
668,14033053,c/o Morgan Trust Company of The Bahamas Limited P.O. Box N-4899; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,co morgan trust company of the bahamas limited po box n4899 nassau bahmas,"[co, morgan, trust, company, of, the, bahamas, limited, po, box, n4899, nassau, bahmas]"
749,14042830,FOURTH FLOOR; THE BAHAMAS FINANCIAL CENTRE; SHIRLEY & CHARLOTTE STREETS; P.O.BOX N-3023; NASSAU; BAHMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,fourth floor the bahamas financial centre shirley and charlotte street po box n3023 nassau bahmas,"[fourth, floor, the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassau, bahmas]"
932,14077074,Saffrey Square; Suite 205; Bank Lane; P.O. Box N-8188; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,saffrey square suite 205 bank lane po box n8188 nassau bahmas,"[saffrey, square, suite, 205, bank, lane, po, box, n8188, nassau, bahmas]"
1152,14085026,"WEST BAY STREET NASSAU, BAHMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,west bay street nassau bahmas,"[west, bay, street, nassau, bahmas]"




bah


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
495,24000496,"SHIRLEY & CHARLOTTE STS BAH. FIN. CENTRE, P.O. BOX SS-6373, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,shirley and charlotte street bah fin centre po box ss6373 nassau bahamas,"[shirley, and, charlotte, street, bah, fin, centre, po, box, ss6373, nassau, bahamas]"
760,14043538,GOODMAN S BAY CORPORATE CENTER WEST BAY STREET NASSAU BAH,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,goodman s bay corporate center west bay street nassau bah,"[goodman, s, bay, corporate, center, west, bay, street, nassau, bah]"




brahmas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1120,14080679,"The Brahmas Financial Centre, Shirley and Charlotte Streets P O Box N - 3023 Nassau Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the brahmas financial centre shirley and charlotte street po box n 3023 nassau bahamas,"[the, brahmas, financial, centre, shirley, and, charlotte, street, po, box, n, 3023, nassau, bahamas]"




bhamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
390,24000391,"P.O. BOX N-4485, NASSAU BHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n4485 nassau bhamas,"[po, box, n4485, nassau, bhamas]"




abahamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1901,33000091,NEW PROVIDENCE ABAHAMAS,,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,new providence abahamas,"[new, providence, abahamas]"




ba


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
833,14051200,Nassau-BA-Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,nassau ba bahamas,"[nassau, ba, bahamas]"
951,14077696,"Sede Nassau-BA (capital), Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,sede nassau ba capital bahamas,"[sede, nassau, ba, capital, bahamas]"




bazaar


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
549,14012324,2nd Floor; International Bazaar; Bay Street; P.O. Box N- 1612; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,second floor international bazaar bay street po box n 1612 nassau bahamas,"[second, floor, international, bazaar, bay, street, po, box, n, 1612, nassau, bahamas]"




bosham


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
199,24000200,"#6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS P.O. BOX SP 63801, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,6 bosham close camperdown heights po box sp 63801 nassau bahamas,"[6, bosham, close, camperdown, heights, po, box, sp, 63801, nassau, bahamas]"
1887,33000077,"#6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS PO BOX SP 63801, NASSAU, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,6 bosham close camperdown heights po box sp 63801 nassau bahamas,"[6, bosham, close, camperdown, heights, po, box, sp, 63801, nassau, bahamas]"
2180,240003759,"NO. 6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS NEW PROVIDENCE BAHAMAS",,Bahamas,BHS,"Pandora Papers - Alemán, Cordero, Galindo & Lee (Alcogal)",Provider data is current through 2018,,no 6 bosham close camperdown heights new providence bahamas,"[no, 6, bosham, close, camperdown, heights, new, providence, bahamas]"




hamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
2243,240492153,"MONTAGUE STERLING CENTRE. EAST BAY STREET, NASSAU, HAMAS, SWITZERLAND, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montague sterling centre east bay street nassau hamas switzerland bahamas,"[montague, sterling, centre, east, bay, street, nassau, hamas, switzerland, bahamas]"






### Bahamas

In [87]:
# Pairs whose original_value is 'bahamas' and whose ratio_score is above 60,
# listed from the highest ratio_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['ratio_score'].gt(60)]
    .sort_values('ratio_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
10611,9,bahamas,1201,bahamas6,93.333333,100.0,93.333333,93.333333,97.5
11095,9,bahamas,1912,abahamas,93.333333,100.0,93.333333,93.333333,81.547619
10428,9,bahamas,907,bahamasc,93.333333,100.0,93.333333,93.333333,97.5
10443,9,bahamas,930,bahamas1,93.333333,100.0,93.333333,93.333333,97.5
10917,9,bahamas,1603,bahamasa,93.333333,100.0,93.333333,93.333333,97.5
10551,9,bahamas,1101,bahamaas,93.333333,92.307692,93.333333,93.333333,97.5
10114,9,bahamas,188,bahama,92.307692,100.0,92.307692,92.307692,97.142857
10240,9,bahamas,516,bahams,92.307692,90.909091,92.307692,92.307692,97.142857
10307,9,bahamas,659,bhamas,92.307692,90.909091,92.307692,92.307692,85.714286
10427,9,bahamas,904,bahmas,92.307692,83.333333,92.307692,92.307692,92.777778


In [88]:
# Pairs whose original_value is 'bahamas' and whose partial_ratio_score is
# above 70, listed from the highest partial_ratio_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['partial_ratio_score'].gt(70)]
    .sort_values('partial_ratio_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
10431,9,bahamas,914,ahamas,92.307692,100.0,92.307692,92.307692,95.238095
10428,9,bahamas,907,bahamasc,93.333333,100.0,93.333333,93.333333,97.5
10611,9,bahamas,1201,bahamas6,93.333333,100.0,93.333333,93.333333,97.5
10596,9,bahamas,1184,nassaubahamas,70.0,100.0,70.0,70.0,54.304029
11095,9,bahamas,1912,abahamas,93.333333,100.0,93.333333,93.333333,81.547619
10558,9,bahamas,1125,bahamaspo,87.5,100.0,87.5,87.5,95.555556
10089,9,bahamas,123,s,25.0,100.0,25.0,25.0,0.0
10550,9,bahamas,1100,ba,44.444444,100.0,44.444444,44.444444,80.952381
10443,9,bahamas,930,bahamas1,93.333333,100.0,93.333333,93.333333,97.5
10439,9,bahamas,926,as,44.444444,100.0,44.444444,44.444444,54.761905


In [89]:
# Pairs whose original_value is 'bahamas' and whose token_sort_score is
# above 65, listed from the highest token_sort_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['token_sort_score'].gt(65)]
    .sort_values('token_sort_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
10428,9,bahamas,907,bahamasc,93.333333,100.0,93.333333,93.333333,97.5
10443,9,bahamas,930,bahamas1,93.333333,100.0,93.333333,93.333333,97.5
11095,9,bahamas,1912,abahamas,93.333333,100.0,93.333333,93.333333,81.547619
10551,9,bahamas,1101,bahamaas,93.333333,92.307692,93.333333,93.333333,97.5
10917,9,bahamas,1603,bahamasa,93.333333,100.0,93.333333,93.333333,97.5
10611,9,bahamas,1201,bahamas6,93.333333,100.0,93.333333,93.333333,97.5
10114,9,bahamas,188,bahama,92.307692,100.0,92.307692,92.307692,97.142857
10307,9,bahamas,659,bhamas,92.307692,90.909091,92.307692,92.307692,85.714286
10427,9,bahamas,904,bahmas,92.307692,83.333333,92.307692,92.307692,92.777778
10431,9,bahamas,914,ahamas,92.307692,100.0,92.307692,92.307692,95.238095


In [90]:
# Pairs whose original_value is 'bahamas' and whose token_set_score is
# above 65, listed from the highest token_set_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['token_set_score'].gt(65)]
    .sort_values('token_set_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
10428,9,bahamas,907,bahamasc,93.333333,100.0,93.333333,93.333333,97.5
10443,9,bahamas,930,bahamas1,93.333333,100.0,93.333333,93.333333,97.5
11095,9,bahamas,1912,abahamas,93.333333,100.0,93.333333,93.333333,81.547619
10551,9,bahamas,1101,bahamaas,93.333333,92.307692,93.333333,93.333333,97.5
10917,9,bahamas,1603,bahamasa,93.333333,100.0,93.333333,93.333333,97.5
10611,9,bahamas,1201,bahamas6,93.333333,100.0,93.333333,93.333333,97.5
10114,9,bahamas,188,bahama,92.307692,100.0,92.307692,92.307692,97.142857
10307,9,bahamas,659,bhamas,92.307692,90.909091,92.307692,92.307692,85.714286
10427,9,bahamas,904,bahmas,92.307692,83.333333,92.307692,92.307692,92.777778
10431,9,bahamas,914,ahamas,92.307692,100.0,92.307692,92.307692,95.238095


In [91]:
# Pairs whose original_value is 'bahamas' and whose jaro_winkler_score is
# above 75, listed from the highest jaro_winkler_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['jaro_winkler_score'].gt(75)]
    .sort_values('jaro_winkler_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
10428,9,bahamas,907,bahamasc,93.333333,100.0,93.333333,93.333333,97.5
10917,9,bahamas,1603,bahamasa,93.333333,100.0,93.333333,93.333333,97.5
10611,9,bahamas,1201,bahamas6,93.333333,100.0,93.333333,93.333333,97.5
10551,9,bahamas,1101,bahamaas,93.333333,92.307692,93.333333,93.333333,97.5
10443,9,bahamas,930,bahamas1,93.333333,100.0,93.333333,93.333333,97.5
10114,9,bahamas,188,bahama,92.307692,100.0,92.307692,92.307692,97.142857
10498,9,bahamas,1018,bahaams,85.714286,85.714286,85.714286,85.714286,97.142857
10240,9,bahamas,516,bahams,92.307692,90.909091,92.307692,92.307692,97.142857
10558,9,bahamas,1125,bahamaspo,87.5,100.0,87.5,87.5,95.555556
10431,9,bahamas,914,ahamas,92.307692,100.0,92.307692,92.307692,95.238095


In [92]:
# Collect the match_value of every pair whose original_value is 'bahamas' and
# whose jaro_winkler_score is above 75, ordered from the highest score down.
word_list = (
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('bahamas') & pairs['jaro_winkler_score'].gt(75)]
    .sort_values('jaro_winkler_score', ascending=False)
    .loc[:, 'match_value']
)

# For each collected word, print it and show the rows of df whose
# address_wordlist contains it (Python `in` test), followed by a blank gap.
for word in word_list:
    print(word)
    display(df.loc[df['address_wordlist'].map(lambda tokens: word in tokens)])
    print('\n')

bahamasc


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
570,14018521,51 Frederick Street; P.O. Box N-1136; Nassau; BahamasC,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,51 frederick street po box n1136 nassau bahamasc,"[51, frederick, street, po, box, n1136, nassau, bahamasc]"




bahamasa


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1442,240054,"Winterbotham Place Marlborough & Queen Streets PO Box CB 11343 Nassau, Bahamasa",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,winterbotham place marlborough and queen street po box cb 11343 nassau bahamasa,"[winterbotham, place, marlborough, and, queen, street, po, box, cb, 11343, nassau, bahamasa]"




bahamas6


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1041,14080001,SUITE E-2; UNION COURT BUILDING; ELIZABETH AVENUE AND SHIRLEY STREET; NASSAU; THE BAHAMAS6,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street nassau the bahamas6,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, nassau, the, bahamas6]"




bahamaas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
834,14051201,"NASSAU, BAHAMAAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,nassau bahamaas,"[nassau, bahamaas]"




bahamas1


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
587,14028501,Atlantic House; 3rd Floor; Collins Avenue & 2nd Terrace; Nassau; Bahamas1,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,atlantic house third floor collins avenue and second terrace nassau bahamas1,"[atlantic, house, third, floor, collins, avenue, and, second, terrace, nassau, bahamas1]"
1177,14085238,WINTERBOTHAM PLACE; MARLBOROUGH & QUEEN STREETS; P.O. BOX N-7523; NASSAU; BAHAMAS1,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,winterbotham place marlborough and queen street po box n7523 nassau bahamas1,"[winterbotham, place, marlborough, and, queen, street, po, box, n7523, nassau, bahamas1]"




bahama


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
76,24000077,"P.O. BOX F-40773, FREEPORT, GR. BAHAMA 242-352-7291",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box f40773 freeport gr bahama 2423527291,"[po, box, f40773, freeport, gr, bahama, 2423527291]"
79,24000080,"REGENT CENTRE, P.O. BOX F-40132 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,regent centre po box f40132 freeport grand bahama,"[regent, centre, po, box, f40132, freeport, grand, bahama]"
83,24000084,"CHANCERY HOUSE, P.O. BOX F-42578 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,chancery house po box f42578 freeport grand bahama,"[chancery, house, po, box, f42578, freeport, grand, bahama]"
87,24000088,"CHANCERY COURT THE MALL, P.O. BOX F-42643 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,chancery court the mall po box f42643 freeport grand bahama,"[chancery, court, the, mall, po, box, f42643, freeport, grand, bahama]"
104,24000105,"SUITE A, REGENT CENTRE, P.O. BOX F-42682 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite a regent centre po box f42682 freeport grand bahama,"[suite, a, regent, centre, po, box, f42682, freeport, grand, bahama]"
...,...,...,...,...,...,...,...,...,...,...
2083,33000290,"REGENT CENTRE PO BOX F-40132 FREEPORT, GR BAHAMA, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,regent centre po box f40132 freeport gr bahama bahamas,"[regent, centre, po, box, f40132, freeport, gr, bahama, bahamas]"
2084,33000291,"REGENT CENTRE PO BOX F-40132 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,regent centre po box f40132 freeport grand bahama,"[regent, centre, po, box, f40132, freeport, grand, bahama]"
2085,33000293,"SUITE 10 SEVENTEEN CENTRE, BANK LANE PO BOX F-43018 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,suite 10 seventeen centre bank lane po box f43018 freeport grand bahama,"[suite, 10, seventeen, centre, bank, lane, po, box, f43018, freeport, grand, bahama]"
2091,33000299,"FIRST COMMERCIAL CENTRE SUITE 1, 2ND FL PO BOX F-42411 FREEPORT, GRAND BAHAMA",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,first commercial centre suite 1 second fl po box f42411 freeport grand bahama,"[first, commercial, centre, suite, 1, second, fl, po, box, f42411, freeport, grand, bahama]"




bahaams


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
725,14038328,Elizabeth Avenue and Shirley Street; Union Court Building; Suite E-2; N-8188; Nassau; Bahaams,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,elizabeth avenue and shirley street union court building suite e 2 n 8188 nassau bahaams,"[elizabeth, avenue, and, shirley, street, union, court, building, suite, e, 2, n, 8188, nassau, bahaams]"




bahams


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
274,24000275,"P.O. BOX N 8680, NASSAU, BAHAMS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n 8680 nassau bahams,"[po, box, n, 8680, nassau, bahams]"
559,14018044,4TH FLOOR THE BAHAMAS FINANCIAL CENTRE SHIRLEY & CHARLOTTE STREET NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,fourth floor the bahamas financial centre shirley and charlotte street nassau bahams,"[fourth, floor, the, bahamas, financial, centre, shirley, and, charlotte, street, nassau, bahams]"
631,14030207,BAHAMS FINANCILA CENTRE PO BOX N-3023 SHIRLEY & CHARLOTTE STREETSNASSAU BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahams financila centre po box n3023 shirley and charlotte street nassau bahamas,"[bahams, financila, centre, po, box, n3023, shirley, and, charlotte, street, nassau, bahamas]"
826,14050608,MOSSACK FONSECA & CO (BAHAMS) LIMITED SAFFREY SQUARE; SUITE 205; BANK LANE; P.O.BOX N-8188; NASSAU; COMMONWEALTH OF THE BAHAMAS.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,mossack fonseca and co bahams limited saffrey square suite 205 bank lane po box n8188 nassau commonwealth of the bahamas,"[mossack, fonseca, and, co, bahams, limited, saffrey, square, suite, 205, bank, lane, po, box, n8188, nassau, commonwealth, of, the, bahamas]"
867,14064246,P.O.BOX N-3944; PROVIDENCE HOUSE; EAST HILL STREET; NASSAU; BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n3944 providence house east hill street nassau bahams,"[po, box, n3944, providence, house, east, hill, street, nassau, bahams]"
889,14064268,P O BOX N8188 NASSAU BAHAMS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n8188 nassau bahams,"[po, box, n8188, nassau, bahams]"
1910,33000104,"NASSAU, BAHAMS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,nassau bahams,"[nassau, bahams]"




bahamaspo


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
878,14064257,P.O. Box N-7768; Nassau; BahamasP.O. Box N-7768; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n7768 nassau bahamaspo box n7768 nassau bahamas,"[po, box, n7768, nassau, bahamaspo, box, n7768, nassau, bahamas]"




ahamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
576,14025414,ahamas Financial Centre; 4th Floor; Shirley & Charlotte Street; Nassau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,ahamas financial centre fourth floor shirley and charlotte street nassau bahamas,"[ahamas, financial, centre, fourth, floor, shirley, and, charlotte, street, nassau, bahamas]"




bahanas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
875,14064254,P.O. Box N-7757; East Bay Street; Nassau; Bahanas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,po box n7757 east bay street nassau bahanas,"[po, box, n7757, east, bay, street, nassau, bahanas]"




baham


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1930,33000127,"CHANCERY COURT, THE MALL PO BOX F-42519 FREEPORT, GRAND BAHAM BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,chancery court the mall po box f42519 freeport grand baham bahamas,"[chancery, court, the, mall, po, box, f42519, freeport, grand, baham, bahamas]"




bahmas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
563,14018385,50 Shirley Street; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,50 shirley street nassau bahmas,"[50, shirley, street, nassau, bahmas]"
668,14033053,c/o Morgan Trust Company of The Bahamas Limited P.O. Box N-4899; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,co morgan trust company of the bahamas limited po box n4899 nassau bahmas,"[co, morgan, trust, company, of, the, bahamas, limited, po, box, n4899, nassau, bahmas]"
749,14042830,FOURTH FLOOR; THE BAHAMAS FINANCIAL CENTRE; SHIRLEY & CHARLOTTE STREETS; P.O.BOX N-3023; NASSAU; BAHMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,fourth floor the bahamas financial centre shirley and charlotte street po box n3023 nassau bahmas,"[fourth, floor, the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassau, bahmas]"
932,14077074,Saffrey Square; Suite 205; Bank Lane; P.O. Box N-8188; Nassau; Bahmas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,saffrey square suite 205 bank lane po box n8188 nassau bahmas,"[saffrey, square, suite, 205, bank, lane, po, box, n8188, nassau, bahmas]"
1152,14085026,"WEST BAY STREET NASSAU, BAHMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,west bay street nassau bahmas,"[west, bay, street, nassau, bahmas]"




bah


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
495,24000496,"SHIRLEY & CHARLOTTE STS BAH. FIN. CENTRE, P.O. BOX SS-6373, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,shirley and charlotte street bah fin centre po box ss6373 nassau bahamas,"[shirley, and, charlotte, street, bah, fin, centre, po, box, ss6373, nassau, bahamas]"
760,14043538,GOODMAN S BAY CORPORATE CENTER WEST BAY STREET NASSAU BAH,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,goodman s bay corporate center west bay street nassau bah,"[goodman, s, bay, corporate, center, west, bay, street, nassau, bah]"




brahmas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1120,14080679,"The Brahmas Financial Centre, Shirley and Charlotte Streets P O Box N - 3023 Nassau Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the brahmas financial centre shirley and charlotte street po box n 3023 nassau bahamas,"[the, brahmas, financial, centre, shirley, and, charlotte, street, po, box, n, 3023, nassau, bahamas]"




bhamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
390,24000391,"P.O. BOX N-4485, NASSAU BHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n4485 nassau bhamas,"[po, box, n4485, nassau, bhamas]"




abahamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1901,33000091,NEW PROVIDENCE ABAHAMAS,,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,new providence abahamas,"[new, providence, abahamas]"




ba


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
833,14051200,Nassau-BA-Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,nassau ba bahamas,"[nassau, ba, bahamas]"
951,14077696,"Sede Nassau-BA (capital), Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,sede nassau ba capital bahamas,"[sede, nassau, ba, capital, bahamas]"




bazaar


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
549,14012324,2nd Floor; International Bazaar; Bay Street; P.O. Box N- 1612; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,second floor international bazaar bay street po box n 1612 nassau bahamas,"[second, floor, international, bazaar, bay, street, po, box, n, 1612, nassau, bahamas]"




bosham


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
199,24000200,"#6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS P.O. BOX SP 63801, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,6 bosham close camperdown heights po box sp 63801 nassau bahamas,"[6, bosham, close, camperdown, heights, po, box, sp, 63801, nassau, bahamas]"
1887,33000077,"#6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS PO BOX SP 63801, NASSAU, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,6 bosham close camperdown heights po box sp 63801 nassau bahamas,"[6, bosham, close, camperdown, heights, po, box, sp, 63801, nassau, bahamas]"
2180,240003759,"NO. 6 BOSHAM CLOSE, CAMPERDOWN HEIGHTS NEW PROVIDENCE BAHAMAS",,Bahamas,BHS,"Pandora Papers - Alemán, Cordero, Galindo & Lee (Alcogal)",Provider data is current through 2018,,no 6 bosham close camperdown heights new providence bahamas,"[no, 6, bosham, close, camperdown, heights, new, providence, bahamas]"




hamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
2243,240492153,"MONTAGUE STERLING CENTRE. EAST BAY STREET, NASSAU, HAMAS, SWITZERLAND, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montague sterling centre east bay street nassau hamas switzerland bahamas,"[montague, sterling, centre, east, bay, street, nassau, hamas, switzerland, bahamas]"






### Nassau

In [93]:
# Pairs whose original_value is 'nassau' and whose ratio_score is above 60,
# listed from the highest ratio_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['ratio_score'].gt(60)]
    .sort_values('ratio_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
9116,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
9254,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
9404,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
9460,8,nassau,1225,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
9753,8,nassau,1594,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
9223,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
9431,8,nassau,1195,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
9356,8,nassau,1083,massau,83.333333,90.909091,83.333333,83.333333,88.888889
9393,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
9422,8,nassau,1186,nassao,83.333333,90.909091,83.333333,83.333333,93.333333


In [94]:
# Pairs whose original_value is 'nassau' and whose partial_ratio_score is
# above 70, listed from the highest partial_ratio_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['partial_ratio_score'].gt(70)]
    .sort_values('partial_ratio_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
8660,8,nassau,49,n,28.571429,100.0,28.571429,28.571429,75.0
9420,8,nassau,1184,nassaubahamas,63.157895,100.0,63.157895,63.157895,89.230769
8798,8,nassau,238,a,28.571429,100.0,28.571429,28.571429,72.222222
9001,8,nassau,530,ss,50.0,100.0,50.0,50.0,77.777778
9860,8,nassau,1756,na,50.0,100.0,50.0,50.0,82.222222
9460,8,nassau,1225,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
9254,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
9255,8,nassau,926,as,50.0,100.0,50.0,50.0,77.777778
9296,8,nassau,993,343nassau,80.0,100.0,80.0,80.0,88.888889
8713,8,nassau,123,s,28.571429,100.0,28.571429,28.571429,72.222222


In [100]:
# Pairs whose original_value is 'nassau' and whose token_sort_score is at
# least 80, listed from the highest token_sort_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['token_sort_score'].ge(80)]
    .sort_values('token_sort_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
9116,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
9254,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
9404,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
9460,8,nassau,1225,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
9753,8,nassau,1594,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
9223,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
9431,8,nassau,1195,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
9356,8,nassau,1083,massau,83.333333,90.909091,83.333333,83.333333,88.888889
9393,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
9422,8,nassau,1186,nassao,83.333333,90.909091,83.333333,83.333333,93.333333


In [103]:
# Pairs whose original_value is 'nassau' and whose token_set_score is
# above 75, listed from the highest token_set_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['token_set_score'].gt(75)]
    .sort_values('token_set_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
9116,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
9254,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
9404,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
9460,8,nassau,1225,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
9753,8,nassau,1594,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
9223,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
9431,8,nassau,1195,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
9356,8,nassau,1083,massau,83.333333,90.909091,83.333333,83.333333,88.888889
9393,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
9422,8,nassau,1186,nassao,83.333333,90.909091,83.333333,83.333333,93.333333


In [107]:
# Pairs whose original_value is 'nassau' and whose jaro_winkler_score is
# above 85, listed from the highest jaro_winkler_score down.
(
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['jaro_winkler_score'].gt(85)]
    .sort_values('jaro_winkler_score', ascending=False)
)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
9116,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
9254,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
9404,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
9460,8,nassau,1225,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
9393,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
9431,8,nassau,1195,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
9753,8,nassau,1594,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
9223,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
9422,8,nassau,1186,nassao,83.333333,90.909091,83.333333,83.333333,93.333333
9438,8,nassau,1202,nassan,83.333333,90.909091,83.333333,83.333333,93.333333


In [98]:
# Collect the match_value of every pair whose original_value is 'nassau' and
# whose jaro_winkler_score is above 85, ordered from the highest score down.
word_list = (
    fuzzy_words_df
    .loc[lambda pairs: pairs['original_value'].eq('nassau') & pairs['jaro_winkler_score'].gt(85)]
    .sort_values('jaro_winkler_score', ascending=False)
    .loc[:, 'match_value']
)

# For each collected word, print it and show the rows of df whose
# address_wordlist contains it (Python `in` test), followed by a blank gap.
for word in word_list:
    print(word)
    display(df.loc[df['address_wordlist'].map(lambda tokens: word in tokens)])
    print('\n')

nassaau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
966,14078961,Suite 102; Saffrey Square; Bay Street and Bank Lane Nassaau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassaau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassaau, the, bahamas]"




nasssau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
420,24000421,"3RD FLOOR, GEORGE HOUSE, GEORGE STREET, P.O. BOX N-8159 NASSSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,third floor george house george street po box n8159 nasssau bahamas,"[third, floor, george, house, george, street, po, box, n8159, nasssau, bahamas]"




nassaub


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
583,14026897,"ANSBACHER (BAHAMAS) LIMITED P.O. BOX N 7768 ANSBACHER HOUSE BANK LANE NASSAUB, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,ansbacher bahamas limited po box n 7768 ansbacher house bank lane nassaub bahamas,"[ansbacher, bahamas, limited, po, box, n, 7768, ansbacher, house, bank, lane, nassaub, bahamas]"




nassaus


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1106,14080656,The Bahamas Financial Centre; Shirley & Charlotte Streets; PO Box N-3023; Nassaus; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassaus bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassaus, bahamas]"




nassua


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
933,14077075,SAFFREY SQUARE; SUITE 205; BANK LANE; P.O. BOX N-8188; NASSUA; BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,saffrey square suite 205 bank lane po box n8188 nassua bahamas,"[saffrey, square, suite, 205, bank, lane, po, box, n8188, nassua, bahamas]"
969,14078964,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nassua; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassua the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassua, the, bahamas]"




nassu


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1021,14079979,Suite E-2; Union Court Building; Elizabeth Avenue & Shirley Street; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street nassu bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, nassu, bahamas]"
1117,14080667,The Bahamas Financial Centre; Shirley and Charlotte Streets; P.O. Box N-3023; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassu bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassu, bahamas]"




naussau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1428,252371,"43 Elizabeth Avenue, P.O.Box CB-13022 Naussau Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,43 elizabeth avenue po box cb13022 naussau bahamas,"[43, elizabeth, avenue, po, box, cb13022, naussau, bahamas]"




nasau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
536,14000678,"101 East Hill Street, Nasau Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,101 east hill street nasau bahamas,"[101, east, hill, street, nasau, bahamas]"
612,14030188,Bahamas Financial Centre; Shirley & Charlotte Street; Fourth Flor Nasau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahamas financial centre shirley and charlotte street fourth flor nasau bahamas,"[bahamas, financial, centre, shirley, and, charlotte, street, fourth, flor, nasau, bahamas]"
682,14035228,"CB 11-343 Nasau, Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343 nasau bahamas,"[cb, 11, 343, nasau, bahamas]"
724,14038327,Elizabeth Avenue and Shirley Street; Union Court Building; Suite E-2; N-8188; Nasau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,elizabeth avenue and shirley street union court building suite e 2 n 8188 nasau bahamas,"[elizabeth, avenue, and, shirley, street, union, court, building, suite, e, 2, n, 8188, nasau, bahamas]"
965,14078960,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nasau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nasau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nasau, the, bahamas]"
1440,239867,"UBS Trustees (Bahamas) Ltd, UBS House, East Bay Street, P. O. Box N-7757, Nasau, Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,ubs trustees bahamas ltd ubs house east bay street p o box n7757 nasau bahamas,"[ubs, trustees, bahamas, ltd, ubs, house, east, bay, street, p, o, box, n7757, nasau, bahamas]"




nassao


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
999,14079956,Suite E-2; Union Court Buiding; Elizabeth Avenue and Shirley Streer; P.O. Box N-8188; Nassao; Bahamas.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e2 union court buiding elizabeth avenue and shirley streer po box n8188 nassao bahamas,"[suite, e2, union, court, buiding, elizabeth, avenue, and, shirley, streer, po, box, n8188, nassao, bahamas]"




nassan


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1043,14080003,Suite E-2; Union Court Building; Elizabeth Avenue and Shirley Street; PO Box 8188; Nassan; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street po box 8188 nassan bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, po, box, 8188, nassan, bahamas]"
1050,14080011,Suite E-2; Union Court Building; Elizabeth Avenue and Shirly Street; Nassan; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirly street nassan the bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirly, street, nassan, the, bahamas]"




nassaubahamas


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
997,14079954,SUITE E.2UNION COURTBLDG ELIZABETH AVENUEAND SHIRLEY STREET NASSAUBAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2union courtbldg elizabeth avenueand shirley street nassaubahamas,"[suite, e, 2union, courtbldg, elizabeth, avenueand, shirley, street, nassaubahamas]"
1015,14079973,Suite E-2 Union Court Building; Elizabeth Ave and Shirley Street; PO Box N-8188; NASSAU-BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e2 union court building elizabeth avenue and shirley street po box n8188 nassaubahamas,"[suite, e2, union, court, building, elizabeth, avenue, and, shirley, street, po, box, n8188, nassaubahamas]"
2126,120004587,"C/O SWISS BANK CORP., CLAUGHTON HOUSE, P.O. BOX N-7757, NASSAU/BAHAMAS","C/O SWISS BANK CORP., CLAUGHTON HOUSE, P.O. BOX N-7757, NASSAU/BAHAMAS",Bahamas,BHS,Paradise Papers - Barbados corporate registry,Barbados corporate registry data is current through 2016,,co swiss bank corp claughton house po box n7757 nassaubahamas,"[co, swiss, bank, corp, claughton, house, po, box, n7757, nassaubahamas]"




massau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
810,14049672,"MASSAU, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,massau bahamas,"[massau, bahamas]"




343nassau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
681,14035227,CB 11.343/Nassau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343nassau bahamas,"[cb, 11, 343nassau, bahamas]"




nash


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
753,14043184,GAN EDEN; NASH BUILDING; BAYVIEW DRIVE EAST; PARADISE ISLAND; BAHAMAS.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,gan eden nash building bayview drive east paradise island bahamas,"[gan, eden, nash, building, bayview, drive, east, paradise, island, bahamas]"
754,14043185,Gan Eden; Nash Building; Bayview Drive East; Paradise Island; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,gan eden nash building bayview drive east paradise island nassau bahamas,"[gan, eden, nash, building, bayview, drive, east, paradise, island, nassau, bahamas]"
755,14043186,GAN EDEN NASH BUILDING; BAYVIEW DRIVE; EAST PARADISE ISLAND SS19098 BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,gan eden nash building bayview drive east paradise island ss19098 bahamas,"[gan, eden, nash, building, bayview, drive, east, paradise, island, ss19098, bahamas]"




massa


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1150,14083823,VIA BIGINI; 43; I-54100 MASSA,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,via bigini 43 i 54100 massa,"[via, bigini, 43, i, 54100, massa]"




na


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1632,81041437,Highland Park; N/A Nassau; New Providence; Bahamas,Highland Park,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,highland park na nassau new providence bahamas,"[highland, park, na, nassau, new, providence, bahamas]"




sans


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
1744,81081901,Sans Souci; Nassau; New Providence; Bahamas,Sans Souci,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,sans souci nassau new providence bahamas,"[sans, souci, nassau, new, providence, bahamas]"




ss


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
287,24000288,"P.O. BOX SS 19051, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box ss 19051 nassau bahamas,"[po, box, ss, 19051, nassau, bahamas]"
349,24000350,"35 DURHAM STREET OFF MONTROSE AVENUE, P.O. BOX SS 6171, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,35 durham street off montrose avenue po box ss 6171 nassau bahamas,"[35, durham, street, off, montrose, avenue, po, box, ss, 6171, nassau, bahamas]"
355,24000356,"#8 GROSVENOR CLOSE, P.O. BOX SS 6502, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,8 grosvenor close po box ss 6502 nassau bahamas,"[8, grosvenor, close, po, box, ss, 6502, nassau, bahamas]"
454,24000455,"RUBY HOUSE 37 WULFF ROAD, P.O. BOX SS 5342, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,ruby house 37 wulff road po box ss 5342 nassau bahamas,"[ruby, house, 37, wulff, road, po, box, ss, 5342, nassau, bahamas]"
788,14047561,"L&L CORPORATE & LEGAL SERVICES, EAST BAY STREET P O BOX SS 19812 NASSAU BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,landl corporate and legal services east bay street po box ss 19812 nassau bahamas,"[landl, corporate, and, legal, services, east, bay, street, po, box, ss, 19812, nassau, bahamas]"
1668,81056994,P.O. Box ss - 5539; Nassau; Bahamas,P.O. Box ss - 5539,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,po box ss 5539 nassau bahamas,"[po, box, ss, 5539, nassau, bahamas]"
1688,81062872,Eastern Road; Sea Spay; Nassau ss+ 19520; Bahamas,Eastern Road,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,eastern road sea spay nassau ss 19520 bahamas,"[eastern, road, sea, spay, nassau, ss, 19520, bahamas]"
1743,81081828,Centre of Commerce; 2nd Floor; 1 Bay Street; PO Box SS 6289; Nassau; Bahamas,Centre of Commerce,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,centre of commerce second floor 1 bay street po box ss 6289 nassau bahamas,"[centre, of, commerce, second, floor, 1, bay, street, po, box, ss, 6289, nassau, bahamas]"
1756,81084980,PO Box SS 5985; 8 Brace Ridge Road; Nassau; New Providence; Bahamas,PO Box SS 5985,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,po box ss 5985 8 brace ridge road nassau new providence bahamas,"[po, box, ss, 5985, 8, brace, ridge, road, nassau, new, providence, bahamas]"




as


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
585,14028452,as trustees of the Wave Trust - Charlotte House; Charlotte Street; P.O. Box N-65; Nassau,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,as trustees of the wavenue trust charlotte house charlotte street po box n65 nassau,"[as, trustees, of, the, wavenue, trust, charlotte, house, charlotte, street, po, box, n65, nassau]"
773,14043947,Guernsey as trustees of the Archon Trust (Bahamas),,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,guernsey as trustees of the archon trust bahamas,"[guernsey, as, trustees, of, the, archon, trust, bahamas]"
778,14045483,Internally Registered as Holder: DITEL; INC.; registered address: Saffrey Square; Suite 205; Bank Lane; P.O. Box # 8188; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,internally registered as holder ditel inc registered address saffrey square suite 205 bank lane po box 8188 nassau bahamas,"[internally, registered, as, holder, ditel, inc, registered, address, saffrey, square, suite, 205, bank, lane, po, box, 8188, nassau, bahamas]"
1193,14087509,(THE PRIVATE CORPORATION LIMITED AS TRUSTEE OF THE MANTOLINE TRUST)Charlotte House; Charlotte Street; P.O. Box N-65; Nassau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the private corporation limited as trustee of the mantoline trustcharlotte house charlotte street po box n65 nassau bahamas,"[the, private, corporation, limited, as, trustee, of, the, mantoline, trustcharlotte, house, charlotte, street, po, box, n65, nassau, bahamas]"
1580,81028921,As Trustee of The Tuleu Family Settlement; 3 Floor; Scotiabank Building; Rawson Square; Nassau; Bahamas,As Trustee of The Tuleu Family Settlement,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,as trustee of the tuleu family settlement 3 floor scotiabank building rawson square nassau bahamas,"[as, trustee, of, the, tuleu, family, settlement, 3, floor, scotiabank, building, rawson, square, nassau, bahamas]"




n


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
9,24000010,"GROUND FLOOR, GOODMAN'S BAY CORPORATE CE, P.O. BOX N 3933, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,ground floor goodmans bay corporate ce po box n 3933 nassau bahamas,"[ground, floor, goodmans, bay, corporate, ce, po, box, n, 3933, nassau, bahamas]"
23,24000024,"P.O. BOX N 529, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n 529 nassau bahamas,"[po, box, n, 529, nassau, bahamas]"
40,24000041,"ONE MONTAGUE PLACE, P.O. BOX. N-4906, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,one montague place po box n 4906 nassau bahamas,"[one, montague, place, po, box, n, 4906, nassau, bahamas]"
78,24000079,"#10 PETRONA HOUSE, FOWLER ST. EAST, P.O. BOX N 1375, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,10 petrona house fowler street east po box n 1375 nassau bahamas,"[10, petrona, house, fowler, street, east, po, box, n, 1375, nassau, bahamas]"
82,24000083,"P.O. BOX N 4818, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n 4818 nassau bahamas,"[po, box, n, 4818, nassau, bahamas]"
...,...,...,...,...,...,...,...,...,...,...
2245,240492204,"J.P. MORGAN TRUST COMPANY (BAHAMAS) LIMITED, 2ND FLOOR BAHAMAS FINANCIAL CENTRE, SHIRLEY AND CHARLOTTE STREET, NASSAU, N-4899, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2017,,j p morgan trust company bahamas limited second floor bahamas financial centre shirley and charlotte street nassau n 4899 bahamas,"[j, p, morgan, trust, company, bahamas, limited, second, floor, bahamas, financial, centre, shirley, and, charlotte, street, nassau, n, 4899, bahamas]"
2247,240492217,"J.P. MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 bahamas,"[j, p, morgan, trust, company, bahamas, limited, nassau, n, 4899, bahamas]"
2251,240492292,"J.P. MORGAN TRUST COMPANY (BAHAMAS) LIMITED, 2ND FLOOR BAHAMAS FINANCIAL CENTRE, NASSAU, N-4899, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2017,,j p morgan trust company bahamas limited second floor bahamas financial centre nassau n 4899 bahamas,"[j, p, morgan, trust, company, bahamas, limited, second, floor, bahamas, financial, centre, nassau, n, 4899, bahamas]"
2253,240492525,"J.P.MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, ZH, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 zh bahamas,"[j, p, morgan, trust, company, bahamas, limited, nassau, n, 4899, zh, bahamas]"






In [108]:
# Scratch token list for trying out list slicing on a tokenised address.
test_list = [
    'suite', 'e', '2', 'union', 'court', 'building',
    'elizabeth', 'avenue', 'and', 'shirly', 'street',
    'nassan', 'the', 'bahamas',
]

In [109]:
# Preview the token list with its final element (here 'bahamas') dropped.
test_list[:-1]

['suite',
 'e',
 '2',
 'union',
 'court',
 'building',
 'elizabeth',
 'avenue',
 'and',
 'shirly',
 'street',
 'nassan',
 'the']

In [113]:
# Rows whose token list contains the exact token 'bah' (list membership,
# so longer tokens such as 'bahamas' do not count).
df.loc[df['address_wordlist'].map(lambda tokens: 'bah' in tokens)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist
495,24000496,"SHIRLEY & CHARLOTTE STS BAH. FIN. CENTRE, P.O. BOX SS-6373, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,shirley and charlotte street bah fin centre po box ss6373 nassau bahamas,"[shirley, and, charlotte, street, bah, fin, centre, po, box, ss6373, nassau, bahamas]"
760,14043538,GOODMAN S BAY CORPORATE CENTER WEST BAY STREET NASSAU BAH,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,goodman s bay corporate center west bay street nassau bah,"[goodman, s, bay, corporate, center, west, bay, street, nassau, bah]"


In [120]:
# Allow up to 70 rows in rendered output before pandas truncates it.
pd.options.display.max_rows = 70

In [121]:
# Frequency of the final token of every address token list.
(
    df['address_wordlist']
    .map(lambda tokens: tokens[-1])
    .value_counts()
)

bahamas          2040
bahama             66
nassau             54
freeport            5
street              5
bahmas              5
bahams              5
i                   4
providence          3
nassaubahamas       3
centre              3
lane                3
343                 3
kln                 2
abaco               2
bahamas1            2
bhs                 2
kelty               2
2423527291          2
prat                1
lotmore             1
bay                 1
esquare             1
massa               1
charlotte           1
n3023               1
co                  1
cb11323             1
isle                1
bahamasa            1
sandyport           1
abahamas            1
bahamas6            1
cay                 1
44311               1
cb1911              1
building            1
99001               1
23                  1
eleuthera           1
49539               1
bahamaas            1
12062001            1
square              1
ss5539              1
n10700    

In [124]:
# Frequency of the final token for addresses whose normalised text ends in a
# "bahamas"-like word: 'b', optional 'a', 'h', one or more word characters,
# 's', then at most one stray word character and one stray digit before the
# end of the string. The pattern is not anchored at a word start, so tokens
# such as 'nassaubahamas' or 'abahamas' match as well.
#
# The pattern is a raw string because `\w` and `\d` are regex classes, not
# Python string escapes; a plain literal triggers an invalid-escape warning.
# `na=False` treats a missing working_address as "no match" so the boolean
# mask never contains NaN.
df.loc[
    df['working_address'].str.contains(r'ba?h\w+s\w?\d?$', regex=True, na=False),
    'address_wordlist',
].apply(lambda x: x[-1]).value_counts()

bahamas          1954
bahams              5
bahmas              5
nassaubahamas       3
bahamas1            2
bhamas              1
bahamasc            1
bahaams             1
bahamaas            1
bahanas             1
bahamas6            1
bahamasa            1
abahamas            1
Name: address_wordlist, dtype: int64

In [None]:
def get_set_ratio(row, str_to_match):
    """Return ``fuzz.token_set_ratio`` of the row's ``address`` against ``str_to_match``.

    ``row`` only needs to support ``row['address']``, which makes the function
    usable as a row-wise callback (e.g. ``DataFrame.apply(..., axis=1)``).
    """
    return fuzz.token_set_ratio(row['address'], str_to_match)

def get_sort_ratio(row, str_to_match):
    """Return ``fuzz.token_sort_ratio`` of the row's ``address`` against ``str_to_match``.

    ``row`` only needs to support ``row['address']``, which makes the function
    usable as a row-wise callback (e.g. ``DataFrame.apply(..., axis=1)``).
    """
    return fuzz.token_sort_ratio(row['address'], str_to_match)

In [None]:
# Near-duplicate scan: score every row against each distinct normalised
# address and display the groups in which more than one row scores above
# the threshold.
#
# NOTE(review): the candidates come from `working_address`, but
# get_sort_ratio compares them with the raw `address` column — confirm that
# mix is intended.
# NOTE(review): this re-scores the whole frame once per distinct address, so
# the run time grows with rows x distinct addresses.
SCORE_THRESHOLD = 80  # only rows scoring strictly above this are reported

# Missing addresses give nothing to compare against, so skip them.
u_addresses = df['working_address'].dropna().unique()
for u_address in u_addresses:
    df['score'] = df.apply(get_sort_ratio, args=(u_address,), axis=1)
    # Filter the frame that was just scored. The original filtered `sm_df`,
    # which this cell never scores, so its results could not reflect `u_address`.
    match_df = df[df['score'] > SCORE_THRESHOLD].sort_values('score', ascending=False)
    if match_df.shape[0] > 1:
        print(u_address)
        display(match_df)
        print('\n')

# Fuzzy Resources

- [Fuzzing matching in pandas with fuzzywuzzy](https://jonathansoma.com/lede/algorithms-2017/classes/fuzziness-matplotlib/fuzzing-matching-in-pandas-with-fuzzywuzzy/)
- [Best Libraries for Fuzzy Matching In Python](https://medium.com/codex/best-libraries-for-fuzzy-matching-in-python-cbb3e0ef87dd)
- [Fuzzy String Matching](https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe)
- [Fuzzy String Comparison](https://stackoverflow.com/a/28467760)
- [How to do Fuzzy Matching on Pandas Dataframe Column Using Python?](https://www.geeksforgeeks.org/how-to-do-fuzzy-matching-on-pandas-dataframe-column-using-python/)
<!-- TODO: add further fuzzy-matching resources here -->