# NLP 08: Parse with Fuzzy Matches

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
import sys
import os
import io

import string
import re
# import itertools
# import nltk
# nltk.download('stopwords')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz as rfuzz
import jaro

In [2]:
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 0
        freq_dict[ngram] +=1
    return freq_dict

def calc_fuzz_df(df, column):
    row_list = []
    
    for o_i, o_v in enumerate(df[column].sort_index()):
        for m_i, m_v in enumerate(df[column].sort_index()):
            if o_i != m_i:
                dict1 = {
                    'original_index': o_i,
                    'original_value': o_v,
                    'match_index': m_i,
                    'match_value': m_v,
                    'ratio_score': rfuzz.ratio(o_v, m_v),
                    'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
                    'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
                    'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
                    'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
                }
                if (dict1['ratio_score']>60) | (dict1['partial_ratio_score']>60) | (dict1['token_sort_score']>60) | (dict1['token_set_score']>60) | (dict1['jaro_winkler_score']>0.6):
                    row_list.append(dict1)
    score_df = pd.DataFrame(row_list)
        
    return score_df

In [3]:
df = pd.read_csv('data/parsed_bahamas_addresses.csv')

df['address_wordlist'] = df['working_address'].fillna('').str.split()

freq_df = pd.DataFrame.from_dict(
    frequency_ct(df['address_wordlist'].sum()
                ), orient='index').reset_index().rename(
    columns={'index':'word', 0:'count'}).sort_values('count', ascending=False)

In [4]:
freq_df.head(10)

Unnamed: 0,word,count
9,bahamas,2324
8,nassau,2043
6,box,1484
5,po,1430
4,street,1128
2,and,627
3,shirley,489
10,suite,447
34,bay,431
14,building,329


In [5]:
fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
fuzzy_words_df['jaro_winkler_score'] = fuzzy_words_df['jaro_winkler_score']*100

#### Cities

- Nassau
- Freeport
- Marsh Harbour
- Governor's Harbour
- Windermere island
- Harbour Island
- Elbow Cay
- Treasure Cay
- Gregory Town
- Spanish Wells

#### Islands

- New Providence
- Paradise Island
- Grand Bahama
- Abaco
- Eleuthera
- South Andros

In [6]:
df['address_wordlist'].str[-1].value_counts()[:20]

bahamas       2043
bahama          66
nassau          54
freeport         5
street           5
bahmas           5
bahams           5
i                4
providence       3
centre           3
343              3
lane             3
kelty            2
bahamas1         2
kln              2
2423527291       2
bhs              2
abaco            2
isle             1
esquare          1
Name: address_wordlist, dtype: int64

## Bahamas

Remove "bahamas" and the misspellings from the last word in the word list. I'm removing the last word only so that I don't accidentally take "bahamas" out of phrases like "Bahamas Financial Centre." I want to try to separate out such building names later.

This lists the "bahamas" matches from the fuzzy matching

In [7]:
bah_fuzz_list = fuzzy_words_df[(fuzzy_words_df['original_value']=='bahamas') & (fuzzy_words_df['ratio_score']>75)].sort_values(['ratio_score', 'original_value'], ascending=False)['match_value'].to_list()
bah_fuzz_list

['bahamasc',
 'bahamas1',
 'bahamaas',
 'bahamas6',
 'bahamasa',
 'abahamas',
 'bahama',
 'bahams',
 'bhamas',
 'bahmas',
 'ahamas',
 'bahamaspo',
 'bahaams',
 'bahanas',
 'brahmas',
 'baham',
 'hamas']

These are the matches I can get with a regular expression on the last word of the address word list.

In [8]:
bah_reg_list = df.loc[df['working_address'].str.contains(r'ba?h\w+s\w?\d?$', regex=True), 'address_wordlist'].str[-1].unique().tolist()
bah_reg_list

['bahamas',
 'bahams',
 'bhamas',
 'bahmas',
 'bahamasc',
 'bahamas1',
 'bahaams',
 'bahamaas',
 'bahanas',
 'bahamas6',
 'bahamasa',
 'abahamas']

One thing to keep in mind is that the fuzzy matches are from all unique words, which means not all of them are necessarily going to be present in the last word.

In [9]:
set(bah_fuzz_list) - set(bah_reg_list)

{'ahamas', 'baham', 'bahama', 'bahamaspo', 'brahmas', 'hamas'}

In [10]:
set(bah_reg_list) - set(bah_fuzz_list) 

{'bahamas'}

In [11]:
pd.set_option('display.max_colwidth', 1000)

In [12]:
df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in bah_reg_list else x)

0                                                                                    [annex, frederick, and, shirley, street, po, box, n4805, nassau]
1                                                                                         [suite, e2, union, court, building, po, box, n8188, nassau]
2                                                                                           [lyford, cay, house, lyford, cay, po, box, n7785, nassau]
3                                               [po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]
4                                                                             [lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]
                                                                            ...                                                                      
2253                                                                            [j, p, morgan, trust

In [13]:
df['address_country'] = df['address_wordlist'].apply(lambda x:'bahamas' if x[-1] in bah_reg_list else np.nan)
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x[:-1] if x[-1] in bah_reg_list else x)

In [14]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805, nassau]",bahamas
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188, nassau]",bahamas
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785, nassau]",bahamas
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street, nassau]",bahamas
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024, nassau]",bahamas


## The

"the" shows up a surprising amount. It probably isn't relevant, but I prefer to double check.

In [15]:
df['address_wordlist'].str[-1].value_counts()

nassau        1815
providence      78
bahama          76
the             69
freeport        16
              ... 
n08188           1
square           1
islands          1
ba               1
cb12399          1
Name: address_wordlist, Length: 110, dtype: int64

I can see that when "the" is the second to last word it follows "bahamas" as in "the bahamas." As such, I can safely remove it.

In [16]:
pd.set_option('display.max_rows', 70)

In [17]:
df['working_address'].str.split().apply(lambda x: x[-2:]).value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[nassau, bahamas]        1743
[providence, bahamas]      74
[the, bahamas]             68
[grand, bahama]            54
[abaco, bahamas]           13
                         ... 
[n, 8188]                   1
[nassau, bahaams]           1
[343nassau, bahamas]        1
[generals, dept]            1
[cb12399, bahamas]          1
Name: working_address, Length: 176, dtype: int64

After removing the "bahamas" like values, there is now at least one empty list. From here on out I'll need to account for empty lists.

In [18]:
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x if not x else x[:-1] if x[-1] in ['the'] else x)

In [19]:
df['address_wordlist'].str[-1].value_counts()

nassau        1854
providence      89
bahama          77
freeport        16
abaco           15
              ... 
n08188           1
square           1
islands          1
ba               1
cb12399          1
Name: address_wordlist, Length: 111, dtype: int64

## Nassau

In [20]:
fuzzy_words_df[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>70)].sort_values(['ratio_score', 'match_value'], ascending=False)

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
1236,8,nassau,1593,naussau,92.307692,83.333333,92.307692,92.307692,96.190476
1188,8,nassau,698,nasssau,92.307692,83.333333,92.307692,92.307692,97.142857
1220,8,nassau,1224,nassaus,92.307692,100.0,92.307692,92.307692,97.142857
1198,8,nassau,925,nassaub,92.307692,100.0,92.307692,92.307692,97.142857
1214,8,nassau,1160,nassaau,92.307692,90.909091,92.307692,92.307692,97.142857
1216,8,nassau,1194,nassu,90.909091,88.888889,90.909091,90.909091,96.666667
1196,8,nassau,872,nasau,90.909091,80.0,90.909091,90.909091,96.111111
1212,8,nassau,1146,nassua,83.333333,90.909091,83.333333,83.333333,96.666667
1215,8,nassau,1185,nassao,83.333333,90.909091,83.333333,83.333333,93.333333
1217,8,nassau,1201,nassan,83.333333,90.909091,83.333333,83.333333,93.333333


In [21]:
nas_ratio_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>70), 'match_value'].to_list()
nas_jaro_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['jaro_winkler_score']>85), 'match_value'].to_list()
pd.DataFrame([nas_ratio_list, nas_jaro_list]).T

Unnamed: 0,0,1
0,nasssau,nasssau
1,nasau,nasau
2,nassaub,nassaub
3,343nassau,343nassau
4,massau,massau
5,nassua,nassua
6,nassaau,nassaau
7,nassao,nassao
8,nassu,nassu
9,nassan,nassan


### Setting the threshold

The options above 79 look good for the `ratio` look good. In examining the values in the data, I see that "massa" is actually an address in Italy and I'll need to separate "343nassau" into two parts to retain the rest of the address.

In [22]:
for word in nas_ratio_list:
    print(word)
    display(df[df['address_wordlist'].apply(lambda x: word in x)])
    print('\n')

nasssau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
420,24000421,"3RD FLOOR, GEORGE HOUSE, GEORGE STREET, P.O. BOX N-8159 NASSSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,third floor george house george street po box n8159 nasssau bahamas,"[third, floor, george, house, george, street, po, box, n8159, nasssau]",bahamas




nasau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
536,14000678,"101 East Hill Street, Nasau Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,101 east hill street nasau bahamas,"[101, east, hill, street, nasau]",bahamas
612,14030188,Bahamas Financial Centre; Shirley & Charlotte Street; Fourth Flor Nasau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,bahamas financial centre shirley and charlotte street fourth flor nasau bahamas,"[bahamas, financial, centre, shirley, and, charlotte, street, fourth, flor, nasau]",bahamas
682,14035228,"CB 11-343 Nasau, Bahamas",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343 nasau bahamas,"[cb, 11, 343, nasau]",bahamas
724,14038327,Elizabeth Avenue and Shirley Street; Union Court Building; Suite E-2; N-8188; Nasau; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,elizabeth avenue and shirley street union court building suite e 2 n 8188 nasau bahamas,"[elizabeth, avenue, and, shirley, street, union, court, building, suite, e, 2, n, 8188, nasau]",bahamas
965,14078960,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nasau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nasau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nasau]",bahamas
1440,239867,"UBS Trustees (Bahamas) Ltd, UBS House, East Bay Street, P. O. Box N-7757, Nasau, Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,ubs trustees bahamas ltd ubs house east bay street p o box n7757 nasau bahamas,"[ubs, trustees, bahamas, ltd, ubs, house, east, bay, street, p, o, box, n7757, nasau]",bahamas




nassaub


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
583,14026897,"ANSBACHER (BAHAMAS) LIMITED P.O. BOX N 7768 ANSBACHER HOUSE BANK LANE NASSAUB, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,ansbacher bahamas limited po box n 7768 ansbacher house bank lane nassaub bahamas,"[ansbacher, bahamas, limited, po, box, n, 7768, ansbacher, house, bank, lane, nassaub]",bahamas




343nassau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
681,14035227,CB 11.343/Nassau Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,cb 11 343nassau bahamas,"[cb, 11, 343nassau]",bahamas




massau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
810,14049672,"MASSAU, BAHAMAS",,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,massau bahamas,[massau],bahamas




nassua


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
933,14077075,SAFFREY SQUARE; SUITE 205; BANK LANE; P.O. BOX N-8188; NASSUA; BAHAMAS,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,saffrey square suite 205 bank lane po box n8188 nassua bahamas,"[saffrey, square, suite, 205, bank, lane, po, box, n8188, nassua]",bahamas
969,14078964,Suite 102; Saffrey Square; Bay Street and Bank Lane; Nassua; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassua the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassua]",bahamas




nassaau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
966,14078961,Suite 102; Saffrey Square; Bay Street and Bank Lane Nassaau; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite 102 saffrey square bay street and bank lane nassaau the bahamas,"[suite, 102, saffrey, square, bay, street, and, bank, lane, nassaau]",bahamas




nassao


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
999,14079956,Suite E-2; Union Court Buiding; Elizabeth Avenue and Shirley Streer; P.O. Box N-8188; Nassao; Bahamas.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e2 union court buiding elizabeth avenue and shirley streer po box n8188 nassao bahamas,"[suite, e2, union, court, buiding, elizabeth, avenue, and, shirley, streer, po, box, n8188, nassao]",bahamas




nassu


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1021,14079979,Suite E-2; Union Court Building; Elizabeth Avenue & Shirley Street; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street nassu bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, nassu]",bahamas
1117,14080667,The Bahamas Financial Centre; Shirley and Charlotte Streets; P.O. Box N-3023; Nassu; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassu bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassu]",bahamas




nassan


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1043,14080003,Suite E-2; Union Court Building; Elizabeth Avenue and Shirley Street; PO Box 8188; Nassan; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirley street po box 8188 nassan bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirley, street, po, box, 8188, nassan]",bahamas
1050,14080011,Suite E-2; Union Court Building; Elizabeth Avenue and Shirly Street; Nassan; The Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,suite e 2 union court building elizabeth avenue and shirly street nassan the bahamas,"[suite, e, 2, union, court, building, elizabeth, avenue, and, shirly, street, nassan]",bahamas




nassaus


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1106,14080656,The Bahamas Financial Centre; Shirley & Charlotte Streets; PO Box N-3023; Nassaus; Bahamas,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,the bahamas financial centre shirley and charlotte street po box n3023 nassaus bahamas,"[the, bahamas, financial, centre, shirley, and, charlotte, street, po, box, n3023, nassaus]",bahamas




massa


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1150,14083823,VIA BIGINI; 43; I-54100 MASSA,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,via bigini 43 i 54100 massa,"[via, bigini, 43, i, 54100, massa]",




naussau


Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country
1428,252371,"43 Elizabeth Avenue, P.O.Box CB-13022 Naussau Bahamas",,Bahamas,BHS,Offshore Leaks,The Offshore Leaks data is current through 2010,,43 elizabeth avenue po box cb13022 naussau bahamas,"[43, elizabeth, avenue, po, box, cb13022, naussau]",bahamas






I went back and updated the original standardization. I left the example where I found it.

In [23]:
df.loc[df['working_address'].str.contains("343nass"), 'working_address'] = 'cb 11 343 nassau bahamas'

In [24]:
nas_fuzz_list = fuzzy_words_df.loc[(fuzzy_words_df['original_value']=='nassau') & (fuzzy_words_df['ratio_score']>80), 'match_value'].to_list()
nas_fuzz_list = nas_fuzz_list + ['nassau']
nas_fuzz_list

['nasssau',
 'nasau',
 'nassaub',
 'massau',
 'nassua',
 'nassaau',
 'nassao',
 'nassu',
 'nassan',
 'nassaus',
 'naussau',
 'nassau']

In [25]:
df['address_city'] = df['address_wordlist'].apply(lambda x: np.nan if not x else 'nassau' if x[-1] in nas_fuzz_list else np.nan)
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x if not x else x[:-1] if x[-1] in nas_fuzz_list else x)

In [26]:
df[df['address_city']=='nassau']

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,"[annex, frederick, and, shirley, street, po, box, n4805]",bahamas,nassau
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,"[suite, e2, union, court, building, po, box, n8188]",bahamas,nassau
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,"[lyford, cay, house, lyford, cay, po, box, n7785]",bahamas,nassau
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,"[po, box, n3708, bahamas, financial, centre, po, box, n3708, shirley, and, charlotte, street]",bahamas,nassau
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,"[lyford, cay, house, third, floor, lyford, cay, po, box, n3024]",bahamas,nassau
...,...,...,...,...,...,...,...,...,...,...,...,...
2226,240491372,"MONTAGUE STERLING CENTRE, EAST BAY STREET, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montague sterling centre east bay street nassau bahamas,"[montague, sterling, centre, east, bay, street]",bahamas,nassau
2227,240491474,"SUITE 200B, 2ND FLOOR, CENTRE OF COMMERCE, ONE BAY STREET, PO BOX N-3944, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,suite 200b second floor centre of commerce one bay street po box n3944 nassau bahamas,"[suite, 200b, second, floor, centre, of, commerce, one, bay, street, po, box, n3944]",bahamas,nassau
2229,240491518,"RBC TRUST COMPANY (BAHAMAS) LIMITED, BAYSIDE EXECUTIVE PARK BUILDING 3, P.O. BOX NO. 30-24, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,rbc trust company bahamas limited bayside executive park building 3 po box no 30 24 nassau bahamas,"[rbc, trust, company, bahamas, limited, bayside, executive, park, building, 3, po, box, no, 30, 24]",bahamas,nassau
2248,240492221,"JPMORGAN TRUST COMPANY (BAHAMAS) LIMITED, 2ND FLOOR, BAHAMAS FINANCIAL CENTRE, SHIRLEY AND CHARLOTTE STREET, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2017,,jpmorgan trust company bahamas limited second floor bahamas financial centre shirley and charlotte street nassau bahamas,"[jpmorgan, trust, company, bahamas, limited, second, floor, bahamas, financial, centre, shirley, and, charlotte, street]",bahamas,nassau


## Next step

The value counts for the last word are now heavily weighted toward PO Box numbers. "n8188" and "8188" refer to the same PO Box. I believe the "cb" and "ss" entries are also PO Boxes.

I'm going to join the working address back into a single string and pull out the PO Boxes.

In [27]:
df['address_wordlist'].str[-1].value_counts()

street        434
n8188         109
providence     93
bahama         77
8188           54
             ... 
cb11103         1
ss19772         1
1462            1
cb13908         1
zh              1
Name: address_wordlist, Length: 665, dtype: int64

In [28]:
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: '' if not x else ' '.join(x))
df

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,annex frederick and shirley street po box n4805,bahamas,nassau
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,suite e2 union court building po box n8188,bahamas,nassau
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,lyford cay house lyford cay po box n7785,bahamas,nassau
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street,bahamas,nassau
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,lyford cay house third floor lyford cay po box n3024,bahamas,nassau
...,...,...,...,...,...,...,...,...,...,...,...,...
2253,240492525,"J.P.MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, ZH, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 zh bahamas,j p morgan trust company bahamas limited nassau n 4899 zh,bahamas,
2254,240492536,"MONTAGNE STERLINE CENTRE. EAST BAV STREET, NASSAU, COUNTRY BAHAMAS, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montagne sterline centre east bav street nassau country bahamas bahamas,montagne sterline centre east bav street nassau country bahamas,bahamas,
2255,240491733,"DELTEC HOUSE, LYFORD CAY, PO BOX N-3229, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,deltec house lyford cay po box n3229 nassau bahamas,deltec house lyford cay po box n3229,bahamas,nassau
2256,240491778,"PROVIDENCE HOUSE, HAST WING, EAST HILL STREET, P.O. BOX CB-12399, NASSAU, CB-12399, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,providence house hast wing east hill street po box cb12399 nassau cb12399 bahamas,providence house hast wing east hill street po box cb12399 nassau cb12399,bahamas,


In [30]:
df[df['address_wordlist'].str.contains('box', regex=True)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,annex frederick and shirley street po box n4805,bahamas,nassau
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,suite e2 union court building po box n8188,bahamas,nassau
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,lyford cay house lyford cay po box n7785,bahamas,nassau
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street,bahamas,nassau
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,lyford cay house third floor lyford cay po box n3024,bahamas,nassau
...,...,...,...,...,...,...,...,...,...,...,...,...
2225,240491356,"P.O. BOX N- 3944, SUITE 200B, 2ND FLOOR, CENTRE OF COMMERCE, ONE BAY STREET, NASSAU, BAHAMAS, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2017,,po box n 3944 suite 200b second floor centre of commerce one bay street nassau bahamas nassau bahamas,po box n 3944 suite 200b second floor centre of commerce one bay street nassau bahamas,bahamas,nassau
2227,240491474,"SUITE 200B, 2ND FLOOR, CENTRE OF COMMERCE, ONE BAY STREET, PO BOX N-3944, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,suite 200b second floor centre of commerce one bay street po box n3944 nassau bahamas,suite 200b second floor centre of commerce one bay street po box n3944,bahamas,nassau
2229,240491518,"RBC TRUST COMPANY (BAHAMAS) LIMITED, BAYSIDE EXECUTIVE PARK BUILDING 3, P.O. BOX NO. 30-24, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,rbc trust company bahamas limited bayside executive park building 3 po box no 30 24 nassau bahamas,rbc trust company bahamas limited bayside executive park building 3 po box no 30 24,bahamas,nassau
2255,240491733,"DELTEC HOUSE, LYFORD CAY, PO BOX N-3229, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,deltec house lyford cay po box n3229 nassau bahamas,deltec house lyford cay po box n3229,bahamas,nassau


In [47]:
df.loc[df['address_wordlist'].str.contains('p?o?\s?box\s?\w+\s?\d+', regex=True), 'address_wordlist']

0                                              annex frederick and shirley street po box n4805
1                                                   suite e2 union court building po box n8188
2                                                     lyford cay house lyford cay po box n7785
3              po box n3708 bahamas financial centre po box n3708 shirley and charlotte street
4                                         lyford cay house third floor lyford cay po box n3024
                                                 ...                                          
2225    po box n 3944 suite 200b second floor centre of commerce one bay street nassau bahamas
2227                    suite 200b second floor centre of commerce one bay street po box n3944
2229       rbc trust company bahamas limited bayside executive park building 3 po box no 30 24
2255                                                      deltec house lyford cay po box n3229
2256                 providence house hast wing ea

In [132]:
pattern2 = '(p?o?\s?b?o?x?\s?box\s?\w?\s?\w+\s?\d+)'
df['po_box'] = df['address_wordlist'].str.extract(pattern2)
df['address_wordlist'] = df['address_wordlist'].str.replace(pattern2, '', regex=True)

df['address_wordlist'] = df['address_wordlist'].str.replace('(po box)', '', regex=True)

pattern = r'(\bpo\s.+\s?\d+)'
df.loc[df['address_wordlist'].str.contains(pattern), 'po_box'
      ] = df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'].str.extract(pattern, expand=False)
df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'
      ] = df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'].str.replace(pattern, '', regex=True)

pattern1 = r'(\bpo\s?b\s?\w\s?\d+\b)'
df.loc[df['address_wordlist'].str.contains(pattern1), 'po_box'
      ] = df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'].str.extract(pattern1, expand=False)
df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'
      ] = df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'].str.replace(pattern1, '', regex=True)

  ] = df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'].str.extract(pattern, expand=False)
  df.loc[df['address_wordlist'].str.contains(pattern), 'po_box'
  ] = df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'].str.replace(pattern, '', regex=True)
  df.loc[df['address_wordlist'].str.contains(pattern), 'address_wordlist'
  ] = df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'].str.extract(pattern1, expand=False)
  df.loc[df['address_wordlist'].str.contains(pattern1), 'po_box'
  ] = df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'].str.replace(pattern1, '', regex=True)
  df.loc[df['address_wordlist'].str.contains(pattern1), 'address_wordlist'


In [134]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city,po_box
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,annex frederick and shirley street,bahamas,nassau,po box n4805
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,suite e2 union court building,bahamas,nassau,po box n8188
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,lyford cay house lyford cay,bahamas,nassau,po box n7785
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,bahamas financial centre shirley and charlotte street,bahamas,nassau,po box n3708
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,lyford cay house third floor lyford cay,bahamas,nassau,po box n3024


## Cities and islands

#### Cities

- [x] Nassau
- [x] Freeport
- Marsh Harbour
- Governor's Harbour
- Windermere island
- Harbour Island
- Elbow Cay
- Treasure Cay
- Gregory Town
- Spanish Wells

#### Islands

- New Providence
- Paradise Island
- Grand Bahama
- Abaco
- Eleuthera
- South Andros

In [140]:
df.loc[df['address_wordlist'].str.contains(r'(\bfreeport\b)'), ['address_city', 'address_island']] = ['freeport', 'grand bahama']
df['address_wordlist'] = df['address_wordlist'].str.replace('(freeport)', '', regex=True)

gr_bah_pat = r'\bgr\w*\sbah\w+\b'
df.loc[df['working_address'].str.contains(gr_bah_pat), 'address_island'] = 'grand bahama'
df.loc[df['working_address'].str.contains(gr_bah_pat), 'address_wordlist'
      ] = df.loc[df['working_address'].str.contains(gr_bah_pat), 'address_wordlist'].str.replace(gr_bah_pat, '', regex=True)
df.loc[df['working_address'].str.contains(gr_bah_pat), 'address_wordlist'
      ] = df.loc[df['working_address'].str.contains(gr_bah_pat), 'address_wordlist'].str.replace('grand', '', regex=True)

  df.loc[df['address_wordlist'].str.contains(r'(\bfreeport\b)'), ['address_city', 'address_island']] = ['freeport', 'grand bahama']


In [164]:
df[df['working_address'].str.contains(r'\bnp\b', regex=True)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city,po_box,address_island
163,24000164,"LYFORD MANOR, WEST BUILDING, WEST BAY ST, P.O. BOX CB-13007, NASSAU, NP, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford manor west building west bay street po box cb13007 nassau np bahamas,lyford manor west building west bay street nassau np,bahamas,,po box cb13007,
517,24000518,"3RD FL. BRITISH COLONIAL CENTRE OF COMM, SUITE 304, 1 BAY STREET, SP 63776, NASSAU, NP, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,third fl british colonial centre of comm suite 304 1 bay street sp 63776 nassau np bahamas,third fl british colonial centre of comm suite 304 1 bay street sp 63776 nassau np,bahamas,,,
1595,81031897,Sassoon House; Shirley St and Victoria Ave; Nassau; NP; Bahamas,Sassoon House,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,sassoon house shirley street and victoria avenue nassau np bahamas,sassoon house shirley street and victoria avenue nassau np,bahamas,,,
1612,81036376,Stanley Eastern Road; P.O. Box 55-5539; Nassau; NP; Bahamas,Stanley Eastern Road,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,stanley eastern road po box 555539 nassau np bahamas,stanley eastern road nassau np,bahamas,,po box 555539,
1623,81039098,Villa Capulet; Montague Foreshore; PO Box N-8893; Nassau NP; Bahamas,Villa Capulet,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,villa capulet montague foreshore po box n8893 nassau np bahamas,villa capulet montague foreshore nassau np,bahamas,,po box n8893,
1733,81080265,Apartment 2; Coral Harbor; New Providence; NP CB115-21; Bahamas,Apartment 2,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,apartment 2 coral harbor new providence np cb115 21 bahamas,apartment 2 coral harbor new providence np cb115 21,bahamas,,,
1739,81080985,Old Fort Bay; P.O. Box N865; NP N 865 Nassau; New Providence; Bahamas,Old Fort Bay,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,old fort bay po box n865 np n 865 nassau new providence bahamas,old fort bay np n 865 nassau new providence,bahamas,,po box n865,
1770,81004778,PTC Management Limited; 2nd Floor ; Charlotte House; Charlotte Street; Nassau NP; Bahamas,PTC Management Limited,Bahamas,BHS,Paradise Papers - Appleby,Appleby data is current through 2014,,ptc management limited second floor charlotte house charlotte street nassau np bahamas,ptc management limited second floor charlotte house charlotte street nassau np,bahamas,,,
1923,33000119,"LYFORD MANOR, WEST BUILDING, WEST BAY ST PO BOX CB-13007, NASSAU, NP, BAHAMAS",,Bahamas,BHS,Paradise Papers - Bahamas corporate registry,Bahamas corporate registry data is current through 2016,,lyford manor west building west bay street po box cb13007 nassau np bahamas,lyford manor west building west bay street nassau np,bahamas,,po box cb13007,
2171,240070266,"OLD FORT BAY, P.O. BOX N7776, NEW PROVIDENCE NP NASSAU BAHAMAS",,Bahamas,BHS,"Pandora Papers - Alemán, Cordero, Galindo & Lee (Alcogal)",Provider data is current through 2017,,old fort bay po box n7776 new providence np nassau bahamas,old fort bay new providence np,bahamas,nassau,po box n7776,


In [163]:
df[df['working_address'].str.contains(r'\bnew\s?pr\w+\b', regex=True)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city,po_box,address_island
183,24000184,"SUITE 1000 NEW PROVIDENCE FINANCIAL CTR., P.O. BOX CR-56766, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite 1000 new providence financial ctr po box cr56766 nassau bahamas,suite 1000 new providence financial ctr,bahamas,nassau,po box cr56766,
235,24000236,"#4 PINEAPPLE GROVE,OLD FORT BAY, NEW PRODIVENCE, BOX SP-60063, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,4 pineapple grove old fort bay new prodivence box sp60063 nassau bahamas,4 pineapple grove old fort bay new prodivence,bahamas,nassau,box sp60063,
457,24000458,"P.O. BOX N-10620, NASSAU, NEW PROVIDENCE",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n10620 nassau new providence,nassau new providence,,,po box n10620,
474,24000475,"P.O. BOX CB-13265, NASSAU. NEW PROVIDENCE",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box cb13265 nassau new providence,nassau new providence,,,po box cb13265,
530,14000073,#1 Venetian Villa N492; Old Fort Day; Nassau; New Providence; Bahamas.,,Bahamas,BHS,Panama Papers,The Panama Papers data is current through 2015,,1 venetian villa n492 old fort day nassau new providence bahamas,1 venetian villa n492 old fort day nassau new providence,bahamas,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2206,240112487,"MILLENIUM, NASSAU, NEW PROVIDENCE, BAHAMAS",,Bahamas,BHS,Pandora Papers - Alpha Consulting,Provider data is current through 2018,,millenium nassau new providence bahamas,millenium nassau new providence,bahamas,,,
2218,240490183,"5 DUNMORE ISLAND, OLD FORT BAY, NASSAU, NEW PROVIDENCE, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2017,,5 dunmore island old fort bay nassau new providence bahamas,5 dunmore island old fort bay nassau new providence,bahamas,,,
2220,240452852,"P O BOX CR56766 STE 875 — NASSAU, BAHAMAS — NEW PROVIDENCE NP CR567-66 - BAHAMAS",,Bahamas,BHS,Pandora Papers - SFM Corporate Services,Provider data is current through 2012,,po box cr56766 ste 875 nassau bahamas new providence np cr567 66 bahamas,ste 875 nassau bahamas new providence np cr567 66,bahamas,,po box cr56766,
2231,240492015,"JPMORGAN TRUST COMPANY (BAHAMAS) LIMITED, 2ND FLOOR, BAHAMAS FINANCIAL CENTRE, SHIRLEY AND CHARLOTTE STREET, NASSAU, N-4899, NEW PROVIDENCE, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,jpmorgan trust company bahamas limited second floor bahamas financial centre shirley and charlotte street nassau n 4899 new providence bahamas,jpmorgan trust company bahamas limited second floor bahamas financial centre shirley and charlotte street nassau n 4899 new providence,bahamas,,,


In [162]:
df[df['address'].str.lower().str.contains('grand island', regex=True)]

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city,po_box,address_island


In [156]:
df

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address,address_wordlist,address_country,address_city,po_box,address_island
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas,annex frederick and shirley street,bahamas,nassau,po box n4805,
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2 union court building po box n8188 nassau bahamas,suite e2 union court building,bahamas,nassau,po box n8188,
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas,lyford cay house lyford cay,bahamas,nassau,po box n7785,
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas,bahamas financial centre shirley and charlotte street,bahamas,nassau,po box n3708,
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas,lyford cay house third floor lyford cay,bahamas,nassau,po box n3024,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2253,240492525,"J.P.MORGAN TRUST COMPANY (BAHAMAS) LIMITED, NASSAU, N-4899, ZH, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,j p morgan trust company bahamas limited nassau n 4899 zh bahamas,j p morgan trust company bahamas limited nassau n 4899 zh,bahamas,,,
2254,240492536,"MONTAGNE STERLINE CENTRE. EAST BAV STREET, NASSAU, COUNTRY BAHAMAS, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,montagne sterline centre east bav street nassau country bahamas bahamas,montagne sterline centre east bav street nassau country bahamas,bahamas,,,
2255,240491733,"DELTEC HOUSE, LYFORD CAY, PO BOX N-3229, NASSAU, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,deltec house lyford cay po box n3229 nassau bahamas,deltec house lyford cay,bahamas,nassau,po box n3229,
2256,240491778,"PROVIDENCE HOUSE, HAST WING, EAST HILL STREET, P.O. BOX CB-12399, NASSAU, CB-12399, BAHAMAS",,Bahamas,BHS,Pandora Papers - Trident Trust,Provider data is current through 2016,,providence house hast wing east hill street po box cb12399 nassau cb12399 bahamas,providence house hast wing east hill street nassau cb12399,bahamas,,po box cb12399,


In [47]:
df['address_wordlist'] = df['address_wordlist'].apply(lambda x: x if not x else x[:-1] if x[-1] in ['the'] else x)