In [1]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein
# !pip install rapidfuzz
# !pip install jaro-winkler

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
import sys
import os
import io

import string
import re
import itertools
import nltk
nltk.download('stopwords')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz as rfuzz
import jaro

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julie.fisher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [181]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)

In [3]:
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 0
        freq_dict[ngram] +=1
    return freq_dict

In [36]:
df = pd.read_csv('data/parsed_bahamas_addresses.csv')

In [37]:
df.shape

(2258, 9)

In [38]:
df.head()

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note,working_address
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,annex frederick and shirley street po box n4805 nassau bahamas
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,suite e2union court building po box n8188 nassau bahamas
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house lyford cay po box n7785 nassau bahamas
3,24000004,"P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,po box n3708 bahamas financial centre po box n3708 shirley and charlotte street nassau bahamas
4,24000005,"LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through early 2016.,,lyford cay house third floor lyford cay po box n3024 nassau bahamas


In [39]:
df['address_wordlist'] = df['working_address'].fillna('').str.split()

In [17]:
freq_df = pd.DataFrame.from_dict(
    frequency_ct(df['address_wordlist'].sum()
                ), orient='index').reset_index().rename(
    columns={'index':'word', 0:'count'}).sort_values('count', ascending=False)

In [40]:
freq_df.shape

(2091, 2)

In [19]:
freq_df.head(60)

Unnamed: 0,word,count
9,bahamas,2311
8,nassau,2009
6,box,1451
5,po,1402
4,street,1045
2,and,612
3,shirley,477
10,suite,447
33,bay,397
13,building,326


## Fuzzy matching

In [20]:
print('Sample 1:', df['working_address'][0])
print('Sample 2:', df['working_address'][1])

Sample 1: annex frederick and shirley street po box n4805 nassau bahamas
Sample 2: suite e2union court building po box n8188 nassau bahamas


### fuzzywuzzy implimentation

In [21]:
print('Ratio:', fuzz.ratio(df['working_address'][0], df['working_address'][1]))
print('Partial ratio:', fuzz.partial_ratio(df['working_address'][0], df['working_address'][1]))
print('Token sort ratio:', fuzz.token_sort_ratio(df['working_address'][0], df['working_address'][1]))
print('Token set ratio:', fuzz.token_set_ratio(df['working_address'][0], df['working_address'][1]))

Ratio: 56
Partial ratio: 58
Token sort ratio: 56
Token set ratio: 61


In [22]:
print('Ratio:', fuzz.ratio(df['working_address'][1], df['working_address'][0]))
print('Partial ratio:', fuzz.partial_ratio(df['working_address'][1], df['working_address'][0]))
print('Token sort ratio:', fuzz.token_sort_ratio(df['working_address'][1], df['working_address'][0]))
print('Token set ratio:', fuzz.token_set_ratio(df['working_address'][1], df['working_address'][0]))

Ratio: 56
Partial ratio: 58
Token sort ratio: 56
Token set ratio: 61


### rapidfuzz implimentation

In [23]:
print('Ratio:', rfuzz.ratio(df['working_address'][0], df['working_address'][1]))
print('Partial ratio:', rfuzz.partial_ratio(df['working_address'][0], df['working_address'][1]))
print('Token sort ratio:', rfuzz.token_sort_ratio(df['working_address'][0], df['working_address'][1]))
print('Token set ratio:', rfuzz.token_set_ratio(df['working_address'][0], df['working_address'][1]))

Ratio: 55.93220338983051
Partial ratio: 66.66666666666667
Token sort ratio: 55.93220338983051
Token set ratio: 61.016949152542374


In [24]:
print('Ratio:', rfuzz.ratio(df['working_address'][1], df['working_address'][0]))
print('Partial ratio:', rfuzz.partial_ratio(df['working_address'][1], df['working_address'][0]))
print('Token sort ratio:', rfuzz.token_sort_ratio(df['working_address'][1], df['working_address'][0]))
print('Token set ratio:', rfuzz.token_set_ratio(df['working_address'][1], df['working_address'][0]))

Ratio: 55.93220338983051
Partial ratio: 66.66666666666667
Token sort ratio: 55.93220338983051
Token set ratio: 61.016949152542374


### Jaro Winkler

In [25]:
print('Jaro winkler 1:', jaro.jaro_winkler_metric(df['working_address'][0], df['working_address'][1]))
print('Jaro winkler 2:', jaro.jaro_winkler_metric(df['working_address'][1], df['working_address'][0]))

Jaro winkler 1: 0.6821556579621095
Jaro winkler 2: 0.6821556579621095


In [28]:
goodmans_series = df.loc[df['working_address'].str.contains('bay corporate'), 'working_address']
goodmans_series

9                                               ground floor goodmans bay corporate ce po box n 3933 nassau bahamas
63                                               goodmans bay corporate centre west bay po box n3015 nassau bahamas
100                                     goodmans bay corporate centre po box cb10976 west bay street nassau bahamas
116                                           goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas
248                              goodmans bay corporate centre po box ss5498 suite 261 west baystreetnassau bahamas
268                                                     goodmans bay corporate centre po box cb12407 nassau bahamas
548                             second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas
756                                      goodman0s bay corporate centre west bay street po box n4938 nassau bahamas
758                                       goodmans bay corporate center 

In [32]:
goodmans_series.shape

(37,)

In [29]:
goodmans_series.iloc[0]

'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'

### process.extract

In [30]:
process.extract(goodmans_series.iloc[0], goodmans_series, limit=10)

[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('second floor goodmans bay corporate centre', 86, 1197),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  86,
  1800),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  86,
  2133),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 81, 268),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  81,
  548),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 81, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  79,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  79,
  2120)]

In [34]:
for scorer in [fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio, jaro.jaro_winkler_metric]:
    print(str(scorer))
    display(process.extract(goodmans_series.iloc[0], goodmans_series, scorer=scorer, limit=40))
    print('\n')

<function ratio at 0x7f8bd069f5e0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 78, 2068),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  77,
  548),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 76, 268),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  76,
  1610),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  76,
  1707),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75,
  2120),
 ('goodmans bay corporate cen



<function partial_ratio at 0x7f8bd069f820>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 87, 2068),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 85, 268),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  63),
 ('second floor goodmans bay corporate centre', 83, 1197),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  2120),
 ('goodmans bay corporate centre west bay street  nassau  bahamas', 78, 765),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  77,
  1707),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  76,
  2133),
 ('goodmans bay corporate centre west bay street nassau the bahamas', 75, 766),
 ('co third fl goodmans bay corporate centre west bay street nassau b



<function token_sort_ratio at 0x7f8bd069fa60>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 75, 268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 75, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  74,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  74,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  73,
  1707),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  72,
  548),
 ('goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  71,
  116),
 ('goodmans bay corporate centre third floor west bay street nassau bahamas',
  71,
  762),
 ('goodman0s bay corporate centre west bay street po box n4938 nassau bahamas',
  70,
  756),
 ('goodmans bay corporate center po 



<function token_set_ratio at 0x7f8bd069fca0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas', 85, 268),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  85,
  548),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  85,
  1800),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas', 85, 2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  81,
  1707),
 ('goodmans bay corporate centre west bay street  nassau  bahamas', 80, 765),
 ('second floor goodmans bay corporate centre', 80, 1197),
 ('goodmans bay corporate centre po box cb10976 west



<function jaro_winkler_metric at 0x7f8bd06a9310>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  0.7761178851505186,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  0.775915536971176,
  2068),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1610),
 ('second floor goodmans bay corporate centre', 0.7660186839291317, 1197),
 ('goodmans bay corporate center west bay street po box n4938 nassau bahamas',
  0.7631546664657964,
  1524),
 ('goodman0s bay corporate centre west bay street po box n4938 nassau bahamas',
  0.7599891722044119,
  756),
 ('goodmans b





In [35]:
for scorer in [rfuzz.ratio, rfuzz.partial_ratio, rfuzz.token_sort_ratio, rfuzz.token_set_ratio, jaro.jaro_winkler_metric]:
    print(str(scorer))
    display(process.extract(goodmans_series.iloc[0], goodmans_series, scorer=scorer, limit=40))
    print('\n')

<cyfunction ratio at 0x7f8bd0653860>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  77.77777777777779,
  2068),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  77.33333333333333,
  548),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  76.19047619047619,
  268),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  76.11940298507463,
  1707),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  75.71428571428571,
  1610),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  75.1879699



<cyfunction partial_ratio at 0x7f8bd0653930>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('second floor goodmans bay corporate centre', 87.5, 1197),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  86.72566371681415,
  2068),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  84.95575221238938,
  268),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.33333333333334,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.33333333333334,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  78.125,
  1707),
 ('goodmans bay corporate centre west bay street  nassau  bahamas',
  77.58620689655173,
  765),
 ('cibc trust company bahamas limited first floor goodmans bay corporate centre west bay street nassau bahamas',
  76.11940298507463,
  2133),
 ('co third fl goodmans bay corporate



<cyfunction token_sort_ratio at 0x7f8bd0653ad0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  74.60317460317461,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  74.60317460317461,
  2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  73.6842105263158,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  73.6842105263158,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  73.13432835820896,
  1707),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  72.48322147651007,
  548),
 ('goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  70.58823529411764,
  116),
 ('goodmans bay corporate centre third floor west bay street nassau bahamas',
  70.50359712230217,
  762),




<cyfunction token_set_ratio at 0x7f8bd0653ba0>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  100.0,
  1975),
 ('second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas',
  85.47008547008546,
  548),
 ('co cotswold group goodmans bay corporate centre second floor po box cb 12762 suite 261 nassau bahamas',
  85.47008547008546,
  1800),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  85.4368932038835,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  85.4368932038835,
  2068),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.01886792452831,
  63),
 ('goodmans bay corporate centre west bay po box n3015 nassau bahamas',
  83.01886792452831,
  2120),
 ('first floor goodmans bay corporate centre bay street nassau bahamas',
  81.13207547169812,
  1707),
 ('second floor goodmans bay corporate centre', 80.0, 1197),
 ('goodmans b



<function jaro_winkler_metric at 0x7f8bd06a9310>


[('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  9),
 ('ground floor goodmans bay corporate ce po box n 3933 nassau bahamas',
  1.0,
  1975),
 ('goodmans bay corporate centre po box cb12407 nassau bahamas',
  0.7761178851505186,
  268),
 ('goodmans bay corporate centre po box cb10976 nassau bahamas',
  0.775915536971176,
  2068),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1414),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1511),
 ('goodmans bay corporate centre west bay street po box n3933 nassau bahamas',
  0.774441468071761,
  1610),
 ('second floor goodmans bay corporate centre', 0.7660186839291317, 1197),
 ('goodmans bay corporate center west bay street po box n4938 nassau bahamas',
  0.7631546664657964,
  1524),
 ('goodman0s bay corporate centre west bay street po box n4938 nassau bahamas',
  0.7599891722044119,
  756),
 ('goodmans b





## Solutioning

I'm trying to use fuzzy matching to identify and resolve duplicates. Having not worked with this before, I want to see the match values so that I can determine an appropriate threshold. To do this, I need to come up with a way to process the values, store the data, and analyize it.

### Metrics

The Goodman's Bay Corporate Centre example above returned the following ranges for each metric which make me think that some metrics are better for certain use cases than others:

**Fuzzywuzzy**

- Ratio: 100 - 58
- Partial ratio: 100 - 51
- Token sort: 100 - 60
- Token set: 100 - 66
- Jaro-Winkler: 1.0 - 0.65

**Rapidfuzz**

- Ratio: 100 - 57.8
- Partial ratio: 100 - 58.6
- Token sort: 100 - 59.8
- Token set: 100 - 65.7
- Jaro-Winkler: 1.0 - 0.65

### Storage format

From the Goodman's Bay Corporate Centre results, the relevant information I think I'll need includes:

<table>
    <tr>
        <td>address_index</td>
        <td>address</td>
        <td>match_index</td>
        <td>match</td>
        <td>ratio_score</td>
        <td>partial_ratio_score</td>
        <td>token_sort_score</td>
        <td>token_set_score</td>
        <td>jaro_winkler_score</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1975</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>100</td>
        <td>100</td>
        <td>100</td>
        <td>100</td>
        <td>1.0</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>2068</td>
        <td>'goodmans bay corporate centre po box cb10976 nassau bahamas'</td>
        <td>78</td>
        <td>87</td>
        <td>75</td>
        <td>85</td>
        <td>0.78</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>548</td>
        <td>'second  floor goodmans bay corporate centre suite 261 po box cb12762 nassau bahamas'</td>
        <td>77</td>
        <td>69</td>
        <td>72</td>
        <td>85</td>
        <td>0.74</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>268</td>
        <td>'goodmans bay corporate centre po box cb12407 nassau bahamas'</td>
        <td>76</td>
        <td>85</td>
        <td>75</td>
        <td>85</td>
        <td>0.78</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1511</td>
        <td>'goodmans bay corporate centre west bay street po box n3933 nassau bahamas'</td>
        <td>76</td>
        <td>70</td>
        <td>70</td>
        <td>79</td>
        <td>0.77</td>
    </tr>
    <tr>
        <td>9</td>
        <td>'ground floor goodmans bay corporate ce po box n 3933 nassau bahamas'</td>
        <td>1707</td>
        <td>'first floor goodmans bay corporate centre bay street nassau bahamas'</td>
        <td>76</td>
        <td>77</td>
        <td>73</td>
        <td>81</td>
        <td>0.73</td>
    </tr>
</table>

The next question is: how do I get all of this information into a dataframe?

The `process.extract` function I tried earlier limits on the number of results returned, not the quality of results. With the address datasets I'll be working with, there are potentially large groups of related data (especially when I get into countries with large representation in the original dataset). Because of this, I won't be using the `process.extract` function. I'm going to prototype using a smaller, easier dataset: the word frequency dataset.

In [45]:
freq_df.sort_index()

Unnamed: 0,word,count
0,annex,8
1,frederick,27
2,and,612
3,shirley,477
4,street,1045
...,...,...
2086,sterline,1
2087,bav,1
2088,hast,1
2089,cojp,1


In [68]:
freq_df['word'].sort_index()[:5]

0        annex
1    frederick
2          and
3      shirley
4       street
Name: word, dtype: object

In [57]:
print(f'''Original index: {0}
Original value: {freq_df.iloc[0, 0]}
Match index: {1}
Match value: {freq_df.iloc[1, 0]}
Metric output: {rfuzz.ratio(freq_df.iloc[0, 0], freq_df.iloc[1, 0])}
''')

Original index: 0
Original value: bahamas
Match index: 1
Match value: nassau
Metric output: 30.76923076923077



In [73]:
for o_i, o_v in enumerate(freq_df['word'].sort_index()[:5]):
    for m_i, m_v in enumerate(freq_df['word'].sort_index()[:5]):
        if o_i != m_i:
            print([o_i, o_v, m_i, m_v, rfuzz.ratio(o_v, m_v), rfuzz.partial_ratio(o_v, m_v), rfuzz.token_sort_ratio(o_v, m_v), rfuzz.token_set_ratio(o_v, m_v), jaro.jaro_winkler_metric(o_v, m_v)])
            

[0, 'annex', 1, 'frederick', 14.28571428571429, 25.0, 14.28571428571429, 14.285714285714292, 0.43703703703703706]
[0, 'annex', 2, 'and', 50.0, 80.0, 50.0, 50.0, 0.6888888888888888]
[0, 'annex', 3, 'shirley', 16.666666666666664, 28.57142857142857, 16.666666666666664, 16.66666666666667, 0.44761904761904764]
[0, 'annex', 4, 'street', 18.181818181818176, 28.57142857142857, 18.181818181818176, 18.181818181818187, 0.45555555555555555]
[1, 'frederick', 0, 'annex', 14.28571428571429, 25.0, 14.28571428571429, 14.285714285714292, 0.43703703703703706]
[1, 'frederick', 2, 'and', 16.666666666666664, 33.333333333333336, 16.666666666666664, 16.66666666666667, 0.48148148148148145]
[1, 'frederick', 3, 'shirley', 25.0, 40.0, 25.0, 25.0, 0.5026455026455027]
[1, 'frederick', 4, 'street', 40.0, 54.54545454545454, 40.0, 40.0, 0.611111111111111]
[2, 'and', 0, 'annex', 50.0, 80.0, 50.0, 50.0, 0.6888888888888888]
[2, 'and', 1, 'frederick', 16.666666666666664, 33.333333333333336, 16.666666666666664, 16.66666666

In [93]:
# def calc_fuzz_df(df, column):
#     row_list = []
    
#     for o_i, o_v in enumerate(df[column].sort_index()):
#         for m_i, m_v in enumerate(df[column].sort_index()):
#             if o_i != m_i:
#                 dict1 = {
#                     'original_index': o_i,
#                     'original_value': o_v,
#                     'match_index': m_i,
#                     'match_value': m_v,
#                     'ratio_score': rfuzz.ratio(o_v, m_v),
#                     'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
#                     'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
#                     'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
#                     'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
#                 }
#                 row_list.append(dict1)
#     score_df = pd.DataFrame(row_list)
        
#     return score_df

In [94]:
# fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
# fuzzy_words_df

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
0,0,annex,1,frederick,14.285714,25.000000,14.285714,14.285714,0.437037
1,0,annex,2,and,50.000000,80.000000,50.000000,50.000000,0.688889
2,0,annex,3,shirley,16.666667,28.571429,16.666667,16.666667,0.447619
3,0,annex,4,street,18.181818,28.571429,18.181818,18.181818,0.455556
4,0,annex,5,po,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
4370185,2090,2ntl,2085,montagne,33.333333,50.000000,33.333333,33.333333,0.583333
4370186,2090,2ntl,2086,sterline,33.333333,50.000000,33.333333,33.333333,0.583333
4370187,2090,2ntl,2087,bav,0.000000,0.000000,0.000000,0.000000,0.000000
4370188,2090,2ntl,2088,hast,25.000000,40.000000,25.000000,25.000000,0.500000


In [95]:
fuzzy_words_df.describe()

Unnamed: 0,original_index,match_index,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
count,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0,4370190.0
mean,1045.0,1045.0,14.00943,22.4697,14.00803,14.00794,0.2240348
std,603.6197,603.6197,15.02418,22.66403,15.02212,15.02198,0.2557097
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,522.0,522.0,0.0,0.0,0.0,0.0,0.0
50%,1045.0,1045.0,14.28571,25.0,14.28571,14.28571,0.0
75%,1568.0,1568.0,23.52941,40.0,23.52941,23.52941,0.4666667
max,2090.0,2090.0,96.2963,100.0,100.0,100.0,0.9857143


In [97]:
fuzzy_words_df[(fuzzy_words_df['ratio_score']==0) & (fuzzy_words_df['partial_ratio_score']==0) & (fuzzy_words_df['token_sort_score']==0) & (fuzzy_words_df['token_set_score']==0) & (fuzzy_words_df['jaro_winkler_score']==0)]

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
4,0,annex,5,po,0.0,0.0,0.0,0.0,0.0
11,0,annex,12,court,0.0,0.0,0.0,0.0,0.0
14,0,annex,15,lyford,0.0,0.0,0.0,0.0,0.0
22,0,annex,23,third,0.0,0.0,0.0,0.0,0.0
23,0,annex,24,floor,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4370177,2090,2ntl,2077,rbc,0.0,0.0,0.0,0.0,0.0
4370182,2090,2ntl,2082,hamas,0.0,0.0,0.0,0.0,0.0
4370184,2090,2ntl,2084,zh,0.0,0.0,0.0,0.0,0.0
4370187,2090,2ntl,2087,bav,0.0,0.0,0.0,0.0,0.0


In [119]:
def calc_fuzz_df(df, column):
    row_list = []
    
    for o_i, o_v in enumerate(df[column].sort_index()):
        for m_i, m_v in enumerate(df[column].sort_index()):
            if o_i != m_i:
                dict1 = {
                    'original_index': o_i,
                    'original_value': o_v,
                    'match_index': m_i,
                    'match_value': m_v,
                    'ratio_score': rfuzz.ratio(o_v, m_v),
                    'partial_ratio_score': rfuzz.partial_ratio(o_v, m_v),
                    'token_sort_score': rfuzz.token_sort_ratio(o_v, m_v),
                    'token_set_score': rfuzz.token_set_ratio(o_v, m_v),
                    'jaro_winkler_score': jaro.jaro_winkler_metric(o_v, m_v)
                }
                if (dict1['ratio_score']>0) | (dict1['partial_ratio_score']>0) | (dict1['token_sort_score']>0) | (dict1['token_set_score']>0) | (dict1['jaro_winkler_score']>0):
                    row_list.append(dict1)
    score_df = pd.DataFrame(row_list)
        
    return score_df

In [120]:
fuzzy_words_df = calc_fuzz_df(freq_df, 'word')
fuzzy_words_df

Unnamed: 0,original_index,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
0,0,annex,1,frederick,14.285714,25.000000,14.285714,14.285714,0.437037
1,0,annex,2,and,50.000000,80.000000,50.000000,50.000000,0.688889
2,0,annex,3,shirley,16.666667,28.571429,16.666667,16.666667,0.447619
3,0,annex,4,street,18.181818,28.571429,18.181818,18.181818,0.455556
4,0,annex,6,box,25.000000,50.000000,25.000000,25.000000,0.000000
...,...,...,...,...,...,...,...,...,...
2458643,2090,2ntl,2081,tortola,36.363636,50.000000,36.363636,36.363636,0.595238
2458644,2090,2ntl,2083,switzerland,26.666667,33.333333,26.666667,26.666667,0.560606
2458645,2090,2ntl,2085,montagne,33.333333,50.000000,33.333333,33.333333,0.583333
2458646,2090,2ntl,2086,sterline,33.333333,50.000000,33.333333,33.333333,0.583333


In [125]:
fuzzy_words_df.groupby('original_index').count()

Unnamed: 0_level_0,original_value,match_index,match_value,ratio_score,partial_ratio_score,token_sort_score,token_set_score,jaro_winkler_score
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1468,1468,1468,1468,1468,1468,1468,1468
1,1254,1254,1254,1254,1254,1254,1254,1254
2,1303,1303,1303,1303,1303,1303,1303,1303
3,1260,1260,1260,1260,1260,1260,1260,1260
4,1150,1150,1150,1150,1150,1150,1150,1150
...,...,...,...,...,...,...,...,...
2086,1626,1626,1626,1626,1626,1626,1626,1626
2087,879,879,879,879,879,879,879,879
2088,1107,1107,1107,1107,1107,1107,1107,1107
2089,830,830,830,830,830,830,830,830


In [144]:
pd.DataFrame(fuzzy_words_df['original_index'].unique(), columns=['original_index'])

Unnamed: 0,original_index
0,0
1,1
2,2
3,3
4,4
...,...
2086,2086
2087,2087
2088,2088
2089,2089


In [154]:
pd.DataFrame.join?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mother[0m[0;34m:[0m [0;34m'FrameOrSeriesUnion'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhow[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'left'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlsuffix[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrsuffix[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Join columns of another DataFrame.

Join columns with `other` DataFrame e

In [206]:
index_col = 'original_index'
metric_cts = pd.DataFrame(fuzzy_words_df[index_col].unique(), columns=[index_col])

for metric in ['ratio_score', 'partial_ratio_score', 'token_sort_score', 'token_set_score']:
    met_df = fuzzy_words_df.loc[fuzzy_words_df[metric]>50, [index_col, metric]].groupby(index_col).count().reset_index()
    metric_cts = metric_cts.merge(met_df, on=index_col, how='outer')

In [207]:
metric_cts = fuzzy_words_df[[index_col, 'original_value']].drop_duplicates().merge(metric_cts, on=index_col, how='outer')
metric_cts.columns = ['original_index', 'original_value', 'ratio_match_ct', 'partial_ratio_match_ct', 'token_sort_match_ct', 'token_set_match_ct']
metric_cts

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct
0,0,annex,18.0,155.0,18.0,18.0
1,1,frederick,12.0,110.0,12.0,12.0
2,2,and,43.0,250.0,43.0,43.0
3,3,shirley,43.0,133.0,43.0,43.0
4,4,street,67.0,234.0,67.0,67.0
...,...,...,...,...,...,...
2086,2086,sterline,70.0,315.0,70.0,70.0
2087,2087,bav,14.0,115.0,14.0,14.0
2088,2088,hast,31.0,231.0,31.0,31.0
2089,2089,cojp,8.0,104.0,8.0,8.0


In [208]:
metric_cts[metric_cts['original_value'].str.contains('^n\d+|cb\d+')]

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct
7,7,n4805,97.0,238.0,97.0,97.0
14,14,n8188,66.0,225.0,66.0,66.0
18,18,n7785,81.0,198.0,81.0,81.0
19,19,n3708,89.0,271.0,89.0,89.0
25,25,n3024,100.0,267.0,100.0,100.0
27,27,n492,75.0,237.0,75.0,75.0
37,37,cb12399,52.0,143.0,52.0,52.0
43,43,n4875,115.0,268.0,115.0,115.0
55,55,n7768,56.0,190.0,56.0,56.0
60,60,n4755,74.0,227.0,74.0,74.0


In [211]:
metric_cts = metric_cts[~metric_cts['original_value'].str.contains('^n\d+|cb\d+|no\d+|\d+$')]
metric_cts

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct
0,0,annex,18.0,155.0,18.0,18.0
1,1,frederick,12.0,110.0,12.0,12.0
2,2,and,43.0,250.0,43.0,43.0
3,3,shirley,43.0,133.0,43.0,43.0
4,4,street,67.0,234.0,67.0,67.0
...,...,...,...,...,...,...
2086,2086,sterline,70.0,315.0,70.0,70.0
2087,2087,bav,14.0,115.0,14.0,14.0
2088,2088,hast,31.0,231.0,31.0,31.0
2089,2089,cojp,8.0,104.0,8.0,8.0


In [212]:
metric_cts.sort_values('ratio_match_ct', ascending=False).head(1000)

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct
2021,2021,stre,72.0,320.0,72.0,72.0
2086,2086,sterline,70.0,315.0,70.0,70.0
331,331,stret,70.0,297.0,70.0,70.0
4,4,street,67.0,234.0,67.0,67.0
1624,1624,steret,67.0,267.0,67.0,67.0
624,624,stree,65.0,301.0,65.0,65.0
1212,1212,streer,63.0,236.0,63.0,63.0
1476,1476,elite,62.0,284.0,62.0,62.0
969,969,strees,62.0,246.0,62.0,62.0
1167,1167,ste,61.0,291.0,61.0,61.0


In [213]:
metric_cts.sort_values('ratio_match_ct', ascending=False).tail(100)

Unnamed: 0,original_index,original_value,ratio_match_ct,partial_ratio_match_ct,token_sort_match_ct,token_set_match_ct
996,996,bluewood,5.0,74.0,5.0,5.0
1528,1528,pi,5.0,135.0,5.0,5.0
1009,1009,lennox,5.0,80.0,5.0,5.0
1491,1491,dajani,5.0,72.0,5.0,5.0
1488,1488,john,5.0,121.0,5.0,5.0
1084,1084,ibc,5.0,89.0,5.0,5.0
1061,1061,meifoo,5.0,54.0,5.0,5.0
1716,1716,dicks,5.0,85.0,5.0,5.0
1607,1607,douglas,5.0,107.0,5.0,5.0
382,382,2b,5.0,267.0,5.0,5.0


In [None]:
def get_set_ratio(row, str_to_match):
    name = row['address']
    return fuzz.token_set_ratio(name, str_to_match)

def get_sort_ratio(row, str_to_match):
    name = row['address']
    return fuzz.token_sort_ratio(name, str_to_match)

In [None]:
u_addresses = df['working_address'].unique()
for u_address in u_addresses:
    df['score'] = df.apply(get_sort_ratio, args=(u_address,), axis=1)
    match_df = sm_df[sm_df['score']>80].sort_values('score', ascending=False)
    if match_df.shape[0]>1:
        print(u_address)
        display(match_df)
        print('\n')

# Fuzzy Resources

- [Fuzzing matching in pandas with fuzzywuzzy](https://jonathansoma.com/lede/algorithms-2017/classes/fuzziness-matplotlib/fuzzing-matching-in-pandas-with-fuzzywuzzy/)
- [Best Libraries for Fuzzy Matching In Python](https://medium.com/codex/best-libraries-for-fuzzy-matching-in-python-cbb3e0ef87dd)
- [Fuzzy String Matching](https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe)
- [Fuzzy String Comparison](https://stackoverflow.com/a/28467760)
- [df['working_address'][1], df['working_address'][0])](https://www.geeksforgeeks.org/how-to-do-fuzzy-matching-on-pandas-dataframe-column-using-python/)
- []()