In [2]:
import numpy as np
import pandas as pd
from functools import lru_cache

Manual implementation of a levenshtein function

From
https://rosettacode.org/wiki/Levenshtein_distance#Memoized_recursion
and 
https://www.datacamp.com/community/tutorials/fuzzy-string-python
and 


In [3]:
from functools import lru_cache
@lru_cache(maxsize=4095)
def ld(s, t):
    if not s: return len(t)
    if not t: return len(s)
    if s[0] == t[0]: return ld(s[1:], t[1:])
    l1 = ld(s, t[1:])
    l2 = ld(s[1:], t)
    l3 = ld(s[1:], t[1:])
    return 1 + min(l1, l2, l3)
assert ld("kitten","sitting") == 3, "Should be 3"
assert ld("rosettacode","raisethysword") == 8, "Should be 8"
assert ld("Paul jones","paul Jones") == 2, "Should be 2"

In [4]:
from functools import lru_cache
import string
l=["John Smith","Joe.Blogger","James O'Reilly", "Peter Woods","Michael Collins","John Smith","Smith, John","John Smitt", "James O'Reilly Jr.", "M Collins", "John Smith", "Peterson, Paul"]
lk=list(range(1,len(l)+1))
@lru_cache(maxsize=4095)
def pre(s):
    s=s.translate(str.maketrans('', '', string.punctuation))
    s= str.lower(s)
    sl= str.split(s)
    sl.sort()
    s= "".join(sl)
    return s
lpre=[pre(s) for s in l]
lpre


['johnsmith',
 'joeblogger',
 'jamesoreilly',
 'peterwoods',
 'collinsmichael',
 'johnsmith',
 'johnsmith',
 'johnsmitt',
 'jamesjroreilly',
 'collinsm',
 'johnsmith',
 'paulpeterson']

In [5]:
from itertools import combinations
item_combinations=list(combinations(lpre, 2))
print(len(item_combinations), "combinations to compute. Careful if this number is large, it may take a long time")

66 combinations to compute. Careful if this number is large, it may take a long time


In [6]:
#%%timeit -n1
matches = []
for x in item_combinations:
    d = ld(x[0],x[1])
    r=(len(x[0]+x[1]) - d) / (len(x[0]+x[1]))
    if r > .80:
        matches.append([x[0], x[1], d,  round(r*100)]) 
df_matches=pd.DataFrame(matches, columns=['x1','x2','distance','ratio'])

In [7]:
#we keep the very top but we retain the original matches to check if we have a lot more
likely_matches = df_matches[df_matches.ratio >= 0.95]
likely_matches= likely_matches.drop_duplicates()
print(likely_matches.shape)
likely_matches

(4, 4)


Unnamed: 0,x1,x2,distance,ratio
0,johnsmith,johnsmith,0,100
2,johnsmith,johnsmitt,1,94
4,jamesoreilly,jamesjroreilly,2,92
10,johnsmitt,johnsmith,1,94


In [8]:
def ld2(seq1, seq2):
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )
    #print (matrix)
    return (matrix[size_x - 1, size_y - 1])
assert ld2("kitten","sitting") == 3, "Should be 3"
assert ld2("rosettacode","raisethysword") == 8, "Should be 8"
assert ld2("Paul jones","paul Jones") == 2, "Should be 2"

In [9]:
from array import array
@lru_cache(maxsize=None)
#@jit(nopython=True)
def ld3(a, b, mx=-1):   
    def result(d): return d if mx < 0 else False if d > mx else True
    if a == b: return result(0)
    la, lb = len(a), len(b)
    if mx >= 0 and abs(la - lb) > mx: return result(mx+1)
    if la == 0: return result(lb)
    if lb == 0: return result(la)
    if lb > la: a, b, la, lb = b, a, lb, la
    cost = array('i', range(lb + 1))
    for i in range(1, la + 1):
        cost[0] = i; ls = i-1; mn = ls
        for j in range(1, lb + 1):
            ls, act = cost[j], ls + int(a[i-1] != b[j-1])
            cost[j] = min(ls+1, cost[j-1]+1, act)
            if (ls < mn): mn = ls
        if mx >= 0 and mn > mx: return result(mx+1)
    if mx >= 0 and cost[lb] > mx: return result(mx+1)
    return result(cost[lb])
 
print(
    ld3('kitten','kitten'), # 0
    ld3('kitten','sitten'), # 1
    ld3('kitten','sittes'), # 2
    ld3('kitten','sityteng'), # 3
    ld3('kitten','sittYing'), # 4
    ld3('rosettacode','raisethysword'), # 8 
    ld3('kitten','kittenaaaaaaaaaaaaaaaaa'), # 17
    ld3('kittenaaaaaaaaaaaaaaaaa','kitten') # 17
)

print(
    ld3('kitten','kitten',6), # True
    ld3('kitten','sitten',6), # True
    ld3('kitten','sittes',6), # True
    ld3('kitten','sityteng',6), # True
    ld3('kitten','sittYing',6), # False
    ld3('rosettacode','raisethysword',6), # False
    ld3('kitten','kittenaaaaaaaaaaaaaaaaa',6), # False
    ld3('kittenaaaaaaaaaaaaaaaaa','kitten',6) # False
)

 

 

0 1 2 3 4 8 17 17
True True True True True False False False
