In [1]:
import numpy as np

In [None]:
def jaro(string1, string2):
    '''Return the Jaro similarity of two strings as a float in [0, 1].

    Reference: "An Application of the Fellegi-Sunter Model of Record Linkage
    to the 1990 U.S. Decennial Census", William E. Winkler and Yves
    Thibaudeau, U.S. Bureau of the Census.
    See also: https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/

    Parameters
    ----------
    string1, string2 : str
        The strings to compare.

    Returns
    -------
    float
        0.0 when the strings share no character (this includes the case
        where either string is empty), 1.0 when they are equal, otherwise
        (m/len(string1) + m/len(string2) + (m - t)/m) / 3, where m is the
        number of matched characters and t is half the number of matched
        characters that appear in a different order.
    '''
    # Without a single shared character there can be no match.
    if not set(string1) & set(string2):
        return 0.0

    if string1 == string2:
        return 1.0

    len1, len2 = len(string1), len(string2)

    # A character of string1 may only match a character of string2 that is
    # at most `window` positions away from it.
    window = max(len1, len2) // 2 - 1

    # Boolean flags record which positions are already matched. Flags are used
    # instead of a placeholder character so that every input character
    # (including '/') is treated like any other.
    matched1 = [False] * len1
    matched2 = [False] * len2

    # matches
    m = 0
    for i, char in enumerate(string1):
        lower = max(0, i - window)
        upper = min(len2, i + window + 1)  # exclusive bound, clipped to string2
        for j in range(lower, upper):
            if not matched2[j] and string2[j] == char:
                matched1[i] = True
                matched2[j] = True
                m += 1
                break  # each character of string1 is matched at most once
    if m == 0:
        return 0.0

    # transpositions: compare the matched characters of both strings in their
    # original order; every position that differs is half a transposition.
    sequence1 = [c for c, hit in zip(string1, matched1) if hit]
    sequence2 = [c for c, hit in zip(string2, matched2) if hit]
    t = sum(a != b for a, b in zip(sequence1, sequence2)) / 2

    return (m / len1 + m / len2 + (m - t) / m) / 3

In [2]:
def jaro_winkler(string1, string2, scaling=0.1):
    '''Return the Jaro-Winkler similarity of two strings.

    The Jaro similarity is computed with `jaro` and, when it is at least
    0.6, increased by `prefix * scaling * (1 - jaro)`, where `prefix` is the
    number of leading characters (at most 5) that are equal in both strings.

    Parameters
    ----------
    string1, string2 : str
        The strings to compare.
    scaling : float, optional
        Weight given to each character of the common prefix (default 0.1).

    Returns
    -------
    number
        1 when the Jaro similarity is 1, the unmodified Jaro similarity when
        it is below 0.6, otherwise the prefix-boosted similarity.
    '''
    boost_threshold = 0.6  # below this similarity no prefix bonus is applied
    max_prefix = 5         # number of leading characters inspected

    jaro_dist = jaro(string1, string2)

    if jaro_dist == 1:
        return 1

    if jaro_dist >= boost_threshold:
        # Count equal leading characters, stopping at the first difference.
        prefix = 0
        for char1, char2 in zip(string1[:max_prefix], string2[:max_prefix]):
            if char1 != char2:
                break
            prefix += 1
        return jaro_dist + prefix * scaling * (1 - jaro_dist)

    return jaro_dist

In [None]:
%%time
## unit test: see p. 13 in above cited paper
s9 = 'cunningham'
s10='cunnigham'
jaro_winkler(s9,s10) # should be .9833, is: 0.98333333..