In [62]:
import numpy as np
from sklearn.cluster import AffinityPropagation
import distance
    
words = "john juan jose john joan luis louise juans johny jon lois Louis".split(" ") #Replace this line

# Normalise the raw names: lower-case them, then drop exact duplicates.
# The result is kept as a NumPy array so it can be indexed with index arrays later on.
lowered_words = np.char.lower(np.asarray(words))
words = np.unique(lowered_words)

# Because exact duplicates and case differences are collapsed here, the cluster
# labels found below refer to the normalised words only. To label the original
# records we have to map each record back through the same preprocessing
# (lower-casing, stripping "Mr." / "Ltd.", ...) and look up its cluster.

# Pairwise Levenshtein distances, one row per word.
edit_distances = np.array(
    [[distance.levenshtein(col_word, row_word) for col_word in words] for row_word in words]
)
# Negate the distances so that larger values mean "more similar".
similarity = -1 * edit_distances

similarity

array([[ 0, -1, -2, -1, -2, -1, -2, -3, -4, -5, -4],
       [-1,  0, -1, -1, -2, -2, -3, -3, -4, -5, -4],
       [-2, -1,  0, -2, -3, -3, -3, -4, -4, -5, -5],
       [-1, -1, -2,  0, -2, -2, -3, -3, -4, -5, -4],
       [-2, -2, -3, -2,  0, -3, -4, -3, -4, -3, -4],
       [-1, -2, -3, -2, -3,  0, -1, -4, -4, -5, -3],
       [-2, -3, -3, -3, -4, -1,  0, -4, -4, -5, -3],
       [-3, -3, -4, -3, -3, -4, -4,  0, -1, -2, -1],
       [-4, -4, -4, -4, -4, -4, -4, -1,  0, -1, -1],
       [-5, -5, -5, -5, -3, -5, -5, -2, -1,  0, -2],
       [-4, -4, -5, -4, -4, -3, -3, -1, -1, -2,  0]])

In [63]:
# Cluster directly on the precomputed similarity matrix.
affprop = AffinityPropagation(affinity="precomputed", damping=0.5, random_state=None)
affprop.fit(similarity)

word_labels = affprop.labels_
exemplar_positions = affprop.cluster_centers_indices_
for cluster_id in np.unique(word_labels):
    # The exemplar is the word the algorithm picked as the cluster's representative.
    exemplar = words[exemplar_positions[cluster_id]]
    # `words` went through np.unique already, so the members need no further de-duplication.
    cluster = words[word_labels == cluster_id]
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))


 - *john:* joan, john, johny, jon, jose
 - *juan:* juan, juans
 - *louis:* lois, louis, louise, luis


In [88]:
import itertools
# We are simulating a scenario where the individual (pairwise) distances are
# already available and the full matrix has to be constructed from them.
#
# Each unordered pair of words appears exactly once as a key; the value is the
# negated Levenshtein distance, i.e. a similarity. Keys are cast to plain `str`
# so the printed dict reads ('joan', 'john') instead of NumPy scalar reprs.
lev_pairwise_dict = {
    (str(w1), str(w2)): -distance.levenshtein(w1, w2)
    for w1, w2 in itertools.combinations(words, 2)
}
print(lev_pairwise_dict)

{('joan', 'john'): -1, ('joan', 'johny'): -2, ('joan', 'jon'): -1, ('joan', 'jose'): -2, ('joan', 'juan'): -1, ('joan', 'juans'): -2, ('joan', 'lois'): -3, ('joan', 'louis'): -4, ('joan', 'louise'): -5, ('joan', 'luis'): -4, ('john', 'johny'): -1, ('john', 'jon'): -1, ('john', 'jose'): -2, ('john', 'juan'): -2, ('john', 'juans'): -3, ('john', 'lois'): -3, ('john', 'louis'): -4, ('john', 'louise'): -5, ('john', 'luis'): -4, ('johny', 'jon'): -2, ('johny', 'jose'): -3, ('johny', 'juan'): -3, ('johny', 'juans'): -3, ('johny', 'lois'): -4, ('johny', 'louis'): -4, ('johny', 'louise'): -5, ('johny', 'luis'): -5, ('jon', 'jose'): -2, ('jon', 'juan'): -2, ('jon', 'juans'): -3, ('jon', 'lois'): -3, ('jon', 'louis'): -4, ('jon', 'louise'): -5, ('jon', 'luis'): -4, ('jose', 'juan'): -3, ('jose', 'juans'): -4, ('jose', 'lois'): -3, ('jose', 'louis'): -4, ('jose', 'louise'): -3, ('jose', 'luis'): -4, ('juan', 'juans'): -1, ('juan', 'lois'): -4, ('juan', 'louis'): -4, ('juan', 'louise'): -5, ('juan'

In [89]:

# Flatten the (word, word) key tuples into one collection of lower-cased ids;
# the set removes repeats and sorted() fixes the row/column order of the matrix.
# The comprehension is equivalent to two nested loops (over the keys, then over
# the two elements of each key) appending x.lower() to a list, followed by
# sorted(set(...)); the explicit loop is arguably a little easier to read.
unique_ids = sorted({name.lower() for pair in lev_pairwise_dict for name in pair})
unique_ids

['joan',
 'john',
 'johny',
 'jon',
 'jose',
 'juan',
 'juans',
 'lois',
 'louis',
 'louise',
 'luis']

In [90]:
import pandas as pd
# Build the square similarity matrix as a labelled DataFrame.
# The frame starts as integer zeros: this supplies the diagonal (the dict has no
# (w, w) pairs) and avoids creating an object-dtype frame of NaN that would then
# depend on fillna() silently downcasting back to integers.
df = pd.DataFrame(0, index=unique_ids, columns=unique_ids)
for k, v in lev_pairwise_dict.items():
    # Each unordered pair is stored once, so mirror it to keep the matrix symmetric.
    df.loc[k[0], k[1]] = v
    df.loc[k[1], k[0]] = v
similarity2=np.array(df)
print(unique_ids)
df

['joan', 'john', 'johny', 'jon', 'jose', 'juan', 'juans', 'lois', 'louis', 'louise', 'luis']


Unnamed: 0,joan,john,johny,jon,jose,juan,juans,lois,louis,louise,luis
joan,0,-1,-2,-1,-2,-1,-2,-3,-4,-5,-4
john,-1,0,-1,-1,-2,-2,-3,-3,-4,-5,-4
johny,-2,-1,0,-2,-3,-3,-3,-4,-4,-5,-5
jon,-1,-1,-2,0,-2,-2,-3,-3,-4,-5,-4
jose,-2,-2,-3,-2,0,-3,-4,-3,-4,-3,-4
juan,-1,-2,-3,-2,-3,0,-1,-4,-4,-5,-3
juans,-2,-3,-3,-3,-4,-1,0,-4,-4,-5,-3
lois,-3,-3,-4,-3,-3,-4,-4,0,-1,-2,-1
louis,-4,-4,-4,-4,-4,-4,-4,-1,0,-1,-1
louise,-5,-5,-5,-5,-3,-5,-5,-2,-1,0,-2


In [91]:
# Display the NumPy matrix that was extracted from the pandas frame.
similarity2

array([[ 0, -1, -2, -1, -2, -1, -2, -3, -4, -5, -4],
       [-1,  0, -1, -1, -2, -2, -3, -3, -4, -5, -4],
       [-2, -1,  0, -2, -3, -3, -3, -4, -4, -5, -5],
       [-1, -1, -2,  0, -2, -2, -3, -3, -4, -5, -4],
       [-2, -2, -3, -2,  0, -3, -4, -3, -4, -3, -4],
       [-1, -2, -3, -2, -3,  0, -1, -4, -4, -5, -3],
       [-2, -3, -3, -3, -4, -1,  0, -4, -4, -5, -3],
       [-3, -3, -4, -3, -3, -4, -4,  0, -1, -2, -1],
       [-4, -4, -4, -4, -4, -4, -4, -1,  0, -1, -1],
       [-5, -5, -5, -5, -3, -5, -5, -2, -1,  0, -2],
       [-4, -4, -5, -4, -4, -3, -3, -1, -1, -2,  0]])

In [94]:
from functools import lru_cache
from timeit import repeat
# Rebuild the same symmetric matrix with plain NumPy instead of pandas.
# The float zeros also provide the diagonal, since the dict holds no (w, w) pairs.
similarity3=np.zeros((len(unique_ids),len(unique_ids)))
@lru_cache(None)
def find_index(k):
    """Return the position of id ``k`` in ``unique_ids`` (memoised per id).

    Raises:
        ValueError: propagated from ``list.index`` when ``k`` is not in ``unique_ids``.
    """
    return unique_ids.index(k)
for k, v in lev_pairwise_dict.items():
    # Only the lookups can fail, so only they are guarded; a missing id is
    # reported with the offending pair and its value.
    try:
        ki0=find_index(k[0])
        ki1=find_index(k[1])
    except ValueError as err:
        raise ValueError("Element not found in headers: %r (value %r)" % (k, v)) from err
    # Mirror each pair so the matrix stays symmetric.
    similarity3[ki0, ki1 ]= v
    similarity3[ki1, ki0 ]= v

similarity3

array([[ 0., -1., -2., -1., -2., -1., -2., -3., -4., -5., -4.],
       [-1.,  0., -1., -1., -2., -2., -3., -3., -4., -5., -4.],
       [-2., -1.,  0., -2., -3., -3., -3., -4., -4., -5., -5.],
       [-1., -1., -2.,  0., -2., -2., -3., -3., -4., -5., -4.],
       [-2., -2., -3., -2.,  0., -3., -4., -3., -4., -3., -4.],
       [-1., -2., -3., -2., -3.,  0., -1., -4., -4., -5., -3.],
       [-2., -3., -3., -3., -4., -1.,  0., -4., -4., -5., -3.],
       [-3., -3., -4., -3., -3., -4., -4.,  0., -1., -2., -1.],
       [-4., -4., -4., -4., -4., -4., -4., -1.,  0., -1., -1.],
       [-5., -5., -5., -5., -3., -5., -5., -2., -1.,  0., -2.],
       [-4., -4., -5., -4., -4., -3., -3., -1., -1., -2.,  0.]])

In [95]:
# NumPy copy of the sorted id list, so ids can be selected with index arrays / masks.
array_ids = np.asarray(unique_ids)
array_ids

array(['joan', 'john', 'johny', 'jon', 'jose', 'juan', 'juans', 'lois',
       'louis', 'louise', 'luis'], dtype='<U6')

In [100]:
# Same clustering as before, now on the matrix rebuilt from the pairwise dict
# and with a higher damping factor.
affprop = AffinityPropagation(affinity="precomputed", damping=0.8, random_state=None)
affprop.fit(similarity3)

id_labels = affprop.labels_
exemplar_positions = affprop.cluster_centers_indices_
for cluster_id in np.unique(id_labels):
    # Rows/columns of similarity3 follow unique_ids, so array_ids maps positions back to names.
    exemplar = array_ids[exemplar_positions[cluster_id]]
    cluster = np.unique(array_ids[id_labels == cluster_id])
    cluster_str = ", ".join(cluster)
    print("%s: %s" % (exemplar, cluster_str))

john: john, johny, jon, jose
juan: joan, juan, juans
louis: lois, louis, louise, luis


In [98]:
from sklearn.cluster import DBSCAN
# NOTE(review): no `metric` argument is passed, so DBSCAN presumably treats each
# row of similarity3 as a feature vector (scikit-learn's default metric) rather
# than reading the matrix as pairwise distances, and eps=3 is then a radius in
# that row space, not an edit distance. If clustering on edit distance was the
# intent, this would need metric="precomputed" on the non-negative distances
# (-similarity3) and a re-tuned eps -- confirm before relying on these clusters.
dbscan = DBSCAN(eps=3,min_samples=1)
dbscan.fit(similarity3)

dbscan_labels = dbscan.labels_
print (dbscan_labels)
for cluster_id in np.unique(dbscan_labels):
    # Labels are in the row order of similarity3, which matches array_ids.
    cluster = np.unique(array_ids[dbscan_labels == cluster_id])
    cluster_str = ", ".join(cluster)
    print("%s: %s" % (cluster_id, cluster_str))

[0 0 0 0 1 0 0 2 2 3 2]
0: joan, john, johny, jon, juan, juans
1: jose
2: lois, louis, luis
3: louise


In [99]:
# Raw cluster label per id from the DBSCAN fit, in the same order as array_ids.
dbscan.labels_

array([0, 0, 0, 0, 1, 0, 0, 2, 2, 3, 2])