In [None]:
!pip install sparse_dot_topn 

The problem with Fuzzy Matching on large data

There are many algorithms that can provide fuzzy matching (see here how to implement in Python), but they quickly fall down when used on even modest data sets of more than a few thousand records.
The reason for this is that they compare each record to all the other records in the data set. In computer science, this is known as quadratic time and can quickly form a barrier when dealing with larger data sets.
A relatively small data set of 10k records would require 100m comparisons (10,000 × 10,000).

https://colab.research.google.com/drive/1qhBwDRitrgapNhyaHGxCW8uKK5SWJblW
    

https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

Data for this case obtained from:

https://www.gov.uk/contracts-finder

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn.sparse_dot_topn as ct
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import re

In [19]:
# Load the Contracts Finder notices export.
df_full = pd.read_csv("./test_data/notices.csv")
# The following cells inspect `df`, but no visible cell defines it; alias it to
# the loaded frame so the notebook runs from a fresh kernel.
# NOTE(review): if `df` was meant to be a subset/sample of `df_full`, restore that step here.
df = df_full
df.shape

(1000, 43)

In [20]:
# Preview the first five rows of the notices frame.
df.head(n=5)

Unnamed: 0,Notice Identifier,Notice Type,Organisation Name,Status,Published Date,Title,Description,Nationwide,Postcode,Region,...,Value High,Awarded Date,Awarded Value,Supplier [Name|Address|Ref type|Ref Number|Is SME|Is VCSE],Supplier's contact name,Contract start date,Contract end date,OJEU Procedure Type,Accelerated Justification,Closing Time
0,ENFLD001-DN480948-75134215,Contract,London Borough of Enfield,Open,2020-09-11T21:26:08Z,Procurement of Extra Care Services,To provide high-quality adult social care serv...,,,London,...,6993742.0,,,,,,,Restricted,,13:00
1,2021-2019,Contract,SOUTH YORKSHIRE POLICE AND CRIME COMMISSIONER,Awarded,2020-09-11T20:08:55Z,"Telephony Upgrade, Support & Maintenance",** Please note that this is a Contract Award N...,,,Yorkshire and the Humber,...,,28/07/2020,815000.0,"[4NET TECHNOLOGIES LTD|3 Scholar Green Road, S...",Jeremy Astin,01/09/2020,31/08/2027,CallOffFromFrameworkAgreement,,12:00
2,20200911200029-72814,Contract,United Kingdom Atomic Energy Authority,Open,2020-09-11T20:00:33Z,ExCALIBUR Fusion Modelling System. FM-WP1 Nume...,ExCALIBUR Fusion Modelling System\r\nFM-WP1 Nu...,,,Any region,...,252000.0,,,,,,,Open,,12:00
3,FSCS SS 031,Contract,FINANCIAL SERVICES COMPENSATION SCHEME LIMITED,Awarded,2020-09-11T19:37:46Z,Real-time GBR address verification,Data capture solution that offers real-time GB...,,,United Kingdom,...,,28/08/2020,81933.0,[Experian Limited|Sir John Peace Building Expe...,,01/09/2020,31/08/2021,SingleTenderActionNonOJEU,,00:00
4,tender_242636/872111,Contract,capitalEsourcing,Open,2020-09-11T19:28:37Z,WCC Right of Light Framework Agreement,The Westminster City Council seeks expressions...,,SW1E 6QP,London,...,14000000.0,,,,,,,OpenOJEU,,12:00


In [15]:
try:
    # `fix_text` lives in the optional third-party `ftfy` package; no import cell
    # in this notebook brings it into scope, so resolve it here.
    from ftfy import fix_text as _fix_text
except ImportError:
    import unicodedata

    def _fix_text(text):
        """Minimal stand-in used when `ftfy` is not installed: NFC normalisation only."""
        return unicodedata.normalize("NFC", text)


def ngrams(string, n=3):
    """Normalise a name and split it into overlapping character n-grams.

    Intended as the ``analyzer`` callable for ``TfidfVectorizer``.

    Steps: repair text encoding, drop non-ASCII characters, strip a fixed set of
    punctuation, replace ``&`` with ``and``, turn commas and hyphens into spaces,
    title-case, collapse repeated spaces, pad with one space on each side, then
    emit every run of ``n`` consecutive characters.

    Parameters
    ----------
    string : str
        The raw name to tokenise.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when the cleaned, padded
        string is shorter than ``n``.
    """
    string = _fix_text(string)  # fix text encoding issues
    string = string.encode("ascii", errors="ignore").decode()  # remove non ascii chars
    string = string.lower()  # make lower case
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)  # remove the list of chars defined above
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title()  # normalise case - capital at start of each word
    string = re.sub(' +', ' ', string).strip()  # collapse multiple spaces into one
    string = ' ' + string + ' '  # pad names so word boundaries appear in the n-grams
    # Commas, hyphens and dots are already gone at this point, so in practice this
    # only strips '/'.  NOTE(review): the `\sBD` alternative looks unreachable after
    # title-casing (it would read " Bd") - confirm intent before relying on it.
    string = re.sub(r'[,-./]|\sBD', r'', string)
    # Zip n progressively shifted copies of the string to get a sliding window.
    grams = zip(*[string[i:] for i in range(n)])
    return [''.join(gram) for gram in grams]

You could use the cosine similarity function from scikit-learn here; however, it is not the most efficient way of finding close matches, as it returns a closeness score for every item in the dataset for each sample. Instead, we are going to use a faster implementation, which can be found here:
https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [6]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product of ``A`` and ``B`` computed by ``ct.sparse_dot_topn``.

    Both inputs are converted to CSR, output buffers sized for ``ntop`` entries
    per row of ``A`` are allocated, and the compiled routine fills them in place.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; anything exposing ``.tocsr()``.
    ntop : int
        Forwarded to ``ct.sparse_dot_topn``; also fixes the output buffer size
        at ``A.shape[0] * ntop`` entries.
    lower_bound : number, default 0
        Forwarded unchanged to ``ct.sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the filled buffers.
    """
    # tocsr() is a no-op when the operand is already CSR.
    left = A.tocsr()
    right = B.tocsr()
    n_rows, _ = left.shape
    _, n_cols = right.shape

    index_type = np.int32
    capacity = n_rows * ntop

    # Buffers that the compiled routine writes its result into.
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    def as_index(values):
        # The routine is handed int32 index arrays.
        return np.asarray(values, dtype=index_type)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        as_index(left.indptr),
        as_index(left.indices),
        left.data,
        as_index(right.indptr),
        as_index(right.indices),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )
    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
# `names` / 'buyer' are not defined anywhere in this notebook; the buyer names of
# the loaded notices live in the 'Organisation Name' column of `df_full`.
# Missing values are dropped because the `ngrams` analyzer expects strings.
org_names = df_full['Organisation Name'].dropna().unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(org_names)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Unpack a sparse similarity matrix into a DataFrame of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; entry ``(i, j)`` relates ``name_vector[i]`` to
        ``name_vector[j]``.
    name_vector : sequence
        Names, indexable by the row/column positions of ``sparse_matrix``.
    top : int, default 100
        Maximum number of pairs to return. A falsy value (``0``/``None``)
        returns every non-zero pair. Values larger than the number of
        non-zero entries are capped instead of raising ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (spelling kept
        because other cells filter on that column name).
    """
    # Take rows, columns and scores from the same COO view so they stay aligned;
    # the mask mirrors `nonzero()`, which skips explicitly stored zeros.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    sparserows = coo.row[keep]
    sparsecols = coo.col[keep]
    scores = coo.data[keep]

    available = sparsecols.size
    nr_matches = min(top, available) if top else available

    left_side = np.empty(nr_matches, dtype=object)
    right_side = np.empty(nr_matches, dtype=object)
    left_side[:] = [name_vector[i] for i in sparserows[:nr_matches]]
    right_side[:] = [name_vector[j] for j in sparsecols[:nr_matches]]
    similairity = np.asarray(scores[:nr_matches], dtype=float)

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

In [None]:
# Score every organisation name against every other one. `matches` and
# `company_names` were never defined in this notebook; the names that were
# vectorised above are `org_names`.
# NOTE(review): ntop=10 and lower_bound=0.85 are starting values - tune as needed.
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
# Cap `top` at the number of available pairs so the unpacking loop cannot run
# past the end of the match arrays.
matches_df = get_matches_df(matches, org_names, top=min(100000, matches.count_nonzero()))
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
# Fixed seed keeps the displayed sample reproducible; guard against < 20 rows.
matches_df.sample(min(20, len(matches_df)), random_state=0)

## Record linkage and a different approach

If we want to use this technique to match against another data source, we can reuse the majority of our code. In the section below we will see how this is achieved, and also use the K-Nearest Neighbours algorithm as an alternative closeness measure.
The dataset we would like to join on is a set of ‘clean’ organisation names created by the Office for National Statistics (ONS):

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re

# Reference ("clean") organisation names: first six columns of the ONS workbook.
clean_org_names = pd.read_excel('Gov Orgs ONS.xlsx')
clean_org_names = clean_org_names.iloc[:, 0:6]
org_name_clean = clean_org_names['Institutions'].unique()

print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(org_name_clean)
print('Vectorizing completed...')

# Single nearest neighbour per query, searched on all available cores.
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

# `names` / 'buyer' are not defined anywhere in this notebook; the messy buyer
# names are the 'Organisation Name' column of the loaded notices.
org_column = 'Organisation Name' #column to match against in the messy data
# De-duplicated list (not a set) so the row order is the same on every run;
# missing values are dropped because the analyzer expects strings.
unique_org = list(df_full[org_column].dropna().unique())
###matching query:
def getNearestN(query):
    """Look up the nearest fitted neighbour(s) for each entry of ``query``.

    ``query`` is transformed with the module-level ``vectorizer`` and passed to
    ``nbrs.kneighbors``; the resulting ``(distances, indices)`` pair is returned.
    """
    query_tfidf = vectorizer.transform(query)
    nearest_distances, nearest_indices = nbrs.kneighbors(query_tfidf)
    return nearest_distances, nearest_indices

import time

# Freeze the query order *before* searching so that position i in the results
# is guaranteed to correspond to unique_org[i].
unique_org = list(unique_org)

t1 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(unique_org)
t = time.time()-t1
print("COMPLETED IN:", t)

print('finding matches...')
# The neighbour indices point into `org_name_clean` (the array the vectorizer
# was fitted on), not into the rows of `clean_org_names`; the two differ
# whenever 'Institutions' has duplicates or is not the first column.
matches = [
    [round(distances[i][0], 2), org_name_clean[j[0]], unique_org[i]]
    for i, j in enumerate(indices)
]
print('Building data frame...')
matches = pd.DataFrame(matches, columns=['Match confidence (lower is better)','Matched name','Origional name'])
print('Done')