### Import libraries

In [1]:
import re
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

In [2]:
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')

### Set params

In [3]:
data_location = '../local/'
file_name = 'sec_edgar_company_info.csv'

### Load tables

In [4]:
df = pd.read_csv(os.path.join(data_location, file_name))
df.shape

(663000, 3)

In [5]:
df.head()

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


### Fuzzy matching

In [6]:
def text_preprocessing(text):
    text = re.sub(r' +', r' ', text)
    return text.lower()

In [7]:
def ngrams(string: str, n=3) -> list:
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [8]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [9]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similarity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similarity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similarity': similarity})
        

In [10]:
def fuzzy_matching(target: list, topn = 10, similarity = .8):
    target = [text_preprocessing(x) for x in target]
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(target)
    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), topn, similarity)
    matches_df = get_matches_df(matches, target, top=100000)
    matches_df = matches_df[matches_df['similarity'] < .9999] # remove all exact matches
    return matches_df.sort_values(['similarity'], ascending=False)

In [11]:
result = fuzzy_matching(df['Company Name'][:100000])

In [12]:
result.head()

Unnamed: 0,left_side,right_side,similarity
73858,"angle light capital, lp - angle light capital - quasar series ii","angle light capital, lp - angle light capital - quasar series i",0.996847
73855,"angle light capital, lp - angle light capital - quasar series i","angle light capital, lp - angle light capital - quasar series ii",0.996847
76586,"apollo european principal finance fund iii (dollar b), l.p.","apollo european principal finance fund ii (dollar b), l.p.",0.993399
76570,"apollo european principal finance fund ii (dollar b), l.p.","apollo european principal finance fund iii (dollar b), l.p.",0.993399
76578,"apollo european principal finance fund ii (euro b), l.p.","apollo european principal finance fund iii (euro b), l.p.",0.993368


In [13]:
# credit: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html