In [24]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
import time


In [10]:
# Load the company list from CSV and preview the first rows.
names =  pd.read_csv('sec__edgar_company_info.csv')
names.head()

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


N-Grams

In [12]:

def ngrams(string, n=3):
    """Split a string into its overlapping character n-grams.

    Before splitting, the characters ',', '-', '.', '/' and every
    occurrence of a whitespace character followed by 'BD' are removed.

    Parameters
    ----------
    string : str
        Text to split.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when the cleaned text is
        shorter than ``n``.
    """
    # NOTE: inside the character class, `,-.` is a range that covers exactly
    # ',', '-' and '.'; '/' is listed separately.
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    # One copy of the text per offset 0..n-1; zipping them column-wise yields
    # each window of n consecutive characters and stops at the shortest copy.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

# Sanity check: show the 3-grams produced for a sample name.
print('All 3-grams in "McDonalds":')
ngrams('McDonalds')

All 3-grams in "McDonalds":


['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']

In [14]:
# Build the TF-IDF matrix over all company names. Passing `ngrams` as the
# analyzer makes the features character 3-grams (its default n) instead of words.
company_names = names['Company Name']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [15]:
# Show the non-zero TF-IDF weights stored for the first company name.
print(tf_idf_matrix[0])

# Check if this makes sense: list the 3-grams of that same name so they can
# be compared with the non-zero features printed above.

ngrams('!J INC')

  (0, 14549)	0.15757684989695478
  (0, 812)	0.14545326532967898
  (0, 14951)	0.517420185391102
  (0, 1395)	0.828425757525274


['!J ', 'J I', ' IN', 'INC']

Cosine Similarity

In [18]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Compute the sparse product ``A @ B`` via ``ct.sparse_dot_topn``.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; both are converted to CSR first.
    ntop : int
        Passed to ``ct.sparse_dot_topn``; also sizes the output buffers at
        ``ntop`` entries per row of ``A``. Presumably the number of largest
        values kept per row -- confirm against the sparse_dot_topn docs.
    lower_bound : float, default 0
        Passed to ``ct.sparse_dot_topn``. Presumably a minimum score for an
        entry to be kept -- confirm against the sparse_dot_topn docs.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers
        handed to ``ct.sparse_dot_topn``.
    """
    # Coerce both operands to CSR; the CSR arrays are what gets passed below.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32

    # Room for at most `ntop` stored results per output row.
    capacity = n_rows * ntop

    # Output buffers in CSR layout, handed to the routine and then wrapped
    # into the returned matrix.
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [20]:
# Time the full self-join of the name matrix against itself.
t1 = time.time()
# Transposing the right operand gives a (names x features) @ (features x names)
# product, i.e. one row and one column per company name.
# ntop=10 and lower_bound=0.8 are forwarded to sparse_dot_topn.
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 4064.3161499500275


In [21]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; entry ``(i, j)`` is the score between the names at
        positions ``i`` and ``j`` of ``name_vector``.
    name_vector : sequence or pandas.Series
        Names, looked up by position (a Series' index labels are ignored, so a
        filtered or re-indexed Series still lines up with the matrix rows).
    top : int or None, default 100
        Maximum number of pairs to return, taken in the matrix's stored order
        (row by row for CSR input). A falsy value (``None`` or ``0``) returns
        every non-zero entry. Values larger than the number of non-zero
        entries are clamped instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the
        misspelling is kept because callers select the column by this name).

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    # Take coordinates and values from the same COO view so they stay aligned.
    # Explicitly stored zeros are dropped from all three arrays together.
    coo = sparse_matrix.tocoo()
    is_nonzero = coo.data != 0
    rows = coo.row[is_nonzero]
    cols = coo.col[is_nonzero]
    scores = coo.data[is_nonzero]

    if top:
        if top < 0:
            raise ValueError("top must be non-negative")
        # Never ask for more pairs than the matrix actually holds.
        nr_matches = min(top, scores.size)
    else:
        nr_matches = scores.size

    # Positional lookup table for the names.
    lookup = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': lookup[rows[:nr_matches]],
                         'right_side': lookup[cols[:nr_matches]],
                         'similairity': scores[:nr_matches].astype(float)})

In [22]:
# Convert at most the first 100000 non-zero entries of `matches` into rows.
matches_df = get_matches_df(matches, company_names, top=100000)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
# Random spot check; no random_state is passed, so the sampled rows differ between runs.
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similairity
65549,AMERICAN CENTURY INVESTMENT MANAGEMENT INC,AMERICAN CENTURY INVESTMENT TRUST,0.801774
69787,AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 1999-C,AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 1999-D,0.919151
15728,"ACTIVANT RN HOLDINGS, LP","ACTIVANT RN CI HOLDINGS, LP",0.852126
26568,ADVISORS DISCIPLINED TRUST 1334,ADVISORS DISCIPLINED TRUST 1333,0.884002
10262,ABRAHAM & LONDON SECURITIES INC /BD,FIRST GRAHAM SECURITIES INC /BD,0.981131
28486,ADVISORS DISCIPLINED TRUST 1509,ADVISORS DISCIPLINED TRUST 150,0.941103
35002,ADVISORS DISCIPLINED TRUST 472,ADVISORS DISCIPLINED TRUST 474,0.870395
69831,AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2000-A,AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2006-1,0.90169
46151,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VA 76,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VA 73,0.936436
79436,"ANGELLIST-OOF-FUND, A SERIES OF ANGELLIST FUNDS, LLC","ANGELLIST-SINT-FUND, A SERIES OF ANGELLIST FUNDS, LLC",0.802388


In [23]:
# Show the ten highest-scoring pairs that remain after dropping exact matches.
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
54674,ALLEN & CO INC /BD,MULLEN & CO INC /BD,0.99893
5861,A P SECURITIES INC /BD,J P SECURITIES INC /BD,0.998636
47650,AGS SECURITIES INC /BD,CS SECURITIES INC /BD,0.998415
99706,ASTOR SECURITIES INC /BD,MENTOR SECURITIES INC /BD,0.998364
63516,AM CAPITAL LLC /BD,ML CAPITAL LLC /BD,0.99834
41599,AE PARTNERS LLC /BD,LANE PARTNERS LLC /BD,0.998318
48229,AI SECURITIES INC /BD,CS SECURITIES INC /BD,0.998301
48230,AI SECURITIES INC /BD,USI SECURITIES INC /BD,0.998287
54147,ALL IN LLC /BD,WYN LLC /BD,0.998228
63517,AM CAPITAL LLC /BD,THOR CAPITAL LLC /BD,0.998185
