# Fast Text Matching

In [1]:
import pandas as pd
from function import TextMatch

## 1. Pipeline

In [2]:
# Load the raw company list; the printed shape is a quick sanity check on the read.
df = pd.read_csv('data.csv')
print(df.shape)

# Normalise names (lower-cased, surrounding whitespace trimmed) with the
# vectorised .str accessor. Unlike a per-row lambda calling x.lower(), this
# leaves a missing name as NaN instead of raising AttributeError on a float;
# drop or fill such rows before matching if the data contains any.
df['company_edit_name'] = df['Company Name'].str.lower().str.strip()
df.head()

(50000, 4)


Unnamed: 0,Line Number,Company Name,Company CIK Key,company_edit_name
0,1,!J INC,1438823,!j inc
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607,"#1 a lifesafer holdings, inc."
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512,#1 arizona discount properties llc
3,4,#1 PAINTBALL CORP,1433777,#1 paintball corp
4,5,$ LLC,1427189,$ llc


In [3]:
# Distinct normalised names, in order of first appearance; the same list is
# used as both sides of the match so every name is compared with all others.
item = pd.unique(df['company_edit_name']).tolist()

match_df = TextMatch(item, item).run_match()
match_df

Process vectorize: 2.3s
Process optimized: 15.09s
49,627 items in BASE match 49,627 items in SOURCE with top 5 match and similarity threshold: 0.5


Unnamed: 0,base,source,similarity,rank
0,!j inc,!j inc,1.00,1
1,"#1 a lifesafer holdings, inc.","#1 a lifesafer holdings, inc.",1.00,1
2,#1 arizona discount properties llc,#1 arizona discount properties llc,1.00,1
3,#1 arizona discount properties llc,arizona income properties llc,0.61,2
4,#1 paintball corp,#1 paintball corp,1.00,1
...,...,...,...,...
191912,babb jack j,babb james g. iii,0.57,2
191913,babb jack j,allen jack j,0.54,3
191914,babb jack j,africk jack,0.51,4
191915,babb james g. iii,babb james g. iii,1.00,1


In [4]:
# Show the matches recorded for a single base name.
match_df.query('base == "aim variable insurance funds"')

Unnamed: 0,base,source,similarity,rank
68398,aim variable insurance funds,aim variable insurance funds,1.0,1
68399,aim variable insurance funds,aim variable insurance funds inc,0.97,2
68400,aim variable insurance funds,aim variable insurance funds (invesco variable...,0.88,3
68401,aim variable insurance funds,aip variable insurance trust,0.68,4
68402,aim variable insurance funds,access variable insurance trust,0.68,5


## 2. Reproduce

In [5]:
# Small toy corpus reused by the following cells to walk through the
# matching steps by hand.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Match the corpus against itself with the packaged matcher.
TextMatch(corpus, corpus).run_match()

Process vectorize: 0.0s
Process optimized: 0.01s
4 items in BASE match 4 items in SOURCE with top 5 match and similarity threshold: 0.5


Unnamed: 0,base,source,similarity,rank
0,This is the first document.,This is the first document.,1.0,1
1,This is the first document.,Is this the first document?,0.79,2
2,This is the first document.,This document is the second document.,0.54,3
3,This document is the second document.,This document is the second document.,1.0,1
4,This document is the second document.,This is the first document.,0.54,2
5,And this is the third one.,And this is the third one.,1.0,1
6,Is this the first document?,Is this the first document?,1.0,1
7,Is this the first document?,This is the first document.,0.79,2


### 2.1 N-grams

In [6]:
from re import sub


def ngrams_func(string, n=3):
    """Return the overlapping character n-grams of ``string``.

    Before splitting, the characters ``,``, ``-``, ``.`` and ``/`` are removed,
    as is any whitespace character immediately followed by ``BD``.

    Args:
        string: Text to split.
        n: Length of each n-gram (default 3).

    Returns:
        A list of ``n``-character substrings taken at every offset of the
        cleaned text; empty when the cleaned text is shorter than ``n``.
    """
    cleaned = sub(r'[,-./]|\sBD', r'', string)
    # One copy of the text per offset 0..n-1; zipping them lines up the
    # characters of each window and stops at the shortest copy.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

# Show the n-grams produced for the first toy document.
print(ngrams_func(corpus[0]))

['Thi', 'his', 'is ', 's i', ' is', 'is ', 's t', ' th', 'the', 'he ', 'e f', ' fi', 'fir', 'irs', 'rst', 'st ', 't d', ' do', 'doc', 'ocu', 'cum', 'ume', 'men', 'ent']


### 2.2 TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Pass the character n-gram splitter as the analyzer so the vocabulary is
# built from n-grams instead of whole words.
vectorizer = TfidfVectorizer(analyzer=ngrams_func)
# Fit on the toy corpus and keep its TF-IDF matrix for the next cells.
X = vectorizer.fit_transform(corpus)
# List the n-grams that make up the columns of X.
print(vectorizer.get_feature_names_out())

[' do' ' fi' ' is' ' on' ' se' ' th' 'And' 'Is ' 'Thi' 'con' 'cum' 'd d'
 'd o' 'd t' 'doc' 'e f' 'e s' 'e t' 'eco' 'ent' 'fir' 'he ' 'hir' 'his'
 'ird' 'irs' 'is ' 'men' 'nd ' 'nt ' 'nt?' 'ocu' 'ond' 'one' 'rd ' 'rst'
 's d' 's i' 's t' 'sec' 'st ' 't d' 't i' 'the' 'thi' 'ume']


In [8]:
# Dense view of the TF-IDF matrix for inspection; only practical because the
# toy corpus is tiny.
X.toarray()

array([[0.18984129, 0.23449176, 0.18984129, 0.        , 0.        ,
        0.15520769, 0.        , 0.        , 0.23449176, 0.        ,
        0.18984129, 0.        , 0.        , 0.        , 0.18984129,
        0.23449176, 0.        , 0.        , 0.        , 0.18984129,
        0.23449176, 0.15520769, 0.        , 0.15520769, 0.        ,
        0.23449176, 0.31041538, 0.18984129, 0.        , 0.        ,
        0.        , 0.18984129, 0.        , 0.        , 0.        ,
        0.23449176, 0.        , 0.23449176, 0.15520769, 0.        ,
        0.23449176, 0.23449176, 0.        , 0.15520769, 0.        ,
        0.18984129],
       [0.25275498, 0.        , 0.12637749, 0.        , 0.19799476,
        0.10332188, 0.        , 0.        , 0.15610134, 0.19799476,
        0.25275498, 0.19799476, 0.        , 0.        , 0.25275498,
        0.        , 0.19799476, 0.        , 0.19799476, 0.25275498,
        0.        , 0.10332188, 0.        , 0.10332188, 0.        ,
        0.        , 0.20664

### 2.3 Sparse cosine similarity (CSR)

In [9]:
from sparse_dot_topn import awesome_cossim_topn
import numpy as np

# Multiply the TF-IDF matrix by its own transpose, keeping for each row at
# most `ntop` scores that exceed `lower_bound`.
matches = awesome_cossim_topn(
    X,
    X.T,
    ntop=10,
    lower_bound=0.6,
    use_threads=True,
    n_jobs=4,
)
matches.toarray()

array([[1.        , 0.        , 0.        , 0.79353604],
       [0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        ],
       [0.79353604, 0.        , 0.        , 1.        ]])

In [10]:
# Read (row, col, score) triplets from the COO view so indices and scores are
# aligned by construction. Pairing `matches.nonzero()` positions with
# `matches.data[i]` by index goes out of step if the matrix stores explicit
# zeros, because `nonzero()` filters them out while `.data` keeps them.
pairs = matches.tocoo()
keep = pairs.data != 0  # same filter that `nonzero()` applies

sparserows = pairs.row[keep]
sparsecols = pairs.col[keep]
nr_matches = sparsecols.size

# Look up the documents with array indexing instead of filling pre-allocated
# arrays one element at a time in a Python loop.
documents = np.asarray(corpus, dtype=object)
left_side = documents[sparserows]
right_side = documents[sparsecols]
similarity = pairs.data[keep]

pd.DataFrame({'base': left_side, 'source': right_side, 'similarity': similarity})

Unnamed: 0,base,source,similarity
0,This is the first document.,This is the first document.,1.0
1,This is the first document.,Is this the first document?,0.793536
2,This document is the second document.,This document is the second document.,1.0
3,And this is the third one.,And this is the third one.,1.0
4,Is this the first document?,Is this the first document?,1.0
5,Is this the first document?,This is the first document.,0.793536
