In [1]:
%load_ext cython

In [2]:
import py_entitymatching as em

In [3]:
# Load the 'person_table_A' and 'person_table_B' datasets through py_entitymatching.
# NOTE(review): A and B are not referenced by any later cell visible here — confirm they are still needed.
A = em.load_dataset('person_table_A')
B = em.load_dataset('person_table_B')

def get_str_cols(dataframe):
    """Return the labels of the columns of ``dataframe`` whose dtype is 'object'."""
    is_object_column = dataframe.dtypes == 'object'
    return dataframe.columns[is_object_column]

In [4]:
import os
def _get_stop_words():
    """Load the stop-word file found under the py_entitymatching install path.

    Reads ``utils/stop_words.txt`` below ``em.get_install_path()``, one entry
    per line.

    Returns:
        set: the stop words with surrounding whitespace stripped. The file is
        opened in binary mode, so the entries are byte strings (plain ``str``
        on Python 2). Blank lines are skipped.

    Raises:
        IOError / OSError: if the stop-word file cannot be opened.
    """
    stop_words_path = os.path.join(em.get_install_path(), 'utils',
                                   'stop_words.txt')
    stop_words_set = set()
    with open(stop_words_path, "rb") as stop_words_handle:
        for line in stop_words_handle:
            word = line.strip()
            # A blank line would otherwise add an empty string to the set.
            if word:
                stop_words_set.add(word)

    return stop_words_set

In [23]:
# Display the stop-word list. `stopwords` is assigned by a cell that appears
# further down in this notebook, so this cell fails on a fresh top-to-bottom run.
stopwords

['all',
 'six',
 'less',
 'being',
 'indeed',
 'over',
 'move',
 'anyway',
 'four',
 'not',
 'own',
 'through',
 'yourselves',
 'fify',
 'where',
 'mill',
 'only',
 'find',
 'before',
 'one',
 'whose',
 'system',
 'how',
 'somewhere',
 'with',
 'show',
 'had',
 'enough',
 'should',
 'to',
 'must',
 'whom',
 'seeming',
 'whole',
 'under',
 'ours',
 'has',
 'might',
 'thereafter',
 'latterly',
 'do',
 'them',
 'his',
 'around',
 'than',
 'get',
 'very',
 'de',
 'none',
 'cannot',
 'every',
 'whether',
 'they',
 'front',
 'during',
 'thus',
 'now',
 'him',
 'nor',
 'name',
 'several',
 'hereafter',
 'always',
 'who',
 'cry',
 'whither',
 'this',
 'someone',
 'either',
 'each',
 'become',
 'thereupon',
 'sometime',
 'side',
 'two',
 'therein',
 'twelve',
 'because',
 'often',
 'ten',
 'our',
 'eg',
 'some',
 'back',
 'thickv',
 'go',
 'namely',
 'towards',
 'are',
 'further',
 'beyond',
 'ourselves',
 'yet',
 'out',
 'even',
 'will',
 'what',
 'still',
 'for',
 'bottom',
 'mine',
 'since',

In [42]:
%%cython
#distutils: language=c++
from libcpp.vector cimport vector
from libcpp.string cimport string
from libcpp.map cimport map as omap
from libcpp.set cimport set as oset
from libcpp.algorithm cimport sort
from libcpp cimport bool
from libcpp.pair cimport pair
cdef extern from "string.h":
    char *strtok_r (char *inp_str, const char *delimiters, char **)
########################################
cdef class StringContainer:
    """Growable list of byte strings backed by a C++ ``vector[string]``."""
    cdef vector[string] sc

    cdef int csize(self):
        # C-level element count.
        return self.sc.size()

    def size(self):
        """Return the number of stored strings."""
        return self.sc.size()

    def push_back(self, s):
        """Append ``s``; text strings are UTF-8 encoded before being stored."""
        if not isinstance(s, bytes):
            s = s.encode('utf-8')
        self.sc.push_back(s)

    def get(self, int i):
        """Return the string at position ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than ``size()``.
        """
        # vector.operator[] is unchecked, so validate the index explicitly.
        if i < 0 or i >= self.csize():
            raise IndexError("StringContainer index out of range")
        return self.sc[i]
########################################
cdef class TokenContainer:
    """Per-row token lists built from the strings of a StringContainer.

    After ``tokenize()``, entry ``i`` holds the distinct tokens of input
    string ``i`` (split on space, tab and newline), in sorted order, with the
    stop words removed.
    """
    cdef vector[vector[string]] tc

    cdef int csize(self):
        # C-level number of token lists.
        return self.tc.size()

    cdef void cinit(self, n):
        # Append ``n`` empty token lists.
        cdef int i
        for i in range(n):
            self.tc.push_back(vector[string]())

    cdef void cpush_back(self, vector[string] tokens):
        # Append one token list.
        self.tc.push_back(tokens)

    cdef vector[string] cremove_stopwords(self, vector[string]& svec,
                                          omap[string, int]& stopwords):
        # Return the tokens of ``svec`` that are not keys of ``stopwords``.
        cdef vector[string] ovec
        cdef string token
        for token in svec:
            if stopwords.find(token) == stopwords.end():
                ovec.push_back(token)
        return ovec

    cdef vector[string] ctokenize_wd(self, const string& inp):
        # strtok_r overwrites delimiters with NUL bytes, so split a private
        # copy instead of writing through the caller's (const) buffer.
        cdef string buf = string(<char*> inp.c_str(), inp.size())
        cdef char* ptr1 = NULL
        cdef char* pch = strtok_r(<char*> buf.c_str(), " \t\n", &ptr1)
        # An ordered set both de-duplicates and sorts the tokens.
        cdef oset[string] tokens
        cdef vector[string] out
        cdef string s
        while pch != NULL:
            tokens.insert(string(pch))
            pch = strtok_r(NULL, " \t\n", &ptr1)
        for s in tokens:
            out.push_back(s)
        return out

    cdef void ctokenize(self, vector[string]& svec, omap[string, int]& stopwords):
        cdef int n = svec.size()
        cdef int i
        cdef vector[string] tokens

        # Drop rows from any earlier call so that entry ``i`` always
        # corresponds to ``svec[i]``.
        self.tc.clear()
        self.cinit(n)

        for i in range(n):
            tokens = self.ctokenize_wd(svec[i])
            self.tc[i] = self.cremove_stopwords(tokens, stopwords)

    def tokenize(self, StringContainer objsc, stopwords):
        """Tokenize every string of ``objsc`` and store the result in this container.

        Args:
            objsc: container holding one string per row.
            stopwords: iterable of tokens to drop (byte strings or text, text
                is UTF-8 encoded); ``None`` or an empty iterable removes nothing.
        """
        cdef omap[string, int] smap
        if stopwords is not None:
            for s in stopwords:
                # Tokens are stored as bytes, so text stop words are encoded
                # before they are used as lookup keys.
                if not isinstance(s, bytes):
                    s = s.encode('utf-8')
                smap[s] = 0
        self.ctokenize(objsc.sc, smap)

    def get(self, int i):
        """Return the token list of row ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than ``size()``.
        """
        # vector.operator[] is unchecked, so validate the index explicitly.
        if i < 0 or i >= self.csize():
            raise IndexError("TokenContainer index out of range")
        return self.tc[i]

    def size(self):
        """Return the number of token lists."""
        return self.csize()
 ########################################   

cdef class InvertedIndex:
    """Maps each token to the list of row positions whose token list contains it."""
    cdef omap[string, vector[int]] index

    cdef vector[int] cvalues(self, string token):
        # Return a copy of the postings of ``token``; empty when the token is
        # unknown. The find() guard keeps operator[] from inserting new keys.
        cdef vector[int] empty
        if self.index.find(token) == self.index.end():
            return empty
        return self.index[token]

    cdef void cbuild_inv_index(self, vector[vector[string]]& token_vector):
        # Append row position ``i`` to the postings of every token of row ``i``.
        cdef int n = token_vector.size()
        cdef int i, j, m

        for i in range(n):
            # Index the rows in place rather than copying each token list.
            m = token_vector[i].size()
            for j in range(m):
                self.index[token_vector[i][j]].push_back(i)

    def build_inv_index(self, TokenContainer objtc):
        """Add every row of ``objtc`` to the index."""
        self.cbuild_inv_index(objtc.tc)

    def values(self, token):
        """Return the row positions indexed under ``token`` (empty list if none).

        ``token`` may be a byte string or text; text is UTF-8 encoded first,
        because the index keys are byte strings.
        """
        if not isinstance(token, bytes):
            token = token.encode('utf-8')
        return self.cvalues(token)
    
    
 ########################################  
# Comparator passed to sort(): orders pairs by their second member, largest first.
cdef bool comp(const pair[int, int] l, const pair[int, int] r):
    return l.second > r.second 
cdef class Prober:
    """Probes an inverted index with token lists and records the matching rows.

    ``llocs`` collects positions of indexed rows selected as candidates and
    ``rlocs`` collects positions of probing rows that had at least one
    candidate. Both lists are appended to on every ``probe()`` call.
    """
    cdef vector[int] llocs
    cdef vector[int] rlocs

    cdef int clsize(self):
        return self.llocs.size()

    cdef int crsize(self):
        return self.rlocs.size()

    cdef vector[int] cget_llocs(self):
        # Sorts in place before returning a copy.
        sort(self.llocs.begin(), self.llocs.end())
        return self.llocs

    cdef vector[int] cget_rlocs(self):
        return self.rlocs

    cdef inline vector[int] cvalues(self, omap[string, vector[int]]& index, string token):
        # Copy of the postings of ``token``; empty when the token is not indexed.
        cdef vector[int] empty
        if index.find(token) != index.end():
            return index[token]
        return empty

    cdef void cprobe(self, vector[vector[string]]& token_vector,
                     vector[int] ids,
                     omap[string, vector[int]]& index, int yparam):
        # For each probing row: count, per indexed row, how many tokens the two
        # rows share, then keep the ``yparam`` rows with the largest counts.
        # NOTE(review): ``ids`` is accepted but not used; results are recorded
        # as positions within ``token_vector`` — confirm that is intended.
        cdef int n = token_vector.size()
        cdef int i, j, k, m
        # Typed so the innermost loop stays in C instead of boxing every
        # posting into a Python int.
        cdef int cand
        cdef vector[string] tokens
        cdef oset[int] lset, rset
        cdef vector[int] candidates
        cdef omap[int, int] cand_overlap
        cdef pair[int, int] entry
        cdef vector[pair[int, int]] ranked

        for i in range(n):
            tokens = token_vector[i]
            m = tokens.size()
            for j in range(m):
                candidates = self.cvalues(index, tokens[j])
                for cand in candidates:
                    cand_overlap[cand] += 1
            if cand_overlap.size():
                rset.insert(i)
            for entry in cand_overlap:
                ranked.push_back(entry)
            sort(ranked.begin(), ranked.end(), comp)
            k = 0
            for entry in ranked:
                lset.insert(entry.first)
                k += 1
                # With yparam <= 0 this never triggers, so every candidate is kept.
                if k == yparam:
                    break
            cand_overlap.clear()
            ranked.clear()
        for i in lset:
            self.llocs.push_back(i)
        for i in rset:
            self.rlocs.push_back(i)

    def probe(self, TokenContainer objtc, ids, InvertedIndex index, yparam):
        """Probe ``index`` with every row of ``objtc``.

        Args:
            objtc: token lists of the probing rows.
            ids: iterable of ints, one per probing row (currently unused by
                the probing loop).
            index: inverted index built over the rows being searched.
            yparam: number of top-overlap candidates kept per probing row.
        """
        self.cprobe(objtc.tc, ids, index.index, yparam)

    def get_lids(self):
        """Return the sorted positions of indexed rows selected as candidates."""
        return self.cget_llocs()

    def get_rids(self):
        """Return the positions of probing rows that had at least one candidate."""
        return self.cget_rlocs()
        
                
                
                    
            
            
        
    
    

In [43]:
import string
# def preprocess_table(dataframe):
#     str_cols = get_str_cols(dataframe)
#     projected_df = dataframe[str_cols]
#     concat_strings = []

#     str_container_obj = StringContainer()
#     for row in projected_df.itertuples(name=None):
#         idx = row[0]
#         joined_row = ' '.join(row[1:])
#         joined_row = joined_row.translate(None, string.punctuation)
#         concat_strings.append(joined_row.lower())
#         str_container_obj.push_back(str2bytes(joined_row.lower()))
        
#     return str_container_obj

def _strip_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Accepts byte strings (including Python 2 ``str``) as well as text strings;
    the two kinds need different ``translate`` call forms.
    """
    if isinstance(text, bytes):
        return text.translate(None, string.punctuation.encode('ascii'))
    return text.translate({ord(ch): None for ch in string.punctuation})


def preprocess_table(dataframe):
    """Concatenate the object-dtype columns of each row into one normalized string.

    For every row, the non-null values of the object-dtype columns are
    stripped, joined with single spaces, cleared of punctuation and
    lower-cased. Text results are UTF-8 encoded before being stored.

    Args:
        dataframe: pandas DataFrame to preprocess.

    Returns:
        StringContainer: one byte string per row, in row order.
    """
    proj_df = dataframe[get_str_cols(dataframe)]
    str_container = StringContainer()
    for column_values in proj_df.itertuples(index=False, name=None):
        strs = [column_value.strip() for column_value in column_values
                if not pd.isnull(column_value)]
        joined_row = _strip_punctuation(' '.join(strs)).lower()
        if not isinstance(joined_row, bytes):
            joined_row = joined_row.encode('utf-8')
        str_container.push_back(joined_row)
    return str_container


def tokenize_strings(concat_strings, stopwords):
    """Tokenize every string in ``concat_strings`` and drop the stop words.

    Args:
        concat_strings: StringContainer with one string per row.
        stopwords: iterable of tokens to remove.

    Returns:
        TokenContainer: one token list per input string.
    """
    tok_container_obj = TokenContainer()
    tok_container_obj.tokenize(concat_strings, stopwords)
    return tok_container_obj

def build_inv_index(tokens):
    """Create an InvertedIndex, populate it from ``tokens`` and return it."""
    index = InvertedIndex()
    index.build_inv_index(tokens)
    return index

def probe(tokens, n, invindex, y):
    """Run a new Prober over ``tokens`` against ``invindex`` and return it.

    ``range(n)`` is passed as the row ids and ``y`` as the per-row candidate limit.
    """
    prober = Prober()
    row_ids = range(n)
    prober.probe(tokens, row_ids, invindex, y)
    return prober
        

In [44]:
import pandas as pd

# Read the two tables from CSV files located in the current working directory.
C = pd.read_csv('songs.csv')
D = pd.read_csv('tracks.csv')

In [45]:
# Row counts of the two tables.
len(C), len(D)

(961593, 734485)

In [51]:
# Start from the stop words returned by _get_stop_words() and add extra tokens
# that tokenize_strings() should also ignore.
stopwords = list(_get_stop_words())
stopwords.extend([
    'the', 'my', 'i', 'andre', 'from', 'a', 'of', 'version', 'love',
    'live', 'la', 'mix', 'album', 'dont',
])

In [52]:

%time lconcat_strings = preprocess_table(C)
%time D1 = D.sample(10000, replace=False)
%time rconcat_strings = preprocess_table(D1)
# %time stopwords=['san', 'st', 'francisco']
%time ltokens = tokenize_strings(lconcat_strings, stopwords)
%time rtokens = tokenize_strings(rconcat_strings, stopwords)
%time inv_index = build_inv_index(ltokens)




CPU times: user 4.24 s, sys: 299 ms, total: 4.53 s
Wall time: 4.58 s
CPU times: user 46.2 ms, sys: 20.2 ms, total: 66.4 ms
Wall time: 66.8 ms
CPU times: user 72 ms, sys: 4.12 ms, total: 76.2 ms
Wall time: 73.7 ms
CPU times: user 5.87 s, sys: 110 ms, total: 5.98 s
Wall time: 6.01 s
CPU times: user 122 ms, sys: 878 µs, total: 123 ms
Wall time: 124 ms
CPU times: user 6.02 s, sys: 121 ms, total: 6.14 s
Wall time: 6.34 s


In [34]:
import py_entitymatching as em

In [53]:
# Probe the index with every sampled row; y=1 records one top-overlap candidate per row.
%time probe_res = probe(rtokens, rtokens.size(), inv_index, 1)

CPU times: user 41.8 s, sys: 658 ms, total: 42.4 s
Wall time: 45.7 s


In [49]:
# Positions (within the probed token container) of rows that had at least one candidate.
rids = (probe_res.get_rids())

In [50]:
# Preview the first few sampled rows at those positions.
D1.iloc[rids].head()

Unnamed: 0,id,title,year,episode,song,artists
222149,222149,SpongeBob SquarePants,1999.0,Jellyfishing/Plankton! (#1.3),House of Horror,w. merrick farran
245180,245180,The Fosters,2013.0,Vigil (#1.9),On the Other Side,phillip larue
325999,325999,Til There Was You,1997.0,,One Big Happy Family,winnie holzman+david evans+brock walsh+the walsh family singers
619867,619867,Smothered,2002.0,,Complete,kathryn sutherland+push3
357595,357595,Backroads to Vegas,1996.0,,Love Notes Intro,kirsten vogelsang+tony cultreri+mona gable


In [73]:
# Spot-check the pipeline. Only the value of the last expression is displayed;
# the first two results are computed and discarded.
lconcat_strings.get(0)
ltokens.get(0)
# Row positions indexed under the token 'francisco'.
inv_index.values('francisco')

[0, 1, 2, 3, 4]