In [1]:
%load_ext cython

In [50]:
import py_entitymatching as em

In [51]:
# Load the 'person_table_A' and 'person_table_B' datasets through
# py_entitymatching's load_dataset helper.
# NOTE(review): A and B are not referenced by any later cell visible in this
# notebook -- confirm whether this load is still needed.
A = em.load_dataset('person_table_A')
B = em.load_dataset('person_table_B')

def get_str_cols(dataframe):
    """Return the labels of the columns of ``dataframe`` whose dtype is ``object``.

    The result is the pandas ``Index`` obtained by applying a boolean mask
    (one flag per column) to ``dataframe.columns``.
    """
    is_object_dtype = dataframe.dtypes == 'object'
    return dataframe.columns[is_object_dtype]

In [116]:
%%cython
#distutils: language=c++
from libcpp.vector cimport vector
from libcpp.string cimport string
from libcpp.map cimport map as omap
from libcpp.set cimport set as oset
from libcpp.algorithm cimport sort
from libcpp cimport bool
from libcpp.pair cimport pair
cdef extern from "string.h":
    char *strtok_r (char *inp_str, const char *delimiters, char **)
########################################
cdef class StringContainer:
    """Python-visible wrapper around a C++ ``vector[string]``.

    Strings are stored as raw bytes.  ``push_back`` accepts byte strings as
    before and additionally encodes text (unicode) strings as UTF-8.
    ``get`` hands the stored value back through Cython's std::string
    coercion (a ``bytes`` object under the default directives).
    """
    cdef vector[string] sc

    cdef int csize(self):
        # C-level accessor: number of stored strings.
        return self.sc.size()

    def size(self):
        """Return the number of stored strings."""
        return self.sc.size()

    def push_back(self, s):
        """Append ``s`` to the container.

        Text strings are encoded as UTF-8 first; anything else is passed to
        Cython's std::string coercion unchanged.
        """
        if isinstance(s, unicode):
            s = s.encode('utf-8')
        self.sc.push_back(s)

    def get(self, int i):
        """Return the string stored at position ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than ``size()``.
        """
        # vector::operator[] performs no bounds checking, so an out-of-range
        # index would be undefined behaviour instead of a Python error.
        if i < 0 or <size_t> i >= self.sc.size():
            raise IndexError("StringContainer index out of range")
        return self.sc[i]
########################################
cdef class TokenContainer:
    """Holds one token list per input string in a C++ ``vector[vector[string]]``.

    ``tokenize`` fills the container from a ``StringContainer``: every string
    is split on space, tab and newline, duplicate tokens are dropped, and
    tokens found in the stopword collection are removed.
    """
    cdef vector[vector[string]] tc

    cdef int csize(self):
        # C-level accessor: number of token lists currently stored.
        return self.tc.size()

    cdef void cinit(self, n):
        # Append ``n`` empty token lists to ``tc``.
        cdef int i
        for i in xrange(n):
            self.tc.push_back(vector[string]())

    cdef void cpush_back(self, vector[string] tokens):
        # Append one token list.
        self.tc.push_back(tokens)

    cdef vector[string] cremove_stopwords(self,  vector[string]& svec, \
                                           omap[string, int]& stopwords):
        # Return the tokens of ``svec`` that are not keys of ``stopwords``,
        # preserving their order.  The map is used purely as a set; its
        # values are ignored.
        cdef vector[string] ovec
        cdef string token
        for token in svec:
            if (stopwords.find(token) == stopwords.end()):
                ovec.push_back(token)
        return ovec

    cdef vector[string] ctokenize_wd(self, const string& inp):
        # Split ``inp`` on space/tab/newline and return the distinct tokens.
        # Tokens are collected in a std::set, so the result is de-duplicated
        # and comes back in the set's sorted order.
        #
        # strtok_r overwrites delimiters with NULs in the buffer it scans, so
        # it must run on a private, writable copy rather than on the caller's
        # const string.
        cdef string buf = inp
        cdef char* saveptr = NULL
        cdef char* pch
        cdef oset[string] tokens
        cdef vector[string] out
        cdef string s
        if buf.empty():
            return out
        pch = strtok_r(&buf[0], " \t\n", &saveptr)
        while pch != NULL:
            tokens.insert(string(pch))
            pch = strtok_r(NULL, " \t\n", &saveptr)
        for s in tokens:
            out.push_back(s)
        return out

    cdef void ctokenize(self, vector[string]& svec,  omap[string, int]& stopwords):
        # Tokenize every string of ``svec`` and store the stopword-filtered
        # token list at the same position in ``tc``.
        cdef int n = svec.size()
        cdef int i
        cdef vector[string] tokens

        # cinit() appends, so start from an empty container; otherwise a
        # second call would leave stale token lists behind the new ones.
        self.tc.clear()
        self.cinit(n)

        for i in xrange(n):
            tokens = self.ctokenize_wd(svec[i])
            self.tc[i] = self.cremove_stopwords(tokens, stopwords)

    def tokenize(self, StringContainer objsc, stopwords):
        """Tokenize every string in ``objsc`` and store the results.

        Args:
            objsc: container holding the strings to tokenize.
            stopwords: iterable of tokens to drop.  Text strings are encoded
                as UTF-8 so that they compare against the stored byte
                strings; ``None`` or an empty iterable disables filtering.
        """
        cdef omap[string, int] smap
        if stopwords is not None:
            for s in stopwords:
                if isinstance(s, unicode):
                    s = s.encode('utf-8')
                smap[s] = 0
        self.ctokenize(objsc.sc, smap)

    def get(self, int i):
        """Return the token list stored at position ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than ``size()``.
        """
        # vector::operator[] is unchecked; fail loudly instead.
        if i < 0 or <size_t> i >= self.tc.size():
            raise IndexError("TokenContainer index out of range")
        return self.tc[i]

    def size(self):
        """Return the number of token lists stored."""
        return self.csize()
 ########################################   

cdef class InvertedIndex:
    """Maps each token to the positions of the token lists that contain it.

    The index is a C++ ``map[string, vector[int]]``; positions are the
    indices of the token lists inside the ``TokenContainer`` it was built
    from.
    """
    cdef omap[string, vector[int]] index

    cdef vector[int] cvalues(self, string token):
        # Return the posting list of ``token``, or an empty vector when the
        # token is unknown.  The find() guard matters: operator[] alone would
        # insert an empty entry for every token that is looked up.
        cdef vector[int] empty
        if self.index.find(token) == self.index.end():
            return empty
        return self.index[token]

    cdef void cbuild_inv_index(self, vector[vector[string]]& token_vector):
        # Append position ``i`` to the posting list of every token in
        # ``token_vector[i]``.  Existing postings are kept, so calling this
        # more than once accumulates entries.
        cdef int n = token_vector.size()
        cdef int i, j, m

        for i in xrange(n):
            # Index the nested vector in place rather than copying each
            # token list into a temporary.
            m = token_vector[i].size()
            for j in xrange(m):
                self.index[token_vector[i][j]].push_back(i)

    def build_inv_index(self, TokenContainer objtc):
        """Add the token lists of ``objtc`` to the index."""
        self.cbuild_inv_index(objtc.tc)

    def values(self, token):
        """Return the list of positions recorded for ``token``.

        Text tokens are encoded as UTF-8 before the lookup so they can be
        compared with the stored byte strings.  Unknown tokens yield an empty
        list.
        """
        if isinstance(token, unicode):
            token = token.encode('utf-8')
        return self.cvalues(token)
    
    
 ########################################  
cdef bool comp(const pair[int, int] l, const pair[int, int] r):
    # Ordering predicate for std::sort: the pair with the larger ``second``
    # comes first.  Equal ``second`` values are ordered by ascending
    # ``first`` so that the result does not depend on how the (unstable)
    # std::sort happens to arrange ties.
    if l.second != r.second:
        return l.second > r.second
    return l.first < r.first
cdef class Prober:
    """Probes an inverted index with token lists and records which sides matched.

    After ``probe``:
      * ``rlocs`` holds the positions (into the probed token vector) of the
        token lists that share at least one token with the index;
      * ``llocs`` holds the index-side ids selected for those token lists
        (the ``yparam`` ids with the highest token overlap for each one).

    Both vectors are appended to on every ``probe`` call.
    """
    cdef vector[int] llocs
    cdef vector[int] rlocs

    cdef int clsize(self):
        # Number of index-side ids collected so far.
        return self.llocs.size()

    cdef int crsize(self):
        # Number of probe-side positions collected so far.
        return self.rlocs.size()

    cdef vector[int] cget_llocs(self):
        # Sorts the stored ids in place, then returns a copy.
        sort(self.llocs.begin(), self.llocs.end())
        return self.llocs

    cdef vector[int] cget_rlocs(self):
        # Returns a copy of the stored positions in insertion order.
        return self.rlocs

    cdef inline vector[int] cvalues(self, omap[string, vector[int]]& index, string token):
        # Posting list of ``token`` in ``index``, or an empty vector when the
        # token is absent.  find() is used first so that the lookup never
        # inserts a new key into the caller's index.
        cdef vector[int] empty
        if index.find(token) == index.end():
            return empty
        return index[token]

    cdef void cprobe(self, vector[vector[string]]& token_vector, \
                     vector[int] ids,
                     omap[string, vector[int]]& index, int yparam):
        # For every token list: count, per index-side id, how many of its
        # tokens hit that id; rank the ids with ``comp``; keep the first
        # ``yparam`` of them.  When ``yparam`` is <= 0 the counter never
        # equals it, so every candidate is kept.
        #
        # ``ids`` is accepted for interface compatibility but is not
        # consulted: results are recorded as positions into ``token_vector``.
        # (Previously ids[i] was read into an unused local, which indexed
        # out of bounds whenever ``ids`` was shorter than ``token_vector``.)
        cdef int n = token_vector.size()
        cdef int i, j, m, k
        # Typed so the inner loop stays in C instead of boxing every
        # candidate id into a Python object.
        cdef int cand
        cdef oset[int] lset, rset
        cdef vector[int] candidates
        cdef omap[int, int] cand_overlap
        cdef pair[int, int] entry
        cdef vector[pair[int, int]] ranked

        for i in xrange(n):
            # Index the nested vector in place; no per-record copy needed.
            m = token_vector[i].size()
            for j in xrange(m):
                candidates = self.cvalues(index, token_vector[i][j])
                for cand in candidates:
                    cand_overlap[cand] += 1
            if cand_overlap.size():
                rset.insert(i)
            for entry in cand_overlap:
                ranked.push_back(entry)
            sort(ranked.begin(), ranked.end(), comp)
            k = 0
            for entry in ranked:
                lset.insert(entry.first)
                k += 1
                if k == yparam:
                    break
            # Reset the per-record scratch state.
            cand_overlap.clear()
            ranked.clear()
        # The sets de-duplicate within this call and iterate in ascending order.
        for i in lset:
            self.llocs.push_back(i)
        for i in rset:
            self.rlocs.push_back(i)

    def probe(self, TokenContainer objtc, ids, InvertedIndex index, yparam):
        """Probe ``index`` with every token list of ``objtc``.

        Args:
            objtc: token lists to probe with.
            ids: iterable of ints; converted to a C++ vector but currently
                not used by the probing logic.
            index: inverted index to look tokens up in.
            yparam: number of best-overlapping index ids kept per token list.
        """
        self.cprobe(objtc.tc, ids, index.index, yparam)

    def get_lids(self):
        """Return the collected index-side ids, sorted ascending."""
        return self.cget_llocs()

    def get_rids(self):
        """Return the collected probe-side positions."""
        return self.cget_rlocs()
        
                
                
                    
            
            
        
    
    

In [111]:
import string
# def preprocess_table(dataframe):
#     str_cols = get_str_cols(dataframe)
#     projected_df = dataframe[str_cols]
#     concat_strings = []

#     str_container_obj = StringContainer()
#     for row in projected_df.itertuples(name=None):
#         idx = row[0]
#         joined_row = ' '.join(row[1:])
#         joined_row = joined_row.translate(None, string.punctuation)
#         concat_strings.append(joined_row.lower())
#         str_container_obj.push_back(str2bytes(joined_row.lower()))
        
#     return str_container_obj

def preprocess_table(dataframe):
    """Concatenate and normalize the object-dtype columns of each row.

    For every row, the non-null values of the object-dtype columns are
    stripped, joined with single spaces, cleared of ASCII punctuation and
    lower-cased.  The result is stored as a UTF-8 byte string.

    Args:
        dataframe: pandas DataFrame to preprocess.

    Returns:
        StringContainer with one normalized string per row, in row order.
    """
    str_cols = get_str_cols(dataframe)
    proj_df = dataframe[str_cols]
    # Built once; membership tests against a frozenset are O(1).
    punctuation = frozenset(string.punctuation)
    str_container = StringContainer()
    for row in proj_df.itertuples(name=None):
        # row[0] is the index label; the column values follow it.
        strs = [value.strip() for value in row[1:] if not pd.isnull(value)]
        joined_row = ' '.join(strs)
        # The two-argument form ``translate(None, chars)`` exists only on
        # Python 2 byte strings; filtering characters works for byte and
        # text strings on both Python 2 and Python 3.
        joined_row = ''.join(ch for ch in joined_row if ch not in punctuation)
        normalized = joined_row.lower()
        if not isinstance(normalized, bytes):
            normalized = normalized.encode('utf-8')
        str_container.push_back(normalized)
    return str_container


def tokenize_strings(concat_strings, stopwords):
    """Tokenize the strings of a ``StringContainer``.

    Args:
        concat_strings: StringContainer holding the strings to tokenize.
        stopwords: collection of tokens to drop from every token list.

    Returns:
        TokenContainer with one token list per input string.
    """
    # The former ``n = concat_strings.size()`` was never used and is dropped.
    tok_container_obj = TokenContainer()
    tok_container_obj.tokenize(concat_strings, stopwords)
    return tok_container_obj

def build_inv_index(tokens):
    """Create an ``InvertedIndex``, populate it from ``tokens`` and return it.

    ``tokens`` is passed straight to ``InvertedIndex.build_inv_index``.
    """
    index = InvertedIndex()
    index.build_inv_index(tokens)
    return index

def probe(tokens, n, invindex, y):
    """Run a fresh ``Prober`` over ``tokens`` against ``invindex``.

    ``range(n)`` is handed to the prober as the id sequence and ``y`` as its
    last argument.  The prober itself is returned so the caller can read the
    results from it.
    """
    prober = Prober()
    record_ids = range(n)
    prober.probe(tokens, record_ids, invindex, y)
    return prober
        

In [105]:
import pandas as pd

# Read the two CSV tables used by the cells below: C is preprocessed and
# indexed, D is sampled and probed against that index.
C = pd.read_csv('songs_sample.csv')
D = pd.read_csv('tracks_sample.csv')

In [106]:
# Row counts of the two tables.
len(C), len(D)

(10000, 10000)

In [117]:
# Left side: normalize every row of C into one string.
lconcat_strings = preprocess_table(C)
# Right side: probe with a 1000-row sample of D.  The seed is fixed so that
# the sample -- and therefore the results in the cells below -- is the same
# on every run.
D1 = D.sample(1000, replace=False, random_state=0)
rconcat_strings = preprocess_table(D1)
stopwords=['san', 'st', 'francisco']
ltokens = tokenize_strings(lconcat_strings, stopwords)
rtokens = tokenize_strings(rconcat_strings, stopwords)
# Index the left-side tokens, then keep the single best-overlapping left row
# (y=1) for each right-side row.
inv_index = build_inv_index(ltokens)

probe_res = probe(rtokens, rtokens.size(), inv_index, 1)


In [120]:
# Positions (into the probed token lists) that had at least one candidate.
rids = (probe_res.get_rids())

In [121]:
# Show the rows of D1 at those integer positions.
D1.iloc[rids]

Unnamed: 0,id,title,year,episode,song,artists
8101,59408,Dancing with the Stars,2005.0,Week 5: The Results (#16.10),Better Dig Two,the band perry+brandy clark+shane mcanally+trevor rosen
5474,471564,Hokuto no Ken,1986.0,,Purple Eyes,tsuyoshi ujiki+kodomo band
5577,216821,So You Think You Can Dance,2005.0,The Top Eight Perform (#6.21),People Are Strange,the doors
2631,463920,Happy Land,1943.0,,How Ya Gonna Keep em Down on the Farm (After Theyve Seen Paree?),walter donaldson
6326,191790,Rage,1987.0,Zakk Wylde Guest Programs Rage,Hells Bells,angus young+malcolm young+brian johnson+ac/dc
79,567768,Os Famosos e os Duendes da Morte,2009.0,,Pigeon Suicide Squad,nelo johann
2167,93777,Full House,1987.0,Sea Cruise (#1.5),The Love Boat,bob saget+dave coulier
9939,55238,Daily Ukulele,2013.0,Otherside - Red Hot Chili Peppers (#1.74),Otherside,flea+john frusciante+anthony kiedis+chad smith+olivia vessel
3888,375314,Bra Boys,2007.0,,In the Shadow of Long Bay II,jamie holt
7401,330426,6 Month Rule,2011.0,,Land of Feeling,luke temple breneman+here we go magic


In [73]:
# Spot-check: first normalized string, its token list, and the posting list
# recorded for the token 'francisco'.  Only the value of the last expression
# is displayed by the notebook.
# NOTE(review): 'francisco' is in the stopword list used when the index is
# built in the pipeline cell, and this cell's execution count is lower than
# that cell's, so the output shown here presumably comes from an earlier run
# with different inputs -- confirm and re-run.
lconcat_strings.get(0)
ltokens.get(0)
inv_index.values('francisco')

[0, 1, 2, 3, 4]