In [1]:
%load_ext cython

In [2]:
import guppy

In [3]:
from guppy import hpy
h = hpy()
h.setref()

In [4]:
import py_entitymatching as em
import cython

In [5]:
# Load the two person tables that are matched against each other further down:
# A is the table that gets indexed, B is the table whose rows probe that index.
A = em.load_dataset('person_table_A')
B = em.load_dataset('person_table_B')

In [6]:
# Preprocess the tables:
# 1. Concatenate the string columns of each row, remove punctuation, and convert to lower case.

In [7]:
def get_str_cols(dataframe):
    """Return the labels of the columns of ``dataframe`` whose dtype is ``object``."""
    # Boolean mask over the columns, aligned with ``dataframe.columns``.
    is_object_col = dataframe.dtypes == 'object'
    return dataframe.columns[is_object_col]

In [67]:
%%cython
#distutils: language=c++
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from libcpp cimport bool
from libcpp.set cimport set as oset
from libcpp.map cimport map as omap
from cython.parallel cimport prange
from libcpp.algorithm cimport sort
from cython.operator cimport dereference, preincrement
import cython
import cloudpickle

ctypedef cython.int int


cdef class StringContainer:
    """Growable list of ``(record id, byte string)`` pairs stored in a C++ vector.

    The ``cdef`` methods (leading underscore) are the C-level API used by the
    other extension types in this cell; the ``def`` methods expose the same
    operations to Python. Strings must be passed as ``bytes``.
    """

    cdef vector[pair[int,string]] container
    # Cached result of _compute_size(); 0 means "not computed yet".
    cdef long size_in_bytes

    cdef void _push_back(self, int i, string s):
        """Append the pair ``(i, s)`` and invalidate the cached byte size."""
        self.container.push_back(pair[int, string](i, s))
        self.size_in_bytes = 0

    cdef vector[pair[int,string]] _get(self):
        """Return a copy of the whole underlying vector."""
        return self.container

    cdef pair[int, string] _get_index(self, int i) nogil:
        """Return a copy of the pair at position ``i`` (no bounds checking)."""
        return self.container[i]

    cdef int _get_size(self) nogil:
        """Return the number of stored pairs."""
        return self.container.size()

    cdef long _compute_size(self):
        """Return the payload size: 4 bytes per id plus the length of each string.

        The value is cached in ``size_in_bytes`` and recomputed only after the
        container changed (or while it is still empty).
        """
        cdef int i
        cdef int n = self._get_size()
        cdef long total = 0
        # Bytes accounted for each stored integer id.
        cdef long int_size = 4
        if self.size_in_bytes == 0:
            for i in range(n):
                # size() reads the length directly instead of building a
                # temporary Python bytes object just to call len() on it.
                total += <long> self.container[i].second.size()
                total += int_size
            self.size_in_bytes = total
        return self.size_in_bytes

    def push_back(self, int i, string s):
        """Append ``(i, s)``; ``s`` must be a ``bytes`` object."""
        self._push_back(i, s)

    def get(self):
        """Return all pairs as a Python list of ``(int, bytes)`` tuples."""
        return self.container

    def get_index(self, int i):
        """Return the pair at position ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than the size.
        """
        # Indexing a C++ vector out of range is undefined behaviour, so the
        # Python-facing accessor validates the index first.
        if i < 0 or i >= self._get_size():
            raise IndexError('StringContainer index out of range')
        return self.container[i]

    def get_size(self):
        """Return the number of stored pairs."""
        return self._get_size()

    def __sizeof__(self):
        """Return the payload size in bytes as computed by ``_compute_size``."""
        return self._compute_size()

# C declaration of strtok_r, the tokenizer called by TokenContainer._stokenize.
# The whole extern block is marked nogil, so the function may be called from
# code that runs without the GIL.
cdef extern from "string.h" nogil:                                                    
    #    char *strtok (char *inp_str, const char *delimiters)  
        char *strtok_r (char *inp_str, const char *delimiters, char **) 
# cdef extern from "<algorithm>" namespace "std" nogil:
#         void sort(vector[int].iterator, vector[int].iterator, bool func (pair[int, int], pair[int, int]))
        

cdef class TokenContainer:
    """Per-record token lists: a C++ vector of ``(record id, tokens)`` pairs.

    ``tokenize`` fills the container from a ``StringContainer``: every string
    is split on spaces, tabs and newlines, duplicate tokens are dropped (the
    surviving tokens come out in sorted order) and stop words are removed.
    """

    cdef vector[pair[int, vector[string]]] container
    # Cached result of _compute_size(); 0 means "not computed yet".
    cdef long size_in_bytes

    cdef int _get_size(self) nogil:
        """Return the number of stored records."""
        return self.container.size()

    cdef vector[pair[int,vector[string]]] _get(self):
        """Return a copy of the whole underlying vector."""
        return self.container

    cdef pair[int, vector[string]] _get_index(self, int i) nogil:
        """Return a copy of the record at position ``i`` (no bounds checking)."""
        return self.container[i]

    cdef long _compute_size(self):
        """Return the payload size: 4 bytes per id plus the length of every token.

        The value is cached in ``size_in_bytes``; ``_tokenize`` resets the cache.
        """
        cdef int n = self._get_size()
        cdef int i, j
        cdef int m
        cdef long s = 0
        # Bytes accounted for each stored integer id.
        cdef long int_size = 4
        if self.size_in_bytes == 0:
            for i in range(n):
                s += int_size
                m = self.container[i].second.size()
                for j in range(m):
                    s += <long> self.container[i].second[j].size()
            self.size_in_bytes = s
        return self.size_in_bytes

    cdef vector[string] _stokenize(self, const string& inp_string) nogil:
        """Split ``inp_string`` on space/tab/newline and return its unique tokens.

        The tokens pass through a ``std::set``, so the result is de-duplicated
        and sorted.
        """
        # strtok_r overwrites delimiters with NUL bytes, so it must work on a
        # private mutable copy; writing through the caller's c_str() pointer
        # would modify a const string (and, with copy-on-write strings, any
        # other string sharing that buffer).
        cdef string buf = inp_string
        cdef char* save_ptr = NULL
        cdef char* pch
        cdef oset[string] tokens
        cdef vector[string] out_tokens
        cdef string s
        if buf.empty():
            return out_tokens
        pch = strtok_r(&buf[0], " \t\n", &save_ptr)
        while pch != NULL:
            tokens.insert(string(pch))
            pch = strtok_r(NULL, " \t\n", &save_ptr)
        for s in tokens:
            out_tokens.push_back(s)
        return out_tokens

    cdef vector[string] _remove_stopwords(self, vector[string] &inp_tokens, const omap[string, int] &stop_words) nogil:
        """Return the tokens of ``inp_tokens`` that are not keys of ``stop_words``."""
        cdef vector[string] out_tokens
        cdef string token
        for token in inp_tokens:
            if (stop_words.find(token) == stop_words.end()):
                out_tokens.push_back(token)
        return out_tokens

    cdef void _tokenize(self, StringContainer container, omap[string, int] stopwords) nogil:
        """Tokenize every string of ``container`` into this object.

        Row ``i`` of the result holds the id of row ``i`` of ``container``
        together with its stop-word-filtered tokens.
        """
        cdef int n = container._get_size()
        cdef int i
        cdef string s
        cdef int uid
        cdef pair[int, string] p
        cdef vector[string] out_tokens
        # Size the vector up front so the parallel loop only assigns to
        # existing slots. resize() (instead of appending n placeholders) keeps
        # repeated calls from leaving stale rows behind.
        self.container.resize(n)
        self.size_in_bytes = 0
        for i in prange(n, nogil=True):
            p = container._get_index(i)
            uid = p.first
            s = p.second
            out_tokens = self._stokenize(s)
            out_tokens = self._remove_stopwords(out_tokens, stopwords)
            self.container[i] = pair[int, vector[string]](uid, out_tokens)

    def get_index(self, int i):
        """Return record ``i`` as an ``(id, [tokens])`` tuple.

        Raises:
            IndexError: if ``i`` is negative or not smaller than the size.
        """
        # Out-of-range indexing of a C++ vector is undefined behaviour.
        if i < 0 or i >= self._get_size():
            raise IndexError('TokenContainer index out of range')
        return self._get_index(i)

    def get_size(self):
        """Return the number of stored records."""
        return self._get_size()

    def tokenize(self, StringContainer concat_strings not None, stopwords):
        """Tokenize ``concat_strings`` into this container, dropping ``stopwords``.

        Args:
            concat_strings: source ``StringContainer``; ``None`` is rejected
                because the C-level code would dereference it without the GIL.
            stopwords: iterable of tokens to drop (``bytes`` or text; text is
                encoded as UTF-8). ``None`` means no stop words.
        """
        cdef omap[string, int] stopword_map
        cdef string key
        if stopwords is not None:
            for word in stopwords:
                # C++ strings hold bytes, so text stop words are encoded first.
                key = word if isinstance(word, bytes) else word.encode('utf-8')
                stopword_map[key] = 0
        print('Calling Tokenize')
        self._tokenize(concat_strings, stopword_map)

    def __sizeof__(self):
        """Return the payload size in bytes as computed by ``_compute_size``."""
        return self._compute_size()

    
    

# inverted index
cdef class InvertedIndex:
    """Inverted index mapping each token to the ids of the records containing it."""

    cdef omap[string, vector[int]] index
    cdef long size_in_bytes

    cdef vector[int] _get_values(self, string token) nogil:
        """Return a copy of the id list stored for ``token`` (empty if absent)."""
        cdef vector[int] empty
        # A single find() both tests for presence and yields the posting list.
        cdef omap[string, vector[int]].iterator it = self.index.find(token)
        if it == self.index.end():
            return empty
        return dereference(it).second

    cdef int _size(self) nogil:
        """Return the number of distinct tokens in the index."""
        return self.index.size()

    cdef void _build_inv_index(self, TokenContainer objtc) nogil:
        """Append the id of every record of ``objtc`` to the list of each of its tokens."""
        cdef int n = objtc._get_size()
        cdef int m
        cdef int i, j
        cdef int idx
        cdef vector[string] tokens
        cdef pair[int, vector[string]] p
        for i in range(n):
            p = objtc._get_index(i)
            idx = p.first
            tokens = p.second
            m = tokens.size()
            for j in range(m):
                self.index[tokens[j]].push_back(idx)

    def build_inv_index(self, TokenContainer objtc not None):
        """Add every record of ``objtc`` to the index.

        ``None`` is rejected up front: the C-level builder runs without the
        GIL and would dereference it.
        """
        self._build_inv_index(objtc)

    def size(self):
        """Return the number of distinct tokens in the index."""
        return self._size()

    def get_values(self, token):
        """Return the list of record ids stored for ``token`` (``[]`` if absent).

        ``token`` may be ``bytes`` or text; text is encoded as UTF-8 because
        the index keys are C++ byte strings.
        """
        if not isinstance(token, bytes):
            token = token.encode('utf-8')
        return self._get_values(token)

    def get_inv_index(self):
        """Return the whole index converted to a Python dict."""
        return self.index
        

cdef bool comp(const pair[int, int] l, const pair[int, int] r):
    # Ordering predicate for sort(): ``l`` goes before ``r`` when its second
    # component is strictly larger, which yields a descending order on ``second``.
    return r.second < l.second

# probe
cdef class Prober:
    """Probes an ``InvertedIndex`` with token lists and collects candidate id pairs.

    Each stored pair is ``(id found in the index, id of the probing record)``.
    """

    cdef vector[pair[int, int]] pair_indices

    cdef int _size(self):
        """Return the number of collected pairs."""
        return self.pair_indices.size()

    cdef pair[int, int] _get_index(self, int i):
        """Return the pair at position ``i`` (no bounds checking)."""
        return self.pair_indices[i]

    cdef vector[int] _unique_sorted_ids(self, bint from_first):
        """Return the distinct ids of one pair component in ascending order.

        ``from_first`` selects ``pair.first`` (index side) when true and
        ``pair.second`` (probe side) otherwise.
        """
        cdef oset[int] seen
        cdef vector[int] ids
        cdef int n = self._size()
        cdef int i
        cdef oset[int].iterator it
        for i in range(n):
            if from_first:
                seen.insert(self.pair_indices[i].first)
            else:
                seen.insert(self.pair_indices[i].second)
        # A std::set iterates in ascending order, so the copy is already
        # sorted and needs no extra sort() pass.
        it = seen.begin()
        while it != seen.end():
            ids.push_back(dereference(it))
            preincrement(it)
        return ids

    cdef vector[int] _get_ltable_indices(self):
        """Return the distinct first components of all pairs, ascending."""
        return self._unique_sorted_ids(True)

    cdef vector[int] _get_rtable_indices(self):
        """Return the distinct second components of all pairs, ascending."""
        return self._unique_sorted_ids(False)

    cdef void _probe(self, TokenContainer inp_token_list, InvertedIndex index, int y):
        """Collect, for every record of ``inp_token_list``, its best candidates.

        For each record the number of shared tokens with every indexed id is
        counted; candidates are ordered by that count (descending, via
        ``comp``) and up to ``y`` of them are appended to ``pair_indices`` as
        ``(candidate id, record id)``. A negative ``y`` never reaches the
        limit, so all candidates are kept.
        """
        cdef int m = inp_token_list._get_size()
        cdef int i, j, k, q
        cdef int n, o, n_sorted
        cdef int uid
        cdef pair[int, vector[string]] p_id_tokens
        cdef vector[string] tokens
        cdef vector[int] candidates
        cdef vector[pair[int, int]] to_sort
        cdef omap[int, int] candidate_overlap
        cdef omap[int,int].iterator it, end

        for i in range(m):
            p_id_tokens = inp_token_list._get_index(i)
            uid = p_id_tokens.first
            tokens = p_id_tokens.second
            n = tokens.size()
            # Count how many of this record's tokens each indexed id shares.
            for j in range(n):
                candidates = index._get_values(tokens[j])
                o = candidates.size()
                for k in range(o):
                    candidate_overlap[candidates[k]] += 1
            # Move the (id, overlap) counts into a vector so they can be sorted.
            it = candidate_overlap.begin()
            end = candidate_overlap.end()
            while it != end:
                to_sort.push_back(dereference(it))
                preincrement(it)
            sort(to_sort.begin(), to_sort.end(), comp)
            n_sorted = to_sort.size()
            q = 0
            for k in range(n_sorted):
                if q == y:
                    break
                self.pair_indices.push_back(pair[int, int](to_sort[k].first, uid))
                q += 1
            # Reset the per-record scratch state.
            candidate_overlap.clear()
            to_sort.clear()

    def probe(self, TokenContainer inp_token_list not None, InvertedIndex index not None, int y):
        """Probe ``index`` with every record of ``inp_token_list``, keeping up to ``y`` candidates each.

        ``None`` arguments are rejected because the C-level code would
        dereference them.
        """
        self._probe(inp_token_list, index, y)

    def get_ltable_indices(self):
        """Return the distinct first components of all pairs as a sorted list."""
        return self._get_ltable_indices()

    def get_rtable_indices(self):
        """Return the distinct second components of all pairs as a sorted list."""
        return self._get_rtable_indices()

    def get_all(self):
        """Return all collected pairs as a list of ``(int, int)`` tuples."""
        return self.pair_indices

    def get_index(self, i):
        """Return the pair at position ``i``.

        Raises:
            IndexError: if ``i`` is negative or not smaller than the number of pairs.
        """
        cdef int pos = i
        # Out-of-range indexing of a C++ vector is undefined behaviour.
        if pos < 0 or pos >= self._size():
            raise IndexError('Prober index out of range')
        return self._get_index(pos)

In [68]:
import string
def preprocess_table(dataframe):
    """Concatenate, clean and lower-case the string columns of every row.

    For each row the values of the ``object``-dtype columns are joined with a
    single space, ASCII punctuation is removed and the result is lower-cased.

    Args:
        dataframe: table to preprocess; its index values are used as record
            ids and are passed to ``StringContainer.push_back`` as integers.

    Returns:
        A ``StringContainer`` holding one ``(index value, cleaned bytes)``
        pair per row, in row order.
    """
    punctuation = frozenset(string.punctuation)
    str_container_obj = StringContainer()
    projected_df = dataframe[get_str_cols(dataframe)]
    for row in projected_df.itertuples(name=None):
        # itertuples(name=None) yields plain tuples: (index value, col1, col2, ...).
        idx = row[0]
        joined_row = ' '.join(row[1:])
        # Filter character by character: the two-argument str.translate(None, ...)
        # form only exists for Python 2 byte strings and fails on text strings.
        cleaned = ''.join(ch for ch in joined_row if ch not in punctuation).lower()
        # The container stores C++ byte strings, so text is encoded as UTF-8.
        if not isinstance(cleaned, bytes):
            cleaned = cleaned.encode('utf-8')
        str_container_obj.push_back(idx, cleaned)

    return str_container_obj
        
        

In [69]:
def tokenize_strings(concat_strings, stopwords):
    """Tokenize the strings of ``concat_strings`` into a new ``TokenContainer``.

    Args:
        concat_strings: the ``StringContainer`` to tokenize.
        stopwords: tokens to drop, forwarded to ``TokenContainer.tokenize``.

    Returns:
        The freshly filled ``TokenContainer``.
    """
    tok_container_obj = TokenContainer()
    tok_container_obj.tokenize(concat_strings, stopwords)
    return tok_container_obj
        

In [70]:
def build_inv_index(tokens):
    """Create an ``InvertedIndex``, fill it from ``tokens`` and return it."""
    index = InvertedIndex()
    index.build_inv_index(tokens)
    return index

In [71]:
def probe(tokens, invindex, y):
    """Probe ``invindex`` with ``tokens`` using limit ``y`` and return the ``Prober`` holding the result."""
    prober = Prober()
    prober.probe(tokens, invindex, y)
    return prober

In [72]:
# Concatenate, clean and lower-case the string columns of both tables.
lconcat_strings = preprocess_table(A)
rconcat_strings = preprocess_table(B)
# Tokens that are dropped during tokenization.
stopwords=['san', 'st', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4']
ltokens = tokenize_strings(lconcat_strings, stopwords)
rtokens = tokenize_strings(rconcat_strings, stopwords)
# Maximum number of candidates kept for each probing (table B) record.
y = 1
# Index the tokens of table A, then probe that index with the tokens of table B.
inv_index = build_inv_index(ltokens)
probe_result = probe(rtokens, inv_index, y)


Calling Tokenize
Calling Tokenize


In [73]:
probe_result.get_all()

[(0, 0), (2, 1), (1, 2), (0, 3), (4, 4), (1, 5)]

In [74]:
ltable_indices = probe_result.get_ltable_indices()
rtable_indices = probe_result.get_rtable_indices()

In [75]:
rtable_indices, ltable_indices

([0, 1, 2, 3, 4, 5], [0, 1, 2, 4])

In [76]:
import sys

In [77]:
sys.getsizeof(rtokens)

4


221

In [146]:
import cython

In [147]:
sys.getsizeof(cython.int)

64

Unnamed: 0,ID,name,birth_year,hourly_wage,address,zipcode
0,a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107
1,a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122
2,a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107
3,a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122
4,a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122


In [11]:
# NOTE(review): `objs` is never assigned in the visible cells, so this cell fails
# on a fresh kernel. The recorded output looks like a StringContainer built from
# table A (e.g. preprocess_table(A)) — confirm and define it before this cell.
objs.get_index(0)

(0, 'a1 kevin smith 607 from st san francisco')

In [12]:
objtc = tokenize_strings(objs, ['san'])

Calling Tokenize


In [15]:
# `inv_index` is bound to an InvertedIndex instance by the pipeline cell above
# and is not callable; the function that builds an index is build_inv_index.
idx = build_inv_index(objtc)

In [18]:
idx.get_values('francisco')

[0, 1, 2, 3, 4]