In [14]:
import pandas as pd
import string
import math
import numpy as np

In [3]:
import dmagellan
from dmagellan.utils.py_utils.utils import build_inv_index, tokenize_strings
from dmagellan.tokenizer.whitespacetokenizer import WhiteSpaceTokenizer
from dmagellan.utils.cy_utils.stringcontainer import StringContainer
from dmagellan.blocker.overlap.overlapblocker import OverlapBlocker

In [2]:
# functions
def remove_stopwords(tokens, stopwords):
    out_tokens = []
    for token in tokens:
        if not stopwords.has_key(token):
            out_tokens.append(token)
    return out_tokens
def process_column(column, stop_words):
    column = column.str.translate(None, string.punctuation)
    column = column.str.lower()
    if stop_words:
        dict_stopwords = dict(zip(self.stop_words, [0] * len(self.stop_words)))
        partial_rm_stopwords_fn = partial(remove_stopwords,
                                          stopwords=dict_stopwords)
        column = column.str.split().map(partial_rm_stopwords_fn).str.join(' ')
    return column


In [6]:
ltable = pd.read_csv('../../datasets/sample_citeseer_100k.csv')
rtable = pd.read_csv('../../datasets/sample_dblp_100k.csv')

lid, rid = 'id', 'id'
l_block_attr, r_block_attr = 'title', 'title'
tok = WhiteSpaceTokenizer()

nbins = 10
sample_proportion = 0.1
seed = 0
stopwords=[]


ob = OverlapBlocker()
p = ob.process_and_tokenize_ltable(ltable, 'id', 'title', tok, [])
inv_index = build_inv_index([p])


sample ltable

In [11]:
    ltbl = ltable[[lid, l_block_attr]]
    ltbl = ltbl[~ltbl[l_block_attr].isnull()]
    #process columns
    ltbl[l_block_attr] = process_column(ltbl[l_block_attr], stopwords)
    n = int(math.floor(sample_proportion*len(ltable)))

In [13]:
    # get the string lengths
    ltbl['str_len'] = ltbl.title.str.len()
    groups = ltbl.groupby('str_len')

    len_ids = {}
    for gid, g in groups:
        len_ids[gid] = list(g[lid].values)
    strlens = list(ltbl['str_len'].values)
    strlens += [max(strlens) + 1]


In [19]:
    # get boundaries for equal width histogram
    freq, edges = np.histogram(strlens, bins=nbins)

    bins = [[] for _ in range(nbins)]
    keys = sorted(len_ids.keys())
    # find the bins where the keys should land.
    positions = np.digitize(keys, edges)


261

In [22]:
# populate the ids of ltable in their corresponding bins
for i in range(len(keys)):
    k, p = keys[i], positions[i]
    bins[p - 1].extend(len_ids[k])
len_bins = [len(bins[i]) for i in range(len(bins))]


In [38]:
    # Compute the weight of each bin, based on the number of tuples in that bin and the total number of tuples
    weights = [len_bins[i] / float(sum(len_bins)) for i in range(len(bins))]
    # Based on the weights, find the number of tuples to be sampled from each bin
    numtups = [int(math.ceil(weights[i] * n)) for i in range(len(weights))]

In [39]:
    # Based on the num. of tuples to be sampled from each bin, take a random sample
    sampled = []
    for i in range(len(bins)):
        nt = numtups[i]
        np.random.seed(0)
        if len(bins[i]):
            np.random.seed(seed)
            tmp = np.random.choice(bins[i], nt)
            if len(tmp):
                sampled.extend(tmp)


In [40]:
ltable.set_index(lid, inplace=True, drop=False)
ltable['_pos'] = list(range(len(ltable)))
s_ltable = ltable.loc[sampled]
s_ltable = s_ltable.sort_values(['_pos'])
s_ltable.reset_index(drop=True, inplace=True)
s_ltable.drop(['_pos'], axis=1, inplace=True)
s_ltable.head(), len(s_ltable)

(        id                                              title  \
 0  1008177                       Rank-Tolerance Graph Classes   
 1   435899  A Versatile Incompressible Navier Stokes Solve...   
 2  1732361  MBT04 Preliminary Version Lightweight Specific...   
 3   264023  Exploiting Model Uncertainty Estimates for Saf...   
 4   173582  Robust Network Connectivity: when its the big ...   
 
                                              authors journal  month    year  \
 0        Martin Charles, Golumbic Robert, E. Jamison     NaN    NaN  2003.0   
 1  Marc Garbey, Francois Pacull, Keywords Navier ...     NaN    NaN  2006.0   
 2                         Seung Mo Cho, Jae Wook Lee     NaN    NaN     NaN   
 3                                    Je G. Schneider     NaN    NaN     NaN   
 4                      Enoch Peserico, Larry Rudolph     NaN    NaN     NaN   
 
   publication_type  
 0              NaN  
 1              NaN  
 2              NaN  
 3              NaN  
 4        

sample rtable

In [67]:
nbins = 10

In [68]:
    ob = OverlapBlocker()
    tok = WhiteSpaceTokenizer()
    # rtbl = rtable.reset_index(drop=True)
    rtbl = rtable[[rid, r_block_attr]]
    # rtbl['_pos'] = list(range(len(tbl)))
    p = ob.process_and_tokenize_ltable(rtbl, rid, r_block_attr, tok, stopwords)


In [69]:
    tok_cnt = {}
    tok_map = {}
    for i in range(p.size()):
        tid, tokens = p.get(i)
        cnt = 0
        for tok in tokens:
            if tok not in tok_map:
                tok_map[tok] = len(inv_index.values(tok))
            cnt += tok_map[tok]
        tok_cnt[tid] = cnt


In [70]:
    df =  pd.DataFrame(tok_cnt.items(), columns=['id', 'count'])
    groups = df.groupby('count')
    cnt_ids = {}
    for gid, g in groups:
        cnt_ids[gid] = list(g[lid].values)
    cnts = list(df['count'].values)
    cnts += [max(cnts) + 1]


In [71]:
    freq, edges = np.histogram(cnts, bins=nbins)
    n = int(math.floor(sample_proportion * len(rtable)))
    bins = [[] for _ in range(nbins)]
    keys = sorted(cnt_ids.keys())
    positions = np.digitize(keys, edges)


In [72]:
    for i in range(len(keys)):
        k, p = keys[i], positions[i]
        bins[p - 1].extend(cnt_ids[k])
    len_bins = [len(bins[i]) for i in range(len(bins))]


In [73]:
    weights = [len_bins[i] / float(sum(len_bins)) for i in range(len(bins))]
    numtups = [int(math.ceil(weights[i] * n)) for i in range(len(weights))]


In [74]:
    sampled = []
    for i in range(len(bins)):
        nt = numtups[i]
        np.random.seed(seed)
        if len(bins[i]):
            tmp = np.random.choice(bins[i], nt)
            if len(tmp):
                sampled.extend(tmp)


In [81]:
    rtable['_pos'] = list(range(len(rtable)))
    rtable.set_index(rid, inplace=True, drop=False)
    s_rtable = rtable.loc[sampled]
    s_rtable = s_rtable.sort_values('_pos')
    s_rtable.drop(['_pos'], axis=1, inplace=True)
    rtable.drop(['_pos'], axis=1, inplace=True)
