In [3]:
import pandas as pd


In [2]:
import inspect
def get_default_args(func):
    """
    returns a dictionary of arg_name:default_values for the input function
    """
    args, varargs, keywords, defaults = inspect.getargspec(func)
    return dict(zip(args[-len(defaults):], defaults))
def get_func_args(func):
    return inspect.getargspec(func)[0]
def get_class_func_name(command):
    name = command.__name__
    cls_name = None
    if name == 'block_tables':
        cls_name = command.im_class.__name__
    return name, cls_name
sample_size_setting = {
    'block_tables':{
        'AttrEquivalenceBlocker':[(0.1, 0.1), (0.3, 0.3), (0.5, 0.5), (0.7, 0.7), (0.9,
                                                                                   0.9)],
        'RuleBasedBlocker':[(1, 0.1), (1, 0.3), (1, 0.5), (1, 0.7), (1, 0.9)],
        'BlacBoxBlocker':[(0.01, 0.01), (0.05, 0.05), (0.1, 0.1), (0.15, 0.15), (0.2,
                                                                                 0.2)],
        'OverlapBlocker':[(1, 0.1), (1, 0.3), (1, 0.5), (1, 0.7), (1, 0.9)],
    },
    'block_candset': [0.01, 0.05, 0.1, 0.15, 0.2],
    'downsample':[(1, 0.2), (1, 0.3), (1, 0.5), (1, 0.7), (1, 0.9)],
    'extract_feature_vecs': [0.01, 0.05, 0.1, 0.15, 0.2],
    'predict':[0.01, 0.05, 0.1, 0.15, 0.2]
}


In [32]:
def sample_table(table, proportion):
    num_tuples = int(math.ceil(len(table)*proportion))
    if num_tuples > len(table):
        num_tuples = len(table)
    sampled_table = table.sample(num_tuples)
    sampled_table.sort_index(inplace=True)
    return sampled_table

def sample_tables(A, B, proportions):
    prop_a, prop_b = proportions[0], proportions[1]
    num_tuples_a = int(math.ceil(len(A)*prop_a))
    num_tuples_b = int(math.ceil(len(B)*prop_b))
    if num_tuples_a > len(A):
        num_tuples_a = len(A)
    if num_tuples_b > len(B):
        num_tuples_b = len(B)
    sampled_table_a = A.sample(num_tuples_a)
    sampled_table_b = A.sample(num_tuples_b)    
    sampled_table_a.sort_index(inplace=True)
    sampled_table_b.sort_index(inplace=True)    
    return sampled_table_a, sampled_table_b

In [39]:
import time
import math


class Timer(object):
    def __enter__(self):
        self.start = time.clock()
        return self
    def __exit__(self, *args):
        self.end = time.clock()
        self.interval = self.end - self.start


def execute(p):
    args = ()
    if isinstance(p, (list, tuple)):
        f, kwargs = (p[0], p[1])
    f, t = None, None

    try:
        with Timer() as t:
            res = f(*args, **kwargs)
            print(len(res))
    finally:
        return t.interval


def time_command(command, kwargs):
    print('inside time command')
    p = (command, kwargs)
    return execute(p)

In [11]:
import dmagellan

In [12]:
from dmagellan.blocker.attrequivalence.attr_equiv_blocker import AttrEquivalenceBlocker
from dmagellan.sampler.downsample.downsample import downsample_dk

In [15]:
A = pd.read_csv('../datasets/sample_citeseer_100k.csv')
B = pd.read_csv('../datasets/sample_dblp_100k.csv')
ab = AttrEquivalenceBlocker()

In [41]:
command = ab.block_tables
input_args = {'ltable':A, 'rtable':B,  
        'l_block_attr':'year', 'r_block_attr':'year', 
        'l_key':'id', 'r_key':'id', 
        'nltable_chunks': [1, 2, 4], 'nrtable_chunks': [4, 2, 1],
        'compute':True
     }
do_cartesian = False
repeat = 1

In [22]:
# preprocess args
default_args = get_default_args(command)
function_args = get_func_args(command)
if 'self' in function_args:
    function_args.remove('self') 
required_args = set(function_args).difference(default_args.keys())

missing_args = set(required_args).difference(input_args.keys())
if len(missing_args):
    print('The following args are required: ' + str(missing_args))
args = default_args
for key, value in input_args.iteritems():
    args[key] = value

In [42]:
# handle multiple num chunks value
import itertools
assert('nltable_chunks' in args)
assert('nrtable_chunks' in args)
ltable_setting = args['nltable_chunks']
if not isinstance(ltable_setting, list):
    ltable_setting = [ltable_setting]
rtable_setting = args['nrtable_chunks']
if not isinstance(rtable_setting, list):
    rtable_setting = [rtable_setting]

if do_cartesian:
    config_setting = list(itertools.product(ltable_setting, rtable_setting))
else:
    assert(len(ltable_setting) == len(rtable_setting))
    config_setting = zip(ltable_setting, rtable_setting)
function_name, class_name = get_class_func_name(command)
if class_name is not None:
    sample_sizes = sample_size_setting[function_name][class_name]
else:
    sample_sizes = sample_size_setting[function_name]


In [46]:
best_one = -1
for sample_size in sample_sizes:
    print(sample_size)
    result = []
    for config in config_setting:
        cum_runtime = 0
        for count in range(repeat):
            sampled_table_a, sampled_table_b = sample_tables(A, B, config)
            if function_name == 'downsample':
                if args['size'] > len(sampled_table_b):
                    args['size'] = len(sampled_table_b)
            args['ltable'] = sampled_table_a
            args['rtable'] = sampled_table_b            
            runtime = time_command(command, args)
            cum_runtime += runtime
#             print(runtime)
#         print(type(result))
        print(runtime, config)
        result.append((math.ceil(cum_runtime/float(repeat)), config))
        
    result.sort()
    if best_one == -1:
        best_one = result
    elif best_one[1] == result[1]:
        break
    else:
        best_one = result
        continue
print(best_one)

(0.1, 0.1)
inside time command
(1.4000000000180535e-05, (1, 4))
inside time command
(2.19999999995224e-05, (2, 2))
inside time command
(1.799999999896329e-05, (4, 1))
(0.3, 0.3)
inside time command
(1.9999999999242846e-05, (1, 4))
inside time command
(9.999999999621423e-06, (2, 2))
inside time command
(1.9999999999242846e-05, (4, 1))
[(1.0, (1, 4)), (1.0, (2, 2)), (1.0, (4, 1))]
