In [1]:
import sys
sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique/')

In [37]:
import magellan as mg
import numpy as np

In [33]:
from magellan import MTable

In [38]:
# sample one table using random sampling
def sample_table(table, size, replace=False):
    """
    Sample MTable
    Parameters
    ----------
    table : MTable, input table to be sampled
    size : int, number of samples
    replace : boolean, whether sampling should be done with replacement.
            By default, it is set to False.
    Returns
    -------
    sampled_table: MTable, sampled table
    """
    if len(table) == 0:
        raise AttributeError('size of table is 0')
    if len(table) < size:
        raise AttributeError('sample size is larger than input table size')

    s_indices = np.random.choice(len(table), size, replace=replace)
    # sort the indices - just to have an order
    s_indices = sorted(s_indices)
    sampled_table =  table.iloc[list(s_indices)]
    #print sampled_table.properties
    sampled_table = MTable(sampled_table, key=table.get_key())
    sampled_table.properties = table.properties
    return sampled_table

In [39]:
# Blocking
## plan : 
### A, B -----------overlap blocking 
#(title)-----------------------------> candset  |
### A, B ------------rule-based-blocking(title & director)
# ----------------> candset |----union--->candset


A = mg.read_csv('tableA.csv', key='ID')
B = mg.read_csv('tableB.csv', key='ID')
A1 = sample_table(A, 500) 
B1 = sample_table(B, 500)
ob = mg.OverlapBlocker()
ab = mg.AttrEquivalenceBlocker()

In [40]:
C = ab.block_tables(A1, B1, 'Title', 'Title', 'Title', 'Title')
C.head()

Unnamed: 0,_id,ltable.ID,rtable.ID,ltable.Title,rtable.Title
0,0,40,39,Barricade,Barricade
1,1,139,130,Extracted,Extracted
2,2,321,297,Cop Out,Cop Out
3,3,347,322,Flipped,Flipped
4,4,384,353,Kaboom,Kaboom


In [41]:
D = ob.block_tables(A1, B1, 'Title', 'Title', 
                    l_output_attrs=['Title'], r_output_attrs=['Title'])
D.head()

0%                          100%
[##############################] | ETA[sec]: 0.000 
Total time elapsed: 1.800 sec


Unnamed: 0,_id,ltable.ID,rtable.ID,ltable.Title,rtable.Title
0,0,8,2606,About Cherry,Much Ado About Nothing
1,1,8,2721,About Cherry,10 Things I Hate About You
2,2,9,180,Abraham Lincoln vs. Zombies,Lincoln
3,3,9,525,Abraham Lincoln vs. Zombies,Hoodwinked Too! Hood vs Evil
4,4,9,550,Abraham Lincoln vs. Zombies,The Lincoln Lawyer


In [42]:
feature_table = mg.get_features_for_blocking(A1, B1)
#feature_table



In [43]:
rb = mg.RuleBasedBlocker()
rb.add_rule(['Title_Title_lev(ltuple,rtuple) < 0.8',
             'Director_Director_nmw(ltuple,rtuple) < 0.8'], feature_table)

True

In [27]:
mg.init_jvm() #Do we even need this? Doesn't work on my machine.
# Pradap --- if doesn't work let me know what happens and error msg if any.



True

In [44]:
#WORKING PERFECTLY! EDIT DISTANCE!
E = rb.block_tables(A1, B1, 
                    l_output_attrs=['Title'], r_output_attrs=['Title'])
E.head()

0%                          100%
[##############################] | ETA[sec]: 0.000 
Total time elapsed: 37.313 sec


Unnamed: 0,_id,ltable.ID,rtable.ID,ltable.Title,rtable.Title
0,0,8,159,About Cherry,Comedy Dynamics Presents: Bill Hicks
1,1,8,1031,About Cherry,The Bounce Back
2,2,8,1421,About Cherry,Criminal Activities
3,3,8,1871,About Cherry,R5: All Day All Night
4,4,9,159,Abraham Lincoln vs. Zombies,Comedy Dynamics Presents: Bill Hicks


In [8]:
def year_blocker_function(x, y):
    x_date = str(x['Year'])
    y_date = str(y['Year'])
    if(x_date=='NULL' or y_date=='NULL' or x_date == 'nan' 
       or y_date == 'nan'):
        return False #Since one of them is NULL, 
    we cannot block using this feature
    x_year = int(x_date.split('-')[0])
    y_year = int(y_date.split('-')[0])
    
    if(x_year > y_year + 1 or x_year < y_year - 1):
        return True #Block the tuples
    
    return False

bb = mg.BlackBoxBlocker()
bb.set_black_box_function(year_blocker_function)

True

In [None]:
# F = bb.block_tables(A, B, 
#                     l_output_attrs=['Title'], 
#                     r_output_attrs=['Title'])

0%                          100%
[                              ]

In [29]:
#F #NOW F will have all movie pairs that have 
# been released within one year of each other
#We need to apply this along with other blockers 
#to make it practical

In [45]:
G = mg.combine_block_outputs_via_union([D, E])
G.head()

Unnamed: 0,_id,ltable.ID,rtable.ID,ltable.Title,rtable.Title
0,0,8,159,About Cherry,Comedy Dynamics Presents: Bill Hicks
1,1,8,1031,About Cherry,The Bounce Back
2,2,8,1421,About Cherry,Criminal Activities
3,3,8,1871,About Cherry,R5: All Day All Night
4,4,8,2606,About Cherry,Much Ado About Nothing


In [46]:
 corres_var = [('ID','ID'),('Title','Title'),('Director','Director')]
 mg.debug_blocker(A, B, G, attr_corres=corres_var, output_size=200)

0%                          100%
[##############################] | ETA[sec]: 0.000 
Total time elapsed: 21.566 sec


Unnamed: 0,_id,similarity,ltable.ID,rtable.ID,ltable.Title,rtable.Title,ltable.Director,rtable.Director
0,0,1,2760,2426,Before Midnight,Before Midnight,Richard Linklater,Richard Linklater
1,1,1,2762,2429,Beside Still Waters,Beside Still Waters,Chris Lowell,Chris Lowell
2,2,1,2767,2434,Big Sur,Big Sur,Michael Polish,Michael Polish
3,3,1,2770,2437,Blackfish,Blackfish,Gabriela Cowperthwaite,Gabriela Cowperthwaite
4,4,1,2763,2430,Best Kept Secret,Best Kept Secret,Samantha Buck,Samantha Buck
5,5,1,2768,2435,The Big Wedding,The Big Wedding,Justin Zackham,Justin Zackham
6,6,1,2769,2436,Black Nativity,Black Nativity,Kasi Lemmons,Kasi Lemmons
7,7,1,2777,2444,The Book Thief,The Book Thief,Brian Percival,Brian Percival
8,8,1,2791,2457,CBGB,CBGB,Randall Miller,Randall Miller
9,9,1,2764,2431,The Best Man Holiday,The Best Man Holiday,Malcolm D. Lee,Malcolm D. Lee
