### Import Required Packages

In [1]:
import pandas as pd
import numpy as np
from urllib2 import urlopen
from nltk.metrics import edit_distance, jaccard_distance
from sklearn.externals import joblib

## Creating Table E - Steps
1. Load blocked tuple pairs (1,217 pairs)
2. Load tables containing detailed information
    1. Spoj data (ltable)
    2. Codechef data (rtable)
3. Create table_D - blocked tuple pairs with detailed information
    1. exact_title_match
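    2. diff_in_words_len (absolute difference in word counts of the combined description, input, and output text)
    3. jaccard_sim (Jaccard similarity of the word sets of the combined description, input, and output text)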
    4. edit_dist (for the title)
4. Load and apply the machine learning model trained in project stage 4
    1. Logistic Regression Classifier
5. Create table_E with matching pairs from table_D
    1. 284 matches

In [2]:
# Step 1: Load blocked tuple pairs
# NOTE(review): the query string of this URL carries what looks like a GitHub
# access token. Tokens should not be committed with a notebook -- consider
# reading it from an environment variable and rotating this one.
blocked_set_url = "https://raw.githubusercontent.com/KaranTalreja/CS638/master/candidate_set.csv?token=AKuoLS_PLjUb7huP6wG6fd9O2YkUp_ICks5YWyaFwA%3D%3D"

# Close the HTTP response explicitly once the CSV has been parsed so the
# connection is not left open for the lifetime of the kernel.
blocked_set_response = urlopen(blocked_set_url)
try:
    # "_id" identifies each candidate pair and becomes the frame's index.
    blocked_set = pd.read_csv(blocked_set_response).set_index("_id")
finally:
    blocked_set_response.close()

In [3]:
# Step 2: Load Spoj & Codechef data (preprocessed in previous steps)

# Spoj problems (ltable). The three free-text fields are concatenated, with no
# separator between them, into a single "words" column; the source text columns
# and the columns not used later are then dropped.
table_spoj = "spoj.csv"
ltable = pd.read_csv(table_spoj)
ltable = ltable.drop("Unnamed: 0", axis=1)
ltable["words"] = (
    ltable["description"].fillna("")
    + ltable["input"].fillna("")
    + ltable["output"].fillna("")
)
ltable = ltable.drop(["description", "input", "output", "difficulty_normalized"], axis=1)

# Codechef problems (rtable), prepared the same way.
table_codechef = "codechef.csv"
rtable = pd.read_csv(table_codechef)
rtable["words"] = (
    rtable["description"].fillna("")
    + rtable["input"].fillna("")
    + rtable["output"].fillna("")
)
rtable = rtable.drop(["description", "input", "output", "Unnamed: 0"], axis=1)

In [4]:
# Step 3: Create table_D - blocked tuple pairs with detailed information
def exact_title_match(df):
    """Return 1 if the paired Spoj and Codechef titles are identical, else 0.

    Parameters
    ----------
    df : row-like mapping
        Must provide "ltable_ID" and "rtable_ID"; these are used as index
        labels into the module-level ``ltable`` and ``rtable`` frames.

    Returns
    -------
    int
        1 when the two titles compare equal, otherwise 0.
    """
    # Label-based .loc replaces .ix, which pandas deprecated and later removed.
    # int() guards against IDs arriving as floats, which happens when
    # DataFrame.apply(axis=1) upcasts a row that mixes int and float columns.
    l_title = ltable.loc[int(df["ltable_ID"]), "title"]
    r_title = rtable.loc[int(df["rtable_ID"]), "title"]
    return 1 if l_title == r_title else 0
    
def words_length(df):
    """Return the absolute difference in word counts between the paired problems.

    The "words" text of each problem is split on single spaces (so consecutive
    spaces yield empty tokens that are counted too), and the two token counts
    are compared.

    Parameters
    ----------
    df : row-like mapping
        Must provide "ltable_ID" and "rtable_ID"; these are used as index
        labels into the module-level ``ltable`` and ``rtable`` frames.

    Returns
    -------
    int
        ``abs(len(left_tokens) - len(right_tokens))``.
    """
    # Label-based .loc replaces .ix, which pandas deprecated and later removed.
    # int() guards against IDs that were upcast to float by apply(axis=1).
    l_words = ltable.loc[int(df["ltable_ID"]), "words"]
    r_words = rtable.loc[int(df["rtable_ID"]), "words"]
    return abs(len(l_words.split(" ")) - len(r_words.split(" ")))

def jaccard_similarity(x,y):
    """Return the Jaccard similarity |X & Y| / |X | Y| of two iterables.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Parameters
    ----------
    x, y : iterables of hashable items

    Returns
    -------
    float
        A value in [0.0, 1.0]. Two empty inputs are treated as identical and
        yield 1.0 instead of raising ZeroDivisionError.
    """
    left, right = set(x), set(y)
    union = left | right
    if not union:
        # Both inputs are empty: the ratio is undefined (0/0). Treat the two
        # empty sets as identical rather than crashing.
        return 1.0
    return len(left & right) / float(len(union))

def jaccard_A_B(df):
    """Return the Jaccard similarity between the word sets of the paired problems.

    The "words" text of each problem is split on single spaces and the two
    token lists are passed to ``jaccard_similarity``.

    Parameters
    ----------
    df : row-like mapping
        Must provide "ltable_ID" and "rtable_ID"; these are used as index
        labels into the module-level ``ltable`` and ``rtable`` frames.

    Returns
    -------
    float
        Whatever ``jaccard_similarity`` returns for the two token lists.
    """
    # Label-based .loc replaces .ix, which pandas deprecated and later removed.
    # int() guards against IDs that were upcast to float by apply(axis=1).
    l_tokens = ltable.loc[int(df["ltable_ID"]), "words"].split(" ")
    r_tokens = rtable.loc[int(df["rtable_ID"]), "words"].split(" ")
    return jaccard_similarity(l_tokens, r_tokens)

def title_edit_dist(df):
    """Return the edit distance between the titles of the paired problems.

    Parameters
    ----------
    df : row-like mapping
        Must provide "ltable_ID" and "rtable_ID"; these are used as index
        labels into the module-level ``ltable`` and ``rtable`` frames.

    Returns
    -------
    The value of ``edit_distance(l_title, r_title)`` (imported from
    ``nltk.metrics`` at the top of the notebook).
    """
    # Label-based .loc replaces .ix, which pandas deprecated and later removed.
    # int() guards against IDs that were upcast to float by apply(axis=1),
    # which is what happens once a float feature column has been added.
    l_title = ltable.loc[int(df["ltable_ID"]), "title"]
    r_title = rtable.loc[int(df["rtable_ID"]), "title"]
    return edit_distance(l_title, r_title)
    
# Build table_D on a copy of the blocked pairs. Plain assignment would only
# alias blocked_set, so every new feature column would also be written into
# blocked_set; later apply(axis=1) calls would then iterate over rows that mix
# int IDs with float features (upcasting the IDs to float), and re-running the
# cell would start from an already-modified frame.
table_D = blocked_set.copy()

# Each feature is computed row by row from the untouched blocked pairs.
table_D["exact_title_match"] = blocked_set.apply(exact_title_match, axis=1)
table_D["diff_in_words_len"] = blocked_set.apply(words_length, axis=1)
table_D["jaccard_sim"] = blocked_set.apply(jaccard_A_B, axis=1)
table_D["edit_dist"] = blocked_set.apply(title_edit_dist, axis=1)
table_D.head()

Unnamed: 0_level_0,ltable_ID,rtable_ID,exact_title_match,diff_in_words_len,jaccard_sim,edit_dist
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,5166,1,29,0.506329,0
1257,31,2205,0,65,0.802752,7
1470,64,5005,0,547,0.319549,7
1735,74,282,0,65,0.573964,6
1736,74,2885,0,44,0.716129,7


In [5]:
# Step 4: Load & Apply Logistic Regression Classifier trained in Stage 4
# Feature columns handed to the classifier, in this order.
feature_columns = ["exact_title_match", "diff_in_words_len", "jaccard_sim", "edit_dist"]

# Restore the persisted classifier and score every candidate pair.
log_model = joblib.load('Stage 5/log_model.pkl')
log_pred = log_model.predict(table_D[feature_columns])

# Store the prediction next to each pair.
table_D["lr_pred_match"] = log_pred

# Keep only the ID columns of the pairs predicted to match (label 1).
is_predicted_match = table_D["lr_pred_match"] == 1
table_D_matches = table_D.loc[is_predicted_match, ["ltable_ID", "rtable_ID"]]
table_D_matches.head()

Unnamed: 0_level_0,ltable_ID,rtable_ID
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0,5166
1257,31,2205
1735,74,282
1736,74,2885
1856,102,3241


In [6]:
# Create table_E with matching pairs from table_D
def rtable_E_find(df):
    """Collect the Codechef fields reported in table_E for one rtable ID.

    Parameters
    ----------
    df : scalar
        Despite the name, this is a single index label of the module-level
        ``rtable`` frame (the function is applied element-wise to a column
        of IDs).

    Returns
    -------
    pandas.Series
        Entries "codechef_title", "codechef_solve_rate_normalized" and
        "codechef_words", taken from the first row whose index equals ``df``.

    Raises
    ------
    IndexError
        If no row of ``rtable`` has that index label.
    """
    matched_rows = rtable.loc[rtable.index == df]
    fields = {
        "codechef_title": matched_rows["title"].values[0],
        "codechef_solve_rate_normalized": matched_rows["solve_rate_normalized"].values[0],
        "codechef_words": matched_rows["words"].values[0],
    }
    return pd.Series(fields)

def ltable_E_find(df):
    """Collect the Spoj fields reported in table_E for one ltable ID.

    Parameters
    ----------
    df : scalar
        Despite the name, this is a single index label of the module-level
        ``ltable`` frame (the function is applied element-wise to a column
        of IDs).

    Returns
    -------
    pandas.Series
        Entries "spoj_title", "spoj_solve_rate_normalized" and "spoj_words",
        taken from the first row whose index equals ``df``.

    Raises
    ------
    IndexError
        If no row of ``ltable`` has that index label.
    """
    matched_rows = ltable.loc[ltable.index == df]
    fields = {
        "spoj_title": matched_rows["title"].values[0],
        "spoj_solve_rate_normalized": matched_rows["solve_rate_normalized"].values[0],
        "spoj_words": matched_rows["words"].values[0]
    }
    return pd.Series(fields)

# Look up the Codechef and Spoj details for every predicted match; both
# results keep the "_id" index of table_D_matches.
codechef_ids = table_D_matches["rtable_ID"]
spoj_ids = table_D_matches["ltable_ID"]
table_Er = codechef_ids.apply(rtable_E_find)
table_El = spoj_ids.apply(ltable_E_find)

# Put the Spoj columns and the Codechef columns side by side, joined on the index.
table_E = pd.merge(table_El, table_Er, left_index=True, right_index=True)
table_E

Unnamed: 0_level_0,spoj_solve_rate_normalized,spoj_title,spoj_words,codechef_solve_rate_normalized,codechef_title,codechef_words
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,38.716293,TEST,Your program is to use the brute-force appro...,0.100593,TEST,"For help on this problem, please check out our..."
1257,-0.306566,CAPPIZZA,"Brunno Doiuna is very fond of caper pizzas, ...",0.024855,CDS008,"Brunno Doiuna is very fond of caper pizzas, wh..."
1735,6.717850,FASHION,A fashion show rates participants according ...,0.024889,CDSE03,A quiz show rates participants according to t...
1736,6.717850,FASHION,A fashion show rates participants according ...,0.024890,VITC04,Vibrance rates participants according to their...
1856,-0.036552,COINTOSS,You have an unbiased coin which you want t...,0.024879,SOPC04,You have an unbiased coin which you want to ke...
1903,-0.130034,HIGH,In some countries building highways takes a ...,0.024872,HIGH,In some countries building highways takes a lo...
1909,-0.280495,IOPC1200,It is after years that Dystopian Institute o...,0.024857,IOPC1200,It is after years that Dystopian Institute of ...
1922,-0.277442,IOPC1201,The Rubik's cube is perhaps the world's most...,0.024862,IOPC1201,The Rubik's cube is perhaps the world's most p...
2065,-0.255988,IOPC1202,You are given the coordinates of the vertice...,0.024866,IOPC1202,You are given the coordinates of the vertices ...
2066,-0.162250,IOPC1203,You must be familiar with the use of numeric...,0.024888,IOPC1203,You must be familiar with the use of numeric k...


In [7]:
# Persist the matched pairs (index included) as CSV in the working directory.
table_E_path = "table_E.csv"
table_E.to_csv(table_E_path)