In [2]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers
import blocker, custom_matcher

magellan version:0.1.0





In [3]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [3]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_city,a_prov,a_country,a_email_server,a_email_server_cleaned
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,academy management,briarcliff manor,new york,united states,aom.org,aom
4,northeastern university,boston,massachusetts,united states,gmail.com,gmail
5,skidmore college,saratoga springs,new york,united states,skidmore.edu,skidmore


### 1.B. Clean WHED data

In [4]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')
whed_cleaner.clean_states('a_prov', 'a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_country,a_city,a_prov,a_web
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,pampanga state agricultural university,philippines,magalang,pampanga,http://www.pac.edu.ph
4,les roches international school hotel management,switzerland,bluche crans montana,bluche-crans-montana,http://www.lesroches.edu
6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu


# STEP 2 - MAGELLAN - BLOCKING

In [9]:
AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)

# building inverted index based on B
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=4000, y_param=3)
print(len(sample_WHED))
print(len(sample_AOM))

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
0%                          100%
[##############################] | ETA: 00:00:19 | ETA: 00:00:20 | ETA: 00:00:19 | ETA: 00:00:18 | ETA: 00:00:18 | ETA: 00:00:17 | ETA: 00:00:16 | ETA: 00:00:16 | ETA: 00:00:15 | ETA: 00:00:14 | ETA: 00:00:13 | ETA: 00:00:13 | ETA: 00:00:12 | ETA: 00:00:11 | ETA: 00:00:10 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00

4943
4000



Total time elapsed: 00:00:21


In [7]:
sample_WHED.head(n=5)

Unnamed: 0,a_id,a_name,a_country,a_city,a_prov,a_web
8192,11551,meio university,japan,nago shi,okinawa,http://www.meio-u.ac.jp
8194,11554,mejiro university,japan,shinjuku ku,tokyo,http://www.mejiro.ac.jp
3,7,a d patel institute technology,india,vitthal udyognagar,gujarat,http://www.adit.ac.in
16386,21850,zhytomyr state ivan franko university,ukraine,zytomyr,zytomyr region,http://www.academy.zt.ua
5,9,aalborg university,denmark,aalborg,,http://www.aau.dk


In [9]:
sample_AOM.head(n=5)

Unnamed: 0,person_id,a_name,a_city,a_prov,a_country,a_email_server,a_email_server_cleaned
6210,46879,university maryland,college park,maryland,united states,gmail.com,gmail
7241,53726,tulane,new orleans,louisiana,united states,tulane.edu,tulane
6383,47937,university zurich,zurich,switzerland,switzerland,uzh.ch,uzh
1656,7807,university texas pan american,mcallen,texas,united states,utpa.edu,utpa
4947,37439,boston university,boston,massachusetts,united states,bu.edu,bu


In [11]:
C = blocker.blocking(sample_WHED, sample_AOM, csv_headers.WHED, csv_headers.AOM)

print(C.size)

0%                          100%
[##############################] | ETA: 00:08:46 | ETA: 00:08:20 | ETA: 00:08:09 | ETA: 00:07:47 | ETA: 00:07:30 | ETA: 00:07:16 | ETA: 00:06:55 | ETA: 00:06:36 | ETA: 00:06:22 | ETA: 00:06:03 | ETA: 00:05:45 | ETA: 00:05:32 | ETA: 00:05:12 | ETA: 00:04:57 | ETA: 00:04:39 | ETA: 00:04:19 | ETA: 00:04:01 | ETA: 00:03:41 | ETA: 00:03:23 | ETA: 00:03:04 | ETA: 00:02:44 | ETA: 00:02:25 | ETA: 00:02:07 | ETA: 00:01:48 | ETA: 00:01:31 | ETA: 00:01:13 | ETA: 00:00:55 | ETA: 00:00:36 | ETA: 00:00:18 | ETA: 00:00:00 | ETA: 00:00:00

129675



Total time elapsed: 00:09:14


In [12]:
C.head(n=1000)

Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_country,ltable_a_city,ltable_a_prov,ltable_a_web,rtable_a_name,rtable_a_country,rtable_a_city,rtable_a_prov,rtable_a_email_server
0,0,10,53416,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aaltoee.fi
1,1,10,54470,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto.fi
2,2,10,29724,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,aalto.fi
3,3,10,58343,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto.fi
4,4,10,17806,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,aalto.fi
5,5,10,59274,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto.fi
6,6,10,56107,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,tkk.fi
7,7,10,38224,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,aalto.fi
8,8,10,54387,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,helsinki,aalto.fi
9,9,10,36477,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,aalto.fi


In [13]:
em.to_csv_metadata(C, path_to_csv_dir + 'matching_pairs_table_overlap3_emailserver.csv')
em.save_table(C, path_to_csv_dir + 'matching_pairs_table_overlap3_emailserver.pkl')

sample_AOM.to_csv(path_to_csv_dir + 'sample_AOM.csv', encoding = 'UTF-8', index_label = csv_headers.AOM_INDEX)
sample_WHED.to_csv(path_to_csv_dir + 'sample_WHED.csv', encoding = 'UTF-8', index_label = csv_headers.WHED_INDEX)

File already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table_overlap3_emailserver.csv; Overwriting it
Metadata file already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table_overlap3_emailserver.metadata. Overwriting it
File already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table_overlap3_emailserver.pkl; Overwriting it
Metadata file already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table_overlap3_emailserver.pklmetadata. Overwriting it


# STEP 3 - MAGELLAN - MATCHING

In [1]:
# Load data
AOM = em.read_csv_metadata(path_to_csv_dir + '_aom.csv', key = csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(path_to_csv_dir + '_whed.csv', key = csv_headers.WHED_INDEX)
labeled_data = em.read_csv_metadata(path_to_csv_dir + 'golden_data_labeled_nomissing.csv', key='_id',
                                                                 ltable=WHED, rtable=AOM, 
                                                                 fk_ltable='ltable_' + csv_headers.WHED_INDEX, 
                                                                 fk_rtable='rtable_' + csv_headers.AOM_INDEX,encoding = "ISO-8859-1")

# Split train and test set
IJ = em.split_train_test(labeled_data, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

NameError: name 'em' is not defined

In [21]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
nb = em.SVMMatcher(name='NaiveBayes', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

In [38]:
# Generate a set of features
F = em.get_features_for_matching(WHED, AOM)

em.add_blackbox_feature(F, 'is_same_server', custom_matcher.is_same_server)
em.add_blackbox_feature(F, 'is_same_single_server', custom_matcher.is_same_single_server)

# F.feature_name

True

In [23]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)  

In [24]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(H))

True

In [25]:
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                strategy='mean')

In [47]:
# Select the best ML matcher using CV
# Precision Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='precision', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11009ab70>,5,0.882353,0.769231,0.941176,0.933333,0.85,0.875219
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11009a4a8>,5,1.0,0.909091,0.947368,0.882353,1.0,0.947762
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009aa20>,5,0.846154,1.0,0.933333,1.0,1.0,0.955897
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009ae48>,5,0.846154,1.0,0.933333,1.0,1.0,0.955897
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10f65d4e0>,5,1.0,0.727273,0.9,1.0,1.0,0.925455
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11009a128>,5,0.933333,0.769231,0.894737,0.933333,0.9,0.886127


In [43]:
# Recall Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='recall', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11009ab70>,5,0.9375,0.909091,0.888889,0.823529,0.809524,0.873707
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11009a4a8>,5,0.875,0.909091,1.0,0.882353,0.809524,0.895194
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009aa20>,5,0.6875,0.363636,0.777778,0.529412,0.190476,0.50976
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009ae48>,5,0.6875,0.363636,0.777778,0.529412,0.190476,0.50976
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10f65d4e0>,5,0.875,0.727273,1.0,0.764706,0.857143,0.844824
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11009a128>,5,0.875,0.909091,0.944444,0.823529,0.857143,0.881842


In [46]:
# F1 Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11009ab70>,5,0.909091,0.833333,0.914286,0.875,0.829268,0.872196
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11009a4a8>,5,0.933333,0.909091,0.972973,0.882353,0.894737,0.918497
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009aa20>,5,0.758621,0.533333,0.848485,0.692308,0.32,0.630549
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11009ae48>,5,0.758621,0.533333,0.848485,0.692308,0.32,0.630549
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10f65d4e0>,5,0.933333,0.727273,0.947368,0.866667,0.923077,0.879544
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11009a128>,5,0.903226,0.833333,0.918919,0.875,0.878049,0.881705


In [44]:
# Convert the J into a set of feature vectors using F
K = em.extract_feature_vecs(J, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)  
K = em.impute_table(K, 
                exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                strategy='mean')

In [52]:
# Choose Random Forest to train
rf.fit(table=H, exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'], target_attr='gold_label')
predictions = rf.predict(table=K, exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                append=True, target_attr='predicted', inplace=False)

eval_summary = em.eval_matches(predictions, 'gold_label', 'predicted')
em.print_eval_summary(eval_summary)

Precision : 98.63% (72/73)
Recall : 91.14% (72/79)
F1 : 94.74%
False positives : 1 (out of 73 positive predictions)
False negatives : 7 (out of 166 negative predictions)
