In [62]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers
import blocker, custom_matcher

magellan version:0.1.0


In [3]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [None]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

### 1.B. Clean WHED data

In [None]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')
whed_cleaner.clean_states('a_prov', 'a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

# STEP 2 - MAGELLAN - BLOCKING

In [None]:
AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)

# building inverted index based on B
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=4000, y_param=3)
print(len(sample_WHED))
print(len(sample_AOM))

In [None]:
sample_WHED.head(n=5)

In [None]:
sample_AOM.head(n=5)

In [None]:
C = blocker.blocking(sample_WHED, sample_AOM, csv_headers.WHED, csv_headers.AOM)

print(C.size)

In [None]:
C.head(n=1000)

In [None]:
em.to_csv_metadata(C, path_to_csv_dir + 'matching_pairs_table_overlap3_emailserver.csv')
em.save_table(C, path_to_csv_dir + 'matching_pairs_table_overlap3_emailserver.pkl')

sample_AOM.to_csv(path_to_csv_dir + 'sample_AOM.csv', encoding = 'UTF-8', index_label = csv_headers.AOM_INDEX)
sample_WHED.to_csv(path_to_csv_dir + 'sample_WHED.csv', encoding = 'UTF-8', index_label = csv_headers.WHED_INDEX)

# STEP 3 - MAGELLAN - MATCHING

In [4]:
# Load data
AOM = em.read_csv_metadata(path_to_csv_dir + '_aom.csv', key = csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(path_to_csv_dir + '_whed.csv', key = csv_headers.WHED_INDEX)
labeled_data = em.read_csv_metadata(path_to_csv_dir + 'golden_data_labeled_nomissing.csv', key='_id',
                                                                 ltable=WHED, rtable=AOM, 
                                                                 fk_ltable='ltable_' + csv_headers.WHED_INDEX, 
                                                                 fk_rtable='rtable_' + csv_headers.AOM_INDEX,encoding = "ISO-8859-1")

# Split train and test set
IJ = em.split_train_test(labeled_data, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [5]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
nb = em.SVMMatcher(name='NaiveBayes', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

In [6]:
# Generate a set of features
F = em.get_features_for_matching(WHED, AOM)

em.add_blackbox_feature(F, 'is_same_server', custom_matcher.is_same_server)
em.add_blackbox_feature(F, 'is_same_single_server', custom_matcher.is_same_single_server)

# F.feature_name

True

In [7]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)  

In [8]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(H))

True

In [9]:
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                strategy='mean')

In [10]:
# Select the best ML matcher using CV
# Precision Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='precision', random_state=0)
result['cv_stats']



Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x106fb13c8>,5,0.882353,0.769231,0.941176,0.933333,0.85,0.875219
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x106fb1550>,5,1.0,0.909091,0.947368,0.882353,1.0,0.947762
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1518>,5,0.846154,1.0,0.933333,1.0,1.0,0.955897
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1630>,5,0.846154,1.0,0.933333,1.0,1.0,0.955897
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x106fb1710>,5,1.0,0.727273,0.9,1.0,1.0,0.925455
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x106fb16d8>,5,0.933333,0.769231,0.894737,0.933333,0.9,0.886127


In [11]:
# Recall Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='recall', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x106fb13c8>,5,0.9375,0.909091,0.888889,0.823529,0.809524,0.873707
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x106fb1550>,5,0.875,0.909091,1.0,0.882353,0.809524,0.895194
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1518>,5,0.6875,0.363636,0.777778,0.529412,0.190476,0.50976
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1630>,5,0.6875,0.363636,0.777778,0.529412,0.190476,0.50976
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x106fb1710>,5,0.875,0.727273,1.0,0.764706,0.857143,0.844824
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x106fb16d8>,5,0.875,0.909091,0.944444,0.823529,0.857143,0.881842


In [12]:
# F1 Score
result = em.select_matcher([dt, rf, svm, nb, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
        k=5,
        target_attr='gold_label', metric='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x106fb13c8>,5,0.909091,0.833333,0.914286,0.875,0.829268,0.872196
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x106fb1550>,5,0.933333,0.909091,0.972973,0.882353,0.894737,0.918497
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1518>,5,0.758621,0.533333,0.848485,0.692308,0.32,0.630549
3,NaiveBayes,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x106fb1630>,5,0.758621,0.533333,0.848485,0.692308,0.32,0.630549
4,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x106fb1710>,5,0.933333,0.727273,0.947368,0.866667,0.923077,0.879544
5,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x106fb16d8>,5,0.903226,0.833333,0.918919,0.875,0.878049,0.881705


In [13]:
# Convert the J into a set of feature vectors using F
K = em.extract_feature_vecs(J, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)  
K = em.impute_table(K, 
                exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                strategy='mean')

In [24]:
# Choose Random Forest to train
rf.fit(table=H, exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'], target_attr='gold_label')

import pickle 
filename = 'rf.sav'
pickle.dump(rf, open(filename, 'wb'))


In [None]:
predictions = rf.predict(table=K, exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX, 'gold_label'],
                append=True, target_attr='predicted', inplace=False)

eval_summary = em.eval_matches(predictions, 'gold_label', 'predicted')
em.print_eval_summary(eval_summary)

### CHOOSE RANDOM FOREST ###

Below are the subsequent steps:
* Downsizing data - using whole AOM dataset with y_parameter of 3
* Blocking using the previous rule on the downsized dataset
* Applying the classifier on the blocked dataset

In [16]:
AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)

AOM.shape
# building inverted index based on B
final_WHED, final_AOM = em.down_sample(WHED, AOM, size=9532, y_param=3)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
0%                          100%
[##############################] | ETA: 00:00:43 | ETA: 00:00:40 | ETA: 00:00:38 | ETA: 00:00:36 | ETA: 00:00:34 | ETA: 00:00:33 | ETA: 00:00:32 | ETA: 00:00:30 | ETA: 00:00:29 | ETA: 00:00:27 | ETA: 00:00:26 | ETA: 00:00:25 | ETA: 00:00:23 | ETA: 00:00:22 | ETA: 00:00:20 | ETA: 00:00:19 | ETA: 00:00:17 | ETA: 00:00:16 | ETA: 00:00:15 | ETA: 00:00:13 | ETA: 00:00:12 | ETA: 00:00:11 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:42


In [17]:
print(len(final_WHED))
print(len(final_AOM))

8691
9532


In [18]:
final_C = blocker.blocking(final_WHED, final_AOM, csv_headers.WHED, csv_headers.AOM)
final_C.head(n=5)

0%                          100%
[##############################] | ETA: 00:32:28 | ETA: 00:33:02 | ETA: 00:30:53 | ETA: 00:29:41 | ETA: 00:28:11 | ETA: 00:26:49 | ETA: 00:25:34 | ETA: 00:24:30 | ETA: 00:23:24 | ETA: 00:22:02 | ETA: 00:20:44 | ETA: 00:19:36 | ETA: 00:18:32 | ETA: 00:17:31 | ETA: 00:16:28 | ETA: 00:15:21 | ETA: 00:14:30 | ETA: 00:13:21 | ETA: 00:12:18 | ETA: 00:11:18 | ETA: 00:10:08 | ETA: 00:08:56 | ETA: 00:07:46 | ETA: 00:06:37 | ETA: 00:05:31 | ETA: 00:04:26 | ETA: 00:03:20 | ETA: 00:02:13 | ETA: 00:01:06 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:33:10


Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_city,ltable_a_prov,ltable_a_country,ltable_a_web,rtable_a_name,rtable_a_city,rtable_a_prov,rtable_a_country,rtable_a_email_server
0,0,9,54111,aalborg university,aalborg,,denmark,http://www.aau.dk,aalborg university,branbrand,,denmark,u-k.dk
1,1,9,54203,aalborg university,aalborg,,denmark,http://www.aau.dk,aalborg university,aalborg,,denmark,hum.aau.dk
2,2,10,29724,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,espoo,,finland,aalto.fi
3,3,10,14261,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,espoo,,finland,stratnet.org
4,4,10,43727,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,helsinki,helsinki,finland,aalto.fi


In [64]:
em.save_table(final_C, path_to_csv_dir + 'final_C.pkl')

True

### Load model and classify

In [79]:
print(final_C.shape)
final_C.head(n=1000)

(27887, 13)


Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_city,ltable_a_prov,ltable_a_country,ltable_a_web,rtable_a_name,rtable_a_city,rtable_a_prov,rtable_a_country,rtable_a_email_server
0,0,9,54111,aalborg university,aalborg,,denmark,http://www.aau.dk,aalborg university,branbrand,,denmark,u-k.dk
1,1,9,54203,aalborg university,aalborg,,denmark,http://www.aau.dk,aalborg university,aalborg,,denmark,hum.aau.dk
2,2,10,29724,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,espoo,,finland,aalto.fi
3,3,10,14261,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,espoo,,finland,stratnet.org
4,4,10,43727,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,helsinki,helsinki,finland,aalto.fi
5,5,10,58505,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,espoo,,finland,aalto.fi
6,6,10,1239,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,helsinki,,finland,aalto.fi
7,7,10,9489,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,helsinki,,finland,aalto.fi
8,8,10,50927,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,helsinki,,finland,aalto.fi
9,9,10,53416,aalto university,espoo,,finland,http://www.aalto.fi/fi/,aalto university,aalto,,finland,aaltoee.fi


In [70]:
final_K = em.extract_feature_vecs(final_C, 
                                feature_table=F, 
                                show_progress=False)  
final_K = em.impute_table(final_K, 
                exclude_attrs=['_id', 'ltable_' + csv_headers.WHED_INDEX, 'rtable_' + csv_headers.AOM_INDEX],
                strategy='mean')

In [72]:
em.save_table(final_K, path_to_csv_dir + 'final_K.pkl')
final_K.head()

Unnamed: 0,_id,ltable_a_id,rtable_person_id,a_name_a_name_jac_qgm_3_qgm_3,a_name_a_name_cos_dlm_dc0_dlm_dc0,a_name_a_name_jac_dlm_dc0_dlm_dc0,a_name_a_name_mel,a_name_a_name_lev_dist,a_name_a_name_lev_sim,a_name_a_name_nmw,...,a_prov_a_prov_jac_qgm_3_qgm_3,a_prov_a_prov_cos_dlm_dc0_dlm_dc0,a_prov_a_prov_jac_dlm_dc0_dlm_dc0,a_prov_a_prov_mel,a_prov_a_prov_lev_dist,a_prov_a_prov_lev_sim,a_prov_a_prov_nmw,a_prov_a_prov_sw,is_same_server,is_same_single_server
0,0,9,54111,1.0,1.0,1.0,1.0,0.0,1.0,18.0,...,0.893409,0.890104,0.887168,0.956123,1.037413,0.909672,7.301507,8.041462,0.0,0.0
1,1,9,54203,1.0,1.0,1.0,1.0,0.0,1.0,18.0,...,0.893409,0.890104,0.887168,0.956123,1.037413,0.909672,7.301507,8.041462,0.0,1.0
2,2,10,29724,1.0,1.0,1.0,1.0,0.0,1.0,16.0,...,0.893409,0.890104,0.887168,0.956123,1.037413,0.909672,7.301507,8.041462,1.0,1.0
3,3,10,14261,1.0,1.0,1.0,1.0,0.0,1.0,16.0,...,0.893409,0.890104,0.887168,0.956123,1.037413,0.909672,7.301507,8.041462,0.0,0.0
4,4,10,43727,1.0,1.0,1.0,1.0,0.0,1.0,16.0,...,0.893409,0.890104,0.887168,0.956123,1.037413,0.909672,7.301507,8.041462,1.0,1.0


In [78]:
# We have 37 features
final_K.shape  

(27887, 37)

In [80]:
import pickle

filename = 'rf.sav'
# loaded_model = pickle.load(open(filename, 'r'), encoding='utf-8')
loaded_model = pickle.load(open(filename, 'r'))
predictions = rf.predict(table=final_K, append=True, target_attr='predicted', inplace=False)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [None]:
df = predictions[predictions['predicted'] == 1]
df.to_csv('matched_tuples.csv', index=False)