In [232]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0


In [233]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [234]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_city,a_prov,a_country,a_email_server
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,academy management,briarcliff manor,new york,united states,aom
4,nor astern university,boston,massachusetts,united states,gmail
5,skidmore college,saratoga springs,new york,united states,skidmore


In [239]:
rtuple = aom_cleaner.data.xs(59256)
rtuple
from constants import domains
print(domains.COUNTRY)

['ac', 'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'az', 'ba', 'bb', 'be', 'bf', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bt', 'by', 'bz', 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ck', 'cl', 'cm', 'cn', 'co', 'cr', 'cu', 'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'do', 'dz', 'ec', 'ee', 'eg', 'es', 'fi', 'fj', 'fk', 'fm', 'fr', 'fo', 'gb', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 'gr', 'gs', 'gt', 'gu', 'hk', 'hm', 'hn', 'hr', 'hu', 'id', 'ie', 'il', 'im', 'in', 'int', 'io', 'ir', 'is', 'it', 'je', 'jo', 'jp', 'ke', 'kg', 'kh', 'kr', 'kw', 'ky', 'kz', 'lb', 'lc', 'li', 'lk', 'lr', 'lt', 'lu', 'lv', 'ly', 'mc', 'md', 'mg', 'mh', 'mk', 'mm', 'mn', 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mx', 'my', 'mw', 'na', 'nc', 'nf', 'ni', 'nl', 'no', 'np', 'nu', 'nz', 'om', 'pa', 'pe', 'pg', 'ph', 'pk', 'qa', 're', 'ro', 'ru', 'rw', 'sa', 'sb', 'se', 'sg', 'sh', 'si', 'sk', 'sm', 'sn', 'so', 'st', 'su', 'sv', 'sz', 'tc', 'td', 'tf', 'th', 'tj', 'tm', 'tn', 't

In [44]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')
whed_cleaner.clean_states('a_prov', 'a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_country,a_city,a_prov,a_web
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,pampanga state agricultural university,philippines,magalang,pampanga,http://www.pac.edu.ph
4,les roches international school hotel management,switzerland,bluche crans montana,bluche-crans-montana,http://www.lesroches.edu
6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu


# STEP 2 - MAGELLAN - BLOCKING

In [98]:

AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
print(em.get_key(AOM))

WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)
print(em.get_key(WHED))


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


person_id
a_id


In [99]:
# building inverted index based on B
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=3000, y_param=10)
print(len(sample_WHED))
print(len(sample_AOM))
em.show_properties(sample_WHED)
em.show_properties(sample_AOM)

0%                          100%
[##############################] | ETA: 00:00:18 | ETA: 00:00:20 | ETA: 00:00:19 | ETA: 00:00:18 | ETA: 00:00:18 | ETA: 00:00:17 | ETA: 00:00:17 | ETA: 00:00:16 | ETA: 00:00:16 | ETA: 00:00:15 | ETA: 00:00:15 | ETA: 00:00:14 | ETA: 00:00:13 | ETA: 00:00:13 | ETA: 00:00:12 | ETA: 00:00:11 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00

11177
3000
id: 4752893656
key: a_id
id: 4752895000
key: person_id



Total time elapsed: 00:00:24


In [100]:
sample_WHED.head(n=5)

Unnamed: 0,a_id,a_name,a_country,a_city,a_prov,a_web
1,4,les roches international school hotel management,switzerland,bluche crans montana,bluche-crans-montana,http://www.lesroches.edu
2,6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu
3,7,a. d. patel institute technology,india,vitthal udyognagar,gujarat,http://www.adit.ac.in
4,8,a.t. still university,united states,kirksville,missouri,http://www.atsu.edu
5,9,aalborg university,denmark,aalborg,,http://www.aau.dk


In [101]:
sample_AOM.head(n=5)

Unnamed: 0,person_id,a_name,a_city,a_prov,a_country,a_email_server
5915,44473,wu vienna,vienna,,austria,wu
3514,24339,george washington university,washington,district of columbia,united states,gwu
974,4554,sheldon b. lubar school business,milwaukee,wisconsin,united states,uwm
3904,28014,erasmus university rotterdam,rotterdam,,netherlands,rsm
4076,29488,university st. gallen,sankt gallen,,switzerland,unisg


In [175]:
def match_country(ltuple, rtuple):
    l_country = ltuple['a_country']
    r_country = rtuple['a_country']
    if (l_country == "") or (r_country == ""):
        return None
    else:
        return ( l_country == r_country)

def match_country_us(ltuple, rtuple):
    return (rtuple['a_country'] == "united states")

def match_prov(ltuple, rtuple):
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    if (l_prov == "") or (r_prov == ""):
        return None
    else:
        return (l_prov == r_prov)

def match_city(ltuple, rtuple):
    l_city = ltuple['a_city']
    r_city = rtuple['a_city']
    if (l_city  == "") or (r_city == ""):
        return None
    else:
        return (l_city == r_city)

def match_provcity(ltuple, rtuple):
    l_city = ltuple['a_city']
    r_city = rtuple['a_city']
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    if (l_city  == "") or (r_city == "") or (l_prov == "") or (r_prov == ""):
        return None
    else:
        return (str(l_city) == str(r_prov)) or (str(l_prov) == str(r_city))

def match_domain(ltuple, rtuple):
    l_web = str(ltuple['a_web'])
    r_email = str(rtuple['a_email_server'])
    if (l_web == "") or (r_email == ""):
        return None
    else:
        l_web = l_web.split('.')
        return (r_email in l_web)

def match_overlap(ltuple, rtuple):
    l_name = str(ltuple['a_name'])
    r_name = str(rtuple['a_name'])
    
    if (l_name == "") or (r_name == ""):
        return None
    else:
        if l_name.count(" ") > 0 and r_name.count(" ") > 0:
            l_name = re.sub(r"(university|school|institute|college)","",l_name)
            r_name = re.sub(r"(university|school|institute|college)","",r_name)    
            l_tokens = em.tok_wspace(l_name)
            r_tokens = em.tok_wspace(r_name)
            return em.overlap_coeff(l_tokens, r_tokens) > 0.5
        else:
            return None
    
def allFalse(array):
    num_none = array.count(None)
    num_false = array.count(False)    
    return (num_none + num_false) == len(array) and num_none != len(array)
        
    
def match_combined(ltuple, rtuple):
    if match_country(ltuple, rtuple) is False:
        return True
    elif (match_country_us(ltuple,rtuple) is True) and (match_prov(ltuple,rtuple) is False):
        return True
    elif (match_country_us(ltuple,rtuple) is True) and (allFalse([match_overlap(ltuple, rtuple), 
                                                                   match_domain(ltuple, rtuple)]) is True):
        return True
    elif (match_country_us(ltuple,rtuple) is False) and allFalse([match_overlap(ltuple, rtuple), 
                                                                   match_domain(ltuple, rtuple)]) is True:
        return True
    else:
        return False
    
def blocking(A, B):
    bb = em.BlackBoxBlocker()
    bb.set_black_box_function(match_combined)
    C = bb.block_tables(A, B, l_output_attrs=['a_name','a_country','a_city','a_prov','a_web'], r_output_attrs=['a_name','a_country','a_city','a_prov','a_email_server'] )
    return C

C = blocking(sample_WHED, sample_AOM)


0%                          100%
[##############################] | ETA: 00:14:17 | ETA: 00:14:53 | ETA: 00:13:58 | ETA: 00:13:56 | ETA: 00:13:13 | ETA: 00:12:33 | ETA: 00:11:57 | ETA: 00:11:30 | ETA: 00:11:00 | ETA: 00:10:19 | ETA: 00:09:41 | ETA: 00:09:10 | ETA: 00:08:40 | ETA: 00:08:10 | ETA: 00:07:40 | ETA: 00:07:13 | ETA: 00:06:43 | ETA: 00:06:12 | ETA: 00:05:44 | ETA: 00:12:03 | ETA: 00:10:55 | ETA: 00:09:33 | ETA: 00:08:10 | ETA: 00:06:52 | ETA: 00:05:38 | ETA: 00:04:26 | ETA: 00:03:16 | ETA: 00:02:08 | ETA: 00:01:03 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:31:04


In [176]:
C.size


388570

In [177]:
C.head(n=1000)

Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_country,ltable_a_city,ltable_a_prov,ltable_a_web,rtable_a_name,rtable_a_country,rtable_a_city,rtable_a_prov,rtable_a_email_server
0,0,7,57798,a. d. patel institute technology,india,vitthal udyognagar,gujarat,http://www.adit.ac.in,indian institute technology delhi,india,new delhi,,iitd
1,1,10,17806,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,aalto
2,2,10,54466,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
3,3,10,58872,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
4,4,10,1239,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
5,5,10,15158,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,aalto,aalto
6,6,10,58547,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
7,7,10,50927,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
8,8,10,59274,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
9,9,10,54470,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto


In [230]:
rtuple = aom_cleaner.data.xs(59256)
rtuple

a_name            abubakar tafawa balewa university
a_city                                       bauchi
a_prov                                             
a_country                                   nigeria
a_email_server                                   ng
Name: 59256, dtype: object

In [158]:
ltuple = whed_cleaner.data.xs(676)
ltuple

a_name       academy fine arts vienna
a_country                     austria
a_city                           wien
a_prov                         vienna
a_web         http://www.akbild.ac.at
Name: 676, dtype: object

In [226]:
l_tokens = ['a.','d.','patel','technology']
r_tokens = ['indian','technology','delhi']
em.overlap_coeff(l_tokens, r_tokens)

0.3333333333333333