In [1]:
from owlready2 import onto_path, get_ontology
import os
import itertools
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup 

In [2]:
def read_ontology(path):
    onto = get_ontology(path)
    onto.load()

    # Read classes
    classes = []

    for cl in onto.classes():
        classes.append(cl)

    classes = list(set(classes))

    # Read properties
    properties = []
    
    for prop in onto.properties():
        properties.append(prop)
        
    properties = list(set(properties))
    
    return classes, properties

def get_mappings(path_to_rdf):
    tree = ET.parse(path_to_rdf)
    root = tree.getroot()

    for alignment in root:
        mappings = []
        for tags in alignment:
            #print child.tag
            match = []
            if 'map' in tags.tag:
                for cell in tags:
                    for entity in cell:
                        if 'entity' in entity.tag:
                            match.append(list(entity.attrib.values())[0].split('#')[1])
                mappings.append(match)
                
    return mappings


def get_mappings(filename):
    #filename = 'dataset1/alignments/101-301.rdf'
    mappings = []

    with open(filename) as f:
        soup = BeautifulSoup(f, 'xml')

    cells = soup.find_all('Cell')

    for cell in cells:
        entity1 = cell.find('entity1').attrs['rdf:resource'].split('#')[1]
        entity2 = cell.find('entity2').attrs['rdf:resource'].split('#')[1]
        mappings.append((entity1, entity2))
        
    return mappings

def get_dataset_from_alignment(ont1_path, ont2_path, alignment_path):

    data = []

    mappings = get_mappings(alignment_path)
    mappings = [tuple(x) for x in mappings]
    print(len(mappings))

    # Parse ontologies
    onto_classes1, object_properties1, data_properties1 = read_ontology(ont1_path)
    onto_classes2, object_properties2, data_properties2 = read_ontology(ont2_path)

    # Generate pairs of classes 
    class_pairs = list(itertools.product(onto_classes1, onto_classes2))

    for pair in tqdm(class_pairs):
        if pair in mappings:
            data.append((ont1_path, ont2_path, pair[0], pair[1], 1, 'Class'))
            mappings.remove(pair)
        else:
            data.append((ont1_path, ont2_path, pair[0], pair[1], 0, 'Class'))

    # Generate pairs of object properties
    object_properties_pairs = list(itertools.product(object_properties1, object_properties2))

    for pair in tqdm(object_properties_pairs):
        if pair in mappings:
            data.append((
                ont1_path, ont2_path, pair[0], pair[1], 1, 'ObjectProperty'))
            mappings.remove(pair)
        else:
            data.append((ont1_path, ont2_path, pair[0], pair[1], 0, 'ObjectProperty'))

    # Generate pairs of data properties
    data_properties_pairs = list(itertools.product(data_properties1, data_properties2))

    for pair in tqdm(data_properties_pairs):
        if pair in mappings:
            data.append((ont1_path, ont2_path, pair[0], pair[1], 1, 'DataProperty'))
            mappings.remove(pair)
        else:
            data.append((ont1_path, ont2_path, pair[0], pair[1], 0, 'DataProperty'))

    dataset = pd.DataFrame(data, columns=['Ontology1', 'Ontology2', 'Entity1', 'Entity2', 'Match', 'Type'])
    
    return dataset

def get_dataset(ont1_path, ont2_path, alignment_path):
    data = []

    #alignment_path = 'dataset1/alignments/101-301.rdf'
    #ont1_path = 'dataset1/ontologies/101.rdf'
    #ont2_path = 'dataset1/ontologies/301.rdf'

    mappings = get_mappings(alignment_path)
    mappings = [tuple(x) for x in mappings]
    print('Number of mappings', len(mappings))

    all_mappings = []

    # Parse ontologies
    classes1, properties1 = read_ontology(ont1_path)
    classes2, properties2 = read_ontology(ont2_path)

    # Generate pairs of classes 
    class_pairs = list(itertools.product(classes1, classes2))

    for class_pair in tqdm(class_pairs):
        pair = (class_pair[0].name, class_pair[1].name)
        if pair in mappings:
            match = 1
            all_mappings.append(pair)
            mappings.remove(pair)
        else:
            match = 0

        data.append((ont1_path, ont2_path, pair[0], pair[1], class_pair[0].is_a[0].name, class_pair[1].is_a[0].name, 
                     get_path(class_pair[0]), get_path(class_pair[1]),match, 'Class'))

    # Generate pairs of properties
    properties_pairs = list(itertools.product(properties1, properties2))

    for prop_pair in tqdm(properties_pairs):
        pair = (prop_pair[0].name, prop_pair[1].name)
        if pair in mappings:
            match = 1
            all_mappings.append(pair)
            mappings.remove(pair)
        else:
            match = 0

        data.append((ont1_path, ont2_path, pair[0], pair[1], class_pair[0].is_a[0].name, class_pair[1].is_a[0].name,
                     get_path(class_pair[0]), get_path(class_pair[1]), match, 'Property'))

    print('Readed mappings', len(all_mappings), '\n')

    dataset = pd.DataFrame(data, columns=['Ontology1', 'Ontology2', 'Entity1', 'Entity2', 'Parent1', 'Parent2',
                                          'Path1', 'Path2', 'Match', 'Type'])

    return dataset

def get_path(cl):
    path = cl.name
    while True:
        try:
            path = path + '/' + cl.is_a[0].name
        except IndexError:
            break
        cl = cl.is_a[0]
        if cl == 'owl.Thing':
            break
    
    return '/'.join(path.split('/')[::-1])

In [20]:
PATH_TO_ONTOLOGIES = 'dataset1/ontologies/'
#PATH_TO_ONTOLOGIES = 'dataset2/ontologies/'
ONTOLOGIES = ['101.rdf', '102.rdf', '103.rdf', '301.rdf', '302.rdf', '303.rdf', '304.rdf']
#ONTOLOGIES = ['confof.owl', 'edas.owl', 'conference.owl', 'myreview.owl', 'sigkdd.owl', 'linklings.owl', 'pcs.owl',
#              'paperdyne.owl', 'openconf.owl', '205.rdf', 'cmt.owl', '204.rdf', 'iasted.owl', '238.rdf',
#              'crs_dr.owl', 'micro.owl', 'ekaw.owl','confious.owl', '301.rdf', '101.rdf', 'cocus.owl', '304.rdf']

PATH_TO_ALIGNMENTS = 'dataset1/alignments/'
#PATH_TO_ALIGNMENTS = 'dataset2/alignments/'
#TRAIN_ALIGNMENTS = ['cmt-conference.rdf', 'conference-iasted.rdf', 'edas-ekaw.rdf', '101-204.rdf', '101-205.rdf',
#                   '101-238.rdf', '101-301.rdf', '101-304.rdf']

#TEST_ALIGNMENTS = ['conference-edas.rdf', 'cmt-sigkdd.rdf', 'edas-sigkdd.rdf', 'ekaw-sigkdd.rdf',
#                   'cmt-edas.rdf', 'conference-sigkdd.rdf', 'confof-edas.rdf', 'confof-iasted.rdf',
#                   'conference-confof.rdf', 'cmt-confof.rdf', 'conference-ekaw.rdf', 'cmt-ekaw.rdf',
#                   'confof-ekaw.rdf', 'iasted-sigkdd.rdf', 'cmt-iasted.rdf', 'edas-iasted.rdf', 'ekaw-iasted.rdf',
#                   'confof-sigkdd.rdf']
#TRAIN_ALIGNMENTS = [('101', '301'), ('101', '102'), ('101', '103')]
#TEST_ALIGNMENTS = [('101', '302'), ('101', '303'), ('101', '304')]

TRAIN_ALIGNMENTS = ['101-301.rdf', '101-102.rdf', '101-103.rdf']
TEST_ALIGNMENTS = ['101-302.rdf', '101-303.rdf', '101-304.rdf']

In [21]:
datasets = []

for align_name in TRAIN_ALIGNMENTS:
    print('Read from', align_name)
    ont1, ont2 = align_name.split('.')[0].split('-')
    if '101' in align_name:
        ont1_path = PATH_TO_ONTOLOGIES + ont1 + '.rdf'
        ont2_path = PATH_TO_ONTOLOGIES + ont2 + '.rdf'
    else:
        ont1_path = PATH_TO_ONTOLOGIES + ont1 + '.owl'
        ont2_path = PATH_TO_ONTOLOGIES + ont2 + '.owl'
    alignment_path = PATH_TO_ALIGNMENTS + align_name
    
    datasets.append(get_dataset(ont1_path, ont2_path, alignment_path))
    
train = pd.concat(datasets, ignore_index = True)

datasets = []

for align_name in TEST_ALIGNMENTS:
    print('Read from', align_name)
    ont1, ont2 = align_name.split('.')[0].split('-')
    if '101' in align_name:
        ont1_path = PATH_TO_ONTOLOGIES + ont1 + '.rdf'
        ont2_path = PATH_TO_ONTOLOGIES + ont2 + '.rdf'
    else:
        ont1_path = PATH_TO_ONTOLOGIES + ont1 + '.owl'
        ont2_path = PATH_TO_ONTOLOGIES + ont2 + '.owl'
    alignment_path = PATH_TO_ALIGNMENTS + align_name
    
    datasets.append(get_dataset(ont1_path, ont2_path, alignment_path))
    
test = pd.concat(datasets, ignore_index = True)

100%|██████████| 540/540 [00:00<00:00, 41957.80it/s]
100%|██████████| 3528/3528 [00:00<00:00, 43501.86it/s]
100%|██████████| 2664/2664 [00:00<00:00, 79193.90it/s]

Read from 101-301.rdf
Number of mappings 59
Readed mappings 59 

Read from 101-102.rdf
Number of mappings 0



100%|██████████| 936/936 [00:00<00:00, 55637.94it/s]
100%|██████████| 1296/1296 [00:00<00:00, 47694.33it/s]
100%|██████████| 5184/5184 [00:00<00:00, 55044.17it/s]


Readed mappings 0 

Read from 101-103.rdf
Number of mappings 97
Readed mappings 97 



100%|██████████| 468/468 [00:00<00:00, 72647.46it/s]
100%|██████████| 2160/2160 [00:00<00:00, 45625.39it/s]
100%|██████████| 2016/2016 [00:00<00:00, 38893.49it/s]
  0%|          | 0/5184 [00:00<?, ?it/s]

Read from 101-302.rdf
Number of mappings 48
Readed mappings 48 

Read from 101-303.rdf
Number of mappings 48


100%|██████████| 5184/5184 [00:00<00:00, 43986.58it/s]
100%|██████████| 1440/1440 [00:00<00:00, 35261.89it/s]
  0%|          | 0/3672 [00:00<?, ?it/s]

Readed mappings 48 

Read from 101-304.rdf
Number of mappings 76


100%|██████████| 3672/3672 [00:00<00:00, 35171.40it/s]

Readed mappings 76 






In [15]:
# N-grams similarity
import ngram
import editdistance
from fuzzycomp import fuzzycomp
from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
from py_stringmatching.similarity_measure.affine import Affine
from py_stringmatching.similarity_measure.bag_distance import BagDistance
from py_stringmatching.similarity_measure.cosine import Cosine
from py_stringmatching.similarity_measure.partial_ratio import PartialRatio
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
from py_stringmatching.similarity_measure.editex import Editex
from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
from py_stringmatching.similarity_measure.dice import Dice
from py_stringmatching.similarity_measure.jaccard import Jaccard
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
from py_stringmatching.similarity_measure.partial_token_sort import PartialTokenSort
from py_stringmatching.similarity_measure.ratio import Ratio
from py_stringmatching.similarity_measure.soundex import Soundex
from py_stringmatching.similarity_measure.tfidf import TfIdf
from py_stringmatching.similarity_measure.token_sort import TokenSort
from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
from re import finditer
from nltk.corpus import wordnet
from itertools import product
from gensim.models import KeyedVectors

af = Affine()
me = MongeElkan()
nw = NeedlemanWunsch()
sw = SmithWaterman()
bd = BagDistance()
cos = Cosine()
pr = PartialRatio()
sf = SoftTfIdf()
edx = Editex()
gj = GeneralizedJaccard()
jw = JaroWinkler()
lev = Levenshtein()
dice = Dice()
jac = Jaccard()
jaro = Jaro()
pts = PartialTokenSort()
rat = Ratio()
sound = Soundex()
tfidf = TfIdf()
ts = TokenSort()
tv_ind = TverskyIndex()
over_coef = OverlapCoefficient()

def camel_case_split(identifier):
    matches = finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]

def get_word2vec_sim(row_set1, row_set2):
    sum_sim = 0
    N = max(len(row_set1), len(row_set2))

    for w1 in row_set1:
        maxSim = 0
        for w2 in row_set2:
            try:
                sim = model.wv.similarity(w1, w2)
            except:
                sim = 0

            if sim > maxSim:
                maxSim = sim
        sum_sim = sum_sim + maxSim

    sum_sim = sum_sim / N
    
    return sum_sim


def get_words(text):
    if '_' in text:
        row_set = text.split('_')
    else:
        if '-' in text:
            row_set = text.split('-')
        else:
            row_set = camel_case_split(text)
    
    row_set = [x.lower() for x in row_set]
    return row_set

# It's long
model = KeyedVectors.load_word2vec_format('/Users/leo/PycharmProjects/machine-learning-ontology-matching/GoogleNews-vectors-negative300.bin', binary=True)

In [17]:
#train.to_csv('dataset1_train.csv', index = False)
#test.to_csv('dataset1_test.csv', index = False)


train.to_csv('dataset1_train_reiterate.csv', index = False)
test.to_csv('dataset1_test_reiterate.csv', index = False)


In [22]:
SELECTED_DATASET = 'dataset1'

# Calculate features for training dataset
data = pd.read_csv(SELECTED_DATASET + '_train_reiterate.csv')

data = calculate_features(data, 'Entity')
data = calculate_features(data, 'Parent')
data = calculate_features(data, 'Path')

data.to_csv(SELECTED_DATASET + '_train_features_reiterate.csv', index=False)

# Calculate features for testing dataset
data = pd.read_csv(SELECTED_DATASET + '_test_reiterate.csv')

data = calculate_features(data, 'Entity')
data = calculate_features(data, 'Parent')
data = calculate_features(data, 'Path')

data.to_csv(SELECTED_DATASET + '_test_features_reiterate.csv', index=False)

14148it [02:38, 89.36it/s] 
14148it [02:33, 91.89it/s] 
14148it [02:37, 89.89it/s]
14940it [02:34, 96.97it/s] 
14940it [02:19, 107.03it/s]
14940it [03:41, 67.43it/s] 


In [8]:
test

Unnamed: 0,Ontology1,Ontology2,Entity1,Entity2,Parent1,Parent2,Path1,Path2,Match,Type
0,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,PersonalPublicationHistory,Reviewed_contribution,PersonalHistory,Thing/Conference_document/Conference_contribut...,Thing/PersonalHistory/PersonalPublicationHistory,0,Class
1,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,MeetingRoomPlace,Reviewed_contribution,Place,Thing/Conference_document/Conference_contribut...,Thing/Place/MeetingRoomPlace,0,Class
2,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Presenter,Reviewed_contribution,Author,Thing/Conference_document/Conference_contribut...,Thing/Author/Presenter,0,Class
3,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Programme,Reviewed_contribution,Document,Thing/Conference_document/Conference_contribut...,Thing/Document/Programme,0,Class
4,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,SecurityTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/SecurityTopic,0,Class
5,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,ComputerNetworksOpticalTopic,Reviewed_contribution,ComputerNetworksTopic,Thing/Conference_document/Conference_contribut...,Thing/Topic/ComputerNetworksTopic/ComputerNetw...,0,Class
6,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Conference,Reviewed_contribution,Thing,Thing/Conference_document/Conference_contribut...,Thing/Conference,0,Class
7,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,CoffeeBreak,Reviewed_contribution,BreakEvent,Thing/Conference_document/Conference_contribut...,Thing/ConferenceEvent/NonAcademicEvent/BreakEv...,0,Class
8,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WeekRejectRating,Reviewed_contribution,ReviewRating,Thing/Conference_document/Conference_contribut...,Thing/ReviewRating/WeekRejectRating,0,Class
9,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WirelessCommunicationsTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/WirelessCommunicationsTopic,0,Class


In [18]:
def calculate_features(dataset, string_type): 
    
    ngrams1 = []
    ngrams2 = []
    ngrams3 = []
    ngrams4 = []
    dices = []
    jaccards = []
    jaros = []
    lcs = []
    mes = []
    sws = []
    afs = []
    bds = []
    coses = []
    prs = []
    sfs = []
    edxs = []
    gjs = []
    jws = []
    lws = []
    ptss = []
    rats = []
    sounds = []
    tfidfs = []
    tss = []
    tvs = []
    ovs = []
    nws = []
    wordnet_sims = []
    w2vec_sims = []
    
    if string_type == 'Entity':
        index = 2
    elif string_type == 'Parent':
        index = 4
    elif string_type == 'Path':
        index = 6
    
    for key, row in tqdm(dataset.iterrows()):
            
        string1 = row[index]
        string2 = row[index + 1]
            
        ngrams1.append(ngram.NGram.compare(string1, string2, N=1))
        ngrams2.append(ngram.NGram.compare(string1, string2, N=2))
        ngrams3.append(ngram.NGram.compare(string1, string2, N=3))
        ngrams4.append(ngram.NGram.compare(string1, string2, N=4))
        lws.append(lev.get_sim_score(string1, string2))
        jaros.append(jaro.get_sim_score(string1, string2))
        lcs.append(2 * fuzzycomp.lcs_length(string1, string2) / (len(string1) + len(string2)))
        nws.append(nw.get_raw_score(string1, string2))
        sws.append(sw.get_raw_score(string1, string2))
        afs.append(af.get_raw_score(string1, string2))
        bds.append(bd.get_sim_score(string1, string2))
        prs.append(pr.get_sim_score(string1, string2))
        edxs.append(edx.get_sim_score(string1, string2))
        ptss.append(pts.get_sim_score(string1, string2))
        rats.append(rat.get_sim_score(string1, string2))
        sounds.append(sound.get_sim_score(string1, string2))
        tss.append(ts.get_sim_score(string1, string2))
        jws.append(jw.get_sim_score(string1, string2))

        row_set1 = get_words(string1)
        row_set2 = get_words(string2)

        mes.append(me.get_raw_score(row_set1, row_set2))
        coses.append(cos.get_sim_score(row_set1, row_set2))
        sfs.append(sf.get_raw_score(row_set1, row_set2))
        gjs.append(gj.get_sim_score(row_set1, row_set2))
        tfidfs.append(tfidf.get_sim_score(row_set1, row_set2))
        tvs.append(tv_ind.get_sim_score(row_set1, row_set2))
        ovs.append(over_coef.get_sim_score(row_set1, row_set2))
        dices.append(dice.get_sim_score(row_set1, row_set2))
        jaccards.append(jac.get_sim_score(row_set1, row_set2))

        allsyns1 = set(ss for word in row_set1 for ss in wordnet.synsets(word))
        allsyns2 = set(ss for word in row_set2 for ss in wordnet.synsets(word))

        best = [wordnet.wup_similarity(s1, s2) for s1, s2 in product(allsyns1, allsyns2)]
        if len(best) > 0:
            wordnet_sims.append(best[0])
        else:
            wordnet_sims.append(0)

        w2vec_sims.append(get_word2vec_sim(row_set1, row_set2))
    
    dataset['Ngram1' + '_' + string_type] = ngrams1
    dataset['Ngram2' + '_' + string_type] = ngrams2
    dataset['Ngram3' + '_' + string_type] = ngrams3
    dataset['Ngram4' + '_' + string_type] = ngrams4
    dataset['Dice' + '_' + string_type] = dices
    dataset['Jaccard' + '_' + string_type] = jaccards
    dataset['Jaro' + '_' + string_type] = jaros
    dataset['Longest_com_sub' + '_' + string_type] = lcs
    dataset['Monge-Elkan' + '_' + string_type] = mes
    dataset['SmithWaterman' + '_' + string_type] = sws
    dataset['AffineGap' + '_' + string_type] = afs
    dataset['BagDistance' + '_' + string_type] = bds
    dataset['Cosine_similarity' + '_' + string_type] = coses
    dataset['PartialRatio' + '_' + string_type] = prs
    dataset['Soft_TFIDF' + '_' + string_type] = sfs
    dataset['Editex' + '_' + string_type] = edxs
    dataset['GeneralizedJaccard' + '_' + string_type] = gjs
    dataset['JaroWinkler' + '_' + string_type] = jws
    dataset['Levenshtein' + '_' + string_type] = lws
    dataset['PartialTokenSort' + '_' + string_type] = ptss
    dataset['Ratio' + '_' + string_type] = rats
    dataset['Soundex' + '_' + string_type] = sounds
    dataset['TFIDF' + '_' + string_type] = tfidfs
    dataset['TokenSort' + '_' + string_type] = tss
    dataset['TverskyIndex' + '_' + string_type] = tvs
    dataset['OverlapCoef' + '_' + string_type] = ovs
    dataset['Needleman-Wunsch' + '_' + string_type] = nws
    dataset['Wordnet_sim' + '_' + string_type] = wordnet_sims
    dataset['Word2vec_sim' + '_' + string_type] = w2vec_sims
    
    return dataset

In [584]:
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

lev = Levenshtein()
lws_train = []
lws_test = []

for key, row in tqdm(train.iterrows()):
    lws_train.append(lev.get_sim_score(row[2], row[3]))
    
for key, row in tqdm(test.iterrows()):
    lws_test.append(lev.get_sim_score(row[2], row[3]))

14148it [00:02, 6924.56it/s]
14940it [00:01, 8768.56it/s]


In [585]:
predicts = []

train['Levenshtein'] = lws_train
test['Levenshtein'] = lws_test

for x in lws_test:
    pred = 0
    
    if x >= 0.5:
        pred = 1
        
    predicts.append(pred)
    
test['Predict'] = predicts

In [68]:
test['Predict'] = predictions

In [73]:
pred_mappings = test[(test['Ontology1'] == "dataset1/ontologies/101.rdf") &
     (test['Ontology2'] == "dataset1/ontologies/302.rdf") &
     (test['Predict'] == 1)]

true_mappings = test[(test['Ontology1'] == "dataset1/ontologies/101.rdf") &
     (test['Ontology2'] == "dataset1/ontologies/302.rdf") &
     (test['Match'] == 1)]

correct_mappings = test[(test['Ontology1'] == "dataset1/ontologies/101.rdf") &
     (test['Ontology2'] == "dataset1/ontologies/302.rdf") &
     (test['Match'] == 1) & (test['Predict'] == 1)]

In [83]:
pred_mappings

Unnamed: 0,Ontology1,Ontology2,Entity1,Entity2,Parent1,Parent2,Path1,Path2,Match,Type,Predict
2642,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Collection,InCollection,Book,Publication,Thing/Reference/Book/Collection,Thing/Publication/InCollection,0,Class,1
2729,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Person,Person,Thing,Thing,Thing/Person,Thing/Person,0,Class,1
2869,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Book,InBook,Reference,Publication,Thing/Reference/Book,Thing/Publication/InBook,0,Class,1
2870,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Book,Book,Reference,Publication,Thing/Reference/Book,Thing/Publication/Book,1,Class,1
3017,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Booklet,Booklet,Informal,Publication,Thing/Reference/Informal/Booklet,Thing/Publication/Booklet,1,Class,1
3176,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Misc,Misc,Reference,Publication,Thing/Reference/Misc,Thing/Publication/Misc,1,Class,1
3202,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,InCollection,InCollection,Part,Publication,Thing/Reference/Part/InCollection,Thing/Publication/InCollection,1,Class,1
3380,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,MastersThesis,MasterThesis,Academic,Thesis,Thing/Reference/Academic/MastersThesis,Thing/Publication/Thesis/MasterThesis,1,Class,1
3448,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Article,Article,Part,Publication,Thing/Reference/Part/Article,Thing/Publication/Article,1,Class,1
3620,dataset1/ontologies/101.rdf,dataset1/ontologies/303.rdf,Academic,AcademicStaff,Reference,Employee,Thing/Reference/Academic,Thing/Person/Employee/AcademicStaff,0,Class,1


In [563]:
# Write to Alignment API Format
file = open("101-302-test.rdf","w") 
 
file.write("<?xml version='1.0' encoding='utf-8' standalone='no'?> \n") 
           
file.write("<rdf:RDF xmlns='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#\' \n xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\' \n xmlns:xsd=\'http://www.w3.org/2001/XMLSchema#\' \n xmlns:align=\'http://knowledgeweb.semanticweb.org/heterogeneity/alignment#\'> \n")

file.write("<Alignment>\n")

file.write("\t<xml>yes</xml>\n")
file.write("\t<level>0</level>\n")
file.write("\t<type>11</type>\n")

for key, row in tqdm(mappings.iterrows()):
    file.write("\t<map>\n")
    file.write("\t\t<Cell>\n")
    file.write("\t\t\t<entity1 rdf:resource=\'http://oaei.ontologymatching.org/tests/101/onto.rdf#" + row[2] + "\'/>\n")
    file.write("\t\t\t<entity2 rdf:resource=\'http://ebiquity.umbc.edu/v2.1/ontology/publication.owl#"+ row[3] +"\'/>\n")
    file.write("\t\t\t<measure rdf:datatype=\'http://www.w3.org/2001/XMLSchema#float\'>1.0</measure>\n")
    file.write("\t\t\t<relation>=</relation>\n")
    file.write("\t\t</Cell>\n")
    file.write("\t</map>\n")
    
file.write("</Alignment>\n")
file.write("</rdf:RDF>\n")
 
file.close() 

69it [00:00, 3814.91it/s]


In [72]:
predictions

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [74]:
true_num = len(true_mappings)
predict_num = len(pred_mappings)
correct_num = len(correct_mappings)

precision = correct_num / predict_num
recall = correct_num / true_num
fmeasure = 2 * precision * recall / (precision + recall)

In [75]:
print(precision, recall, fmeasure)

0.6190476190476191 0.8125 0.7027027027027026


In [55]:
train_features = pd.read_csv('dataset1_train_features.csv')
test_features = pd.read_csv('dataset1_test_features.csv')

train_types = []

for row in train_features['Type']:
    if row == 'Class':
        train_types.append(1)
    else:
        train_types.append(0)
        
train_features['Type_encode'] = train_types

test_types = []

for row in test_features['Type']:
    if row == 'Class':
        test_types.append(1)
    else:
        test_types.append(0)
        
test_features['Type_encode'] = test_types

X_train = train_features.loc[:, 'Ngram1_Entity':'Type_encode']
y_train = train_features['Match']

X_test = test_features.loc[:, 'Ngram1_Entity':'Type_encode']
y_test = test_features['Match']

df_train = train_features.loc[:, 'Ngram1_Entity':'Type_encode']
df_train['Match'] = train_features['Match']

df_test = test_features.loc[:, 'Ngram1_Entity':'Type_encode']
df_test['Match'] = test_features['Match']


Unnamed: 0,Ngram1_Entity,Ngram2_Entity,Ngram3_Entity,Ngram4_Entity,Dice_Entity,Jaccard_Entity,Jaro_Entity,Longest_com_sub_Entity,Monge-Elkan_Entity,SmithWaterman_Entity,...,Ratio_Path,Soundex_Path,TFIDF_Path,TokenSort_Path,TverskyIndex_Path,OverlapCoef_Path,Needleman-Wunsch_Path,Wordnet_sim_Path,Word2vec_sim_Path,Type_encode
0,0.166667,0.000000,0.000000,0.000000,0.0,0.0,0.450000,0.142857,0.450000,1.0,...,0.34,0,0.0,0.38,0.0,0.0,-6.0,0.0,0.0,1
1,0.416667,0.055556,0.000000,0.000000,0.0,0.0,0.657143,0.470588,0.657143,3.0,...,0.60,0,0.0,0.64,0.0,0.0,3.0,0.0,0.0,1
2,0.166667,0.000000,0.000000,0.000000,0.0,0.0,0.450000,0.285714,0.450000,1.0,...,0.51,0,0.0,0.55,0.0,0.0,-3.0,0.0,0.0,1
3,0.235294,0.000000,0.000000,0.000000,0.0,0.0,0.460606,0.190476,0.460606,1.0,...,0.26,0,0.0,0.30,0.0,0.0,1.0,0.0,0.0,1
4,0.416667,0.117647,0.000000,0.000000,0.0,0.0,0.604762,0.235294,0.604762,2.0,...,0.40,0,0.0,0.48,0.0,0.0,-2.0,0.0,0.0,1
5,0.153846,0.000000,0.000000,0.000000,0.0,0.0,0.433333,0.133333,0.533333,1.0,...,0.33,0,0.0,0.38,0.0,0.0,-12.0,0.0,0.0,1
6,0.250000,0.047619,0.000000,0.000000,0.0,0.0,0.516667,0.300000,0.488889,2.0,...,0.38,0,0.0,0.45,0.0,0.0,1.0,0.0,0.0,1
7,0.277778,0.000000,0.000000,0.000000,0.0,0.0,0.561538,0.347826,0.561538,2.0,...,0.46,0,0.0,0.36,0.0,0.0,4.0,0.0,0.0,1
8,0.150000,0.000000,0.000000,0.000000,0.0,0.0,0.399145,0.260870,0.399145,1.0,...,0.36,0,0.0,0.39,0.0,0.0,4.0,0.0,0.0,1
9,0.692308,0.600000,0.529412,0.473684,0.0,0.0,0.809259,0.818182,0.944444,9.0,...,0.65,0,0.0,0.62,0.0,0.0,10.0,0.0,0.0,1


In [87]:
from auto_ml import Predictor
from auto_ml.utils_models import load_ml_model

# Load data
#df_train, df_test = get_boston_dataset()

# Tell auto_ml which column is 'output'
# Also note columns that aren't purely numerical
# Examples include ['nlp', 'date', 'categorical', 'ignore']
column_descriptions = {
    'Match': 'output',
    'Type_encode': 'categorical'
}

models = ['AdaBoostClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'LogisticRegression',
          'PassiveAggressiveClassifier', 'Perceptron', 'RandomForestClassifier', 'RidgeClassifier',
          'SGDClassifier', 'DeepLearningClassifier', 'LGBMClassifier', 'XGBClassifier']

models = ['GradientBoostingClassifier']

ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

ml_predictor.train(df_train, optimize_final_model = True, model_names = models)

# Score the model on test data
test_score = ml_predictor.score(df_test, df_test.Match)

# auto_ml is specifically tuned for running in production
# It can get predictions on an individual row (passed in as a dictionary)
# A single prediction like this takes ~1 millisecond
# Here we will demonstrate saving the trained model, and loading it again
file_name = ml_predictor.save()

trained_model = load_ml_model(file_name)

# .predict and .predict_proba take in either:
# A pandas DataFrame
# A list of dictionaries
# A single dictionary (optimized for speed in production evironments)
predictions = trained_model.predict(df_test)
print(predictions)

Welcome to auto_ml! We're about to go through and make sense of your data using machine learning, and give you a production-ready pipeline to get predictions with.

If you have any issues, or new feature ideas, let us know at http://auto.ml
You are running on version 2.9.10
Now using the model training_params that you passed in:
{}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{'presort': False, 'learning_rate': 0.1, 'warm_start': True}
Running basic data cleaning
Fitting DataFrameVectorizer
Now using the model training_params that you passed in:
{}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{'presort': False, 'learning_rate': 0.1, 'warm_start': True}


ModuleNotFoundError: No module named 'multiprocessing.managers'

In [79]:
import sklearn

In [80]:
sklearn.ensemble.AdaBoostClassifier

sklearn.ensemble.weight_boosting.AdaBoostClassifier

In [93]:
from autosklearn.metrics import Scorer

ModuleNotFoundError: No module named 'autosklearn'

In [91]:
scorer.f1_score

<function sklearn.metrics.classification.f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)>

In [11]:
test

Unnamed: 0,Ontology1,Ontology2,Entity1,Entity2,Parent1,Parent2,Path1,Path2,Match,Type
0,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,PersonalPublicationHistory,Reviewed_contribution,PersonalHistory,Thing/Conference_document/Conference_contribut...,Thing/PersonalHistory/PersonalPublicationHistory,0,Class
1,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,MeetingRoomPlace,Reviewed_contribution,Place,Thing/Conference_document/Conference_contribut...,Thing/Place/MeetingRoomPlace,0,Class
2,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Presenter,Reviewed_contribution,Author,Thing/Conference_document/Conference_contribut...,Thing/Author/Presenter,0,Class
3,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Programme,Reviewed_contribution,Document,Thing/Conference_document/Conference_contribut...,Thing/Document/Programme,0,Class
4,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,SecurityTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/SecurityTopic,0,Class
5,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,ComputerNetworksOpticalTopic,Reviewed_contribution,ComputerNetworksTopic,Thing/Conference_document/Conference_contribut...,Thing/Topic/ComputerNetworksTopic/ComputerNetw...,0,Class
6,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Conference,Reviewed_contribution,Thing,Thing/Conference_document/Conference_contribut...,Thing/Conference,0,Class
7,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,CoffeeBreak,Reviewed_contribution,BreakEvent,Thing/Conference_document/Conference_contribut...,Thing/ConferenceEvent/NonAcademicEvent/BreakEv...,0,Class
8,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WeekRejectRating,Reviewed_contribution,ReviewRating,Thing/Conference_document/Conference_contribut...,Thing/ReviewRating/WeekRejectRating,0,Class
9,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WirelessCommunicationsTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/WirelessCommunicationsTopic,0,Class


In [13]:
onto = get_ontology('dataset1/ontologies/101.rdf')
onto.load()

get_ontology("http://oaei.ontologymatching.org/tests/101/onto.rdf#")

In [38]:
classes = []

for cl in onto.classes():
    classes.append(cl)
    print(cl.is_a[0])

owl.Thing
owl.Thing
owl.Thing
owl.Thing
onto.rdf#.Part
onto.rdf#.Reference
onto.rdf#.Book
onto.rdf#.Book
onto.rdf#.Reference
onto.rdf#.Informal
onto.rdf#.Reference
onto.rdf#.Part
onto.rdf#.Part
onto.rdf#.Part
onto.rdf#.Part
onto.rdf#.Informal
onto.rdf#.Informal
onto.rdf#.Reference
onto.rdf#.Academic
onto.rdf#.Academic
onto.rdf#.Reference
onto.rdf#.Book
onto.rdf#.Reference
onto.rdf#.Report
onto.rdf#.Report
onto.rdf#.Informal
onto.rdf#.Reference
owl.Thing
owl.Thing
owl.Thing
0.1.Organization
onto.rdf#.Institution
onto.rdf#.Institution
22-rdf-syntax-ns.List
22-rdf-syntax-ns.List
owl.Thing
owl.Thing
onto.rdf#.Reference
onto.rdf#.Informal
onto.rdf#.Reference
onto.rdf#.Informal
owl.Thing
owl.Thing
onto.rdf#.Book
onto.rdf#.Informal
owl.Thing
owl.Thing
onto.rdf#.Reference
onto.rdf#.Informal
onto.rdf#.Part
onto.rdf#.Report
onto.rdf#.Reference
owl.Thing
owl.Thing
owl.Thing
owl.Thing
owl.Thing
owl.Thing


In [22]:
cl.is_a[0]

owl.Thing

In [26]:
cl.get_properties(cl)

{rdf-schema.comment, rdf-schema.label}

In [35]:
prop = cl.is_instance_of

In [33]:
cl

onto.rdf#.Conference

In [42]:
test[test['Type'] == 'Class']

Unnamed: 0,Ontology1,Ontology2,Entity1,Entity2,Parent1,Parent2,Path1,Path2,Match,Type
0,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,PersonalPublicationHistory,Reviewed_contribution,PersonalHistory,Thing/Conference_document/Conference_contribut...,Thing/PersonalHistory/PersonalPublicationHistory,0,Class
1,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,MeetingRoomPlace,Reviewed_contribution,Place,Thing/Conference_document/Conference_contribut...,Thing/Place/MeetingRoomPlace,0,Class
2,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Presenter,Reviewed_contribution,Author,Thing/Conference_document/Conference_contribut...,Thing/Author/Presenter,0,Class
3,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Programme,Reviewed_contribution,Document,Thing/Conference_document/Conference_contribut...,Thing/Document/Programme,0,Class
4,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,SecurityTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/SecurityTopic,0,Class
5,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,ComputerNetworksOpticalTopic,Reviewed_contribution,ComputerNetworksTopic,Thing/Conference_document/Conference_contribut...,Thing/Topic/ComputerNetworksTopic/ComputerNetw...,0,Class
6,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,Conference,Reviewed_contribution,Thing,Thing/Conference_document/Conference_contribut...,Thing/Conference,0,Class
7,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,CoffeeBreak,Reviewed_contribution,BreakEvent,Thing/Conference_document/Conference_contribut...,Thing/ConferenceEvent/NonAcademicEvent/BreakEv...,0,Class
8,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WeekRejectRating,Reviewed_contribution,ReviewRating,Thing/Conference_document/Conference_contribut...,Thing/ReviewRating/WeekRejectRating,0,Class
9,dataset2/ontologies/conference.owl,dataset2/ontologies/edas.owl,Accepted_contribution,WirelessCommunicationsTopic,Reviewed_contribution,Topic,Thing/Conference_document/Conference_contribut...,Thing/Topic/WirelessCommunicationsTopic,0,Class


In [3]:
df = pd.read_csv('dataset1_test.csv')

In [5]:
df[df['Match'] == 1]

Unnamed: 0,Ontology1,Ontology2,Entity1,Entity2,Parent1,Parent2,Path1,Path2,Match,Type
9,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Collection,Book,Book,Publication,Thing/Reference/Book/Collection,Thing/Publication/Book,1,Class
47,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,TechReport,TechReport,Report,Publication,Thing/Reference/Report/TechReport,Thing/Publication/TechReport,1,Class
61,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Book,Book,Reference,Publication,Thing/Reference/Book,Thing/Publication/Book,1,Class
88,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Booklet,Publication,Informal,Resource,Thing/Reference/Informal/Booklet,Thing/Resource/Publication,1,Class
101,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,LectureNotes,Publication,Informal,Resource,Thing/Reference/Informal/LectureNotes,Thing/Resource/Publication,1,Class
122,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Misc,Misc,Reference,Publication,Thing/Reference/Misc,Thing/Publication/Misc,1,Class
136,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,InCollection,InCollection,Part,Publication,Thing/Reference/Part/InCollection,Thing/Publication/InCollection,1,Class
170,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,MastersThesis,MastersThesis,Academic,Publication,Thing/Reference/Academic/MastersThesis,Thing/Publication/MastersThesis,1,Class
194,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Article,Article,Part,Publication,Thing/Reference/Part/Article,Thing/Publication/Article,1,Class
202,dataset1/ontologies/101.rdf,dataset1/ontologies/302.rdf,Chapter,InBook,Part,Publication,Thing/Reference/Part/Chapter,Thing/Publication/InBook,1,Class
