In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from collections import defaultdict
import torch
import rdflib

import mowl
mowl.init_jvm('10g')
from mowl.datasets import PathDataset
from mowl.projection import OWL2VecStarProjector
from mowl.projection.edge import Edge
from mowl.walking import DeepWalk
from mowl.kge import KGEModel

from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from src.gnn import *

In [2]:
experiments = [{'file_name' : 'family',
                'format_' : None},
               {'file_name' : 'family_noisy_gnn_100',
                'format_' : 'ttl'},
               {'file_name' : 'family_noisy_gnn_1000',
                'format_' : 'ttl'},
               #{'file_name' : 'family_noisy_gnn_10000',
               # 'format_' : 'ttl'},
               #{'file_name' : 'family_noisy_gnn_100000',
               # 'format_' : 'ttl'},   
               {'file_name' : 'family_noisy_random_100',
                'format_' : 'ttl'},
               {'file_name' : 'family_noisy_random_1000',
                'format_' : 'ttl'}#,
               #{'file_name' : 'family_noisy_random_10000',
               # 'format_' : 'ttl'},
               #{'file_name' : 'family_noisy_random_100000',
               #'format_' : 'ttl'}
              ]

# 1. Split ontology into train/test ontologies

In [3]:
def split_ontology(file_name, format_, train_ratio):
    g = rdflib.Graph()
    g.parse(f'datasets/{file_name}.owl', format=format_)  
    print(f'Triplets found: %d' % len(g))

    triples = list(g.triples((None, None, None))) 
    random.shuffle(triples) 

    split_index = int(train_ratio * len(triples))

    train_triples = triples[:split_index]
    test_triples = triples[split_index:]

    train_graph = rdflib.Graph()
    test_graph = rdflib.Graph()

    for triple in train_triples:
        train_graph.add(triple)

    for triple in test_triples:
        test_graph.add(triple)

    print(f'Train Triplets found: %d' % len(train_graph))
    train_graph.serialize(destination=f"datasets/bin/{file_name}_train.owl")
    print(f'Test Triplets found: %d' % len(test_graph))
    test_graph.serialize(destination=f"datasets/bin/{file_name}_test.owl")
    
    return train_graph, test_graph

# 2. OWL2Vec

**Fit**

In [4]:
def owl2vec_fit(file_name, embed_dim, load):
    dataset = PathDataset(ontology_path=f'datasets/bin/{file_name}_train.owl',
                          testing_path=f'datasets/bin/{file_name}_test.owl')
    if not load:
        projector = OWL2VecStarProjector(bidirectional_taxonomy=True)
        edges = projector.project(dataset.ontology)
        walker = DeepWalk(num_walks=20,
                          walk_length=20,
                          alpha=0.1,
                          workers=4)         
        walks = walker.walk(edges)
        sentences = LineSentence(walker.outfile)
        model = Word2Vec(sentences, vector_size=embed_dim, epochs=300, window=5, min_count=1, workers=4)
        model.save(f'models/owl2vec_{file_name}.model')
    else:
        model = Word2Vec.load(f'models/owl2vec_{file_name}.model')
    return model

**Eval**

In [5]:
def owl2vec_eval(owl2vec_model, test_graph):
    vectors = owl2vec_model.wv
    words = list(owl2vec_model.wv.key_to_index)
    output_owl2vec = torch.tensor(vectors[words])
    
    nodes = list(set(words))
    nodes_dict = {node: i for i, node in enumerate(nodes)}
    
    i=0
    edge_data = defaultdict(list)
    for s, p, o in test_graph.triples((None, None, None)):
        s = s.n3()
        s = s.replace('<','')
        s = s.replace('>','')
        o = o.n3()
        o = o.replace('<','')
        o = o.replace('>','')
        try:
            src, dst = nodes_dict[s], nodes_dict[o]
            edge_data['edge_index'].append([src, dst])
        except:
            i+=1
    edge_index = torch.tensor(edge_data['edge_index']).reshape(2,-1)
    
    hits1, hits10 = eval_hits(edge_index=edge_index,
                              tail_pred=1,
                              output=output_owl2vec,
                              max_num=100)
    print(f'Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')
    print()

**Experiments**

In [6]:
for experiment in experiments: 
    file_name = experiment['file_name']
    print(file_name)
    format_ = experiment['format_']

    train_graph, test_graph = split_ontology(file_name=file_name, format_=format_, train_ratio=0.8)
    owl2vec_model = owl2vec_fit(file_name=file_name, embed_dim=200, load=False)
    owl2vec_eval(owl2vec_model, test_graph)

family
Triplets found: 5017
Train Triplets found: 4013
Test Triplets found: 1004
Hits@1: 0.021, Hits@10: 0.142

family_noisy_gnn_100
Triplets found: 5817
Train Triplets found: 4653
Test Triplets found: 1164
Hits@1: 0.009, Hits@10: 0.112

family_noisy_gnn_1000
Triplets found: 13015
Train Triplets found: 10412
Test Triplets found: 2603
Hits@1: 0.046, Hits@10: 0.342

family_noisy_random_100
Triplets found: 5806
Train Triplets found: 4644
Test Triplets found: 1162
Hits@1: 0.035, Hits@10: 0.221

family_noisy_random_1000
Triplets found: 12861
Train Triplets found: 10288
Test Triplets found: 2573
Hits@1: 0.081, Hits@10: 0.459

