In [1]:
from collections import defaultdict
import torch

import mowl
mowl.init_jvm('10g')
from mowl.datasets import PathDataset
from mowl.projection import OWL2VecStarProjector
from mowl.walking import DeepWalk

from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from src.gnn import *
from src.utils import *

In [2]:
experiments = [{'file_name' : 'family',
                'format_' : None,
                'add_noise': False},
               {'file_name' : 'family_noisy_gnn_100',
                'format_' : 'ttl',
                'add_noise': True},
               {'file_name' : 'family_noisy_gnn_1000',
                'format_' : 'ttl',
                'add_noise': True},
               #{'file_name' : 'family_noisy_gnn_10000',
               # 'format_' : 'ttl',
               # 'add_noise': True},
               #{'file_name' : 'family_noisy_gnn_100000',
               # 'format_' : 'ttl',
               # 'add_noise': True},
               {'file_name' : 'family_noisy_random_100',
                'format_' : 'ttl',
                'add_noise': True},
               {'file_name' : 'family_noisy_random_1000',
                'format_' : 'ttl',
                'add_noise': True}#,
               #{'file_name' : 'family_noisy_random_10000',
               # 'format_' : 'ttl'},
               #{'file_name' : 'family_noisy_random_100000',
               #'format_' : 'ttl'}
              ]

**Fit**

In [3]:
def owl2vec_fit(file_name, embed_dim, load):
    dataset = PathDataset(ontology_path=f'datasets/bin/{file_name}_train.owl',
                          testing_path=f'datasets/bin/{file_name}_test.owl')
    if not load:
        projector = OWL2VecStarProjector(bidirectional_taxonomy=True)
        edges = projector.project(dataset.ontology)
        walker = DeepWalk(num_walks=20,
                          walk_length=20,
                          alpha=0.1,
                          workers=4)         
        walks = walker.walk(edges)
        sentences = LineSentence(walker.outfile)
        model = Word2Vec(sentences, vector_size=embed_dim, epochs=500, window=5, min_count=1, workers=4)
        model.save(f'models/owl2vec_{file_name}.model')
    else:
        model = Word2Vec.load(f'models/owl2vec_{file_name}.model')
    return model

**Eval**

In [4]:
def owl2vec_eval(owl2vec_model, test_graph):
    vectors = owl2vec_model.wv
    words = list(owl2vec_model.wv.key_to_index)
    output_owl2vec = torch.tensor(vectors[words])
    
    nodes = list(set(words))
    nodes_dict = {node: i for i, node in enumerate(nodes)}
    
    i=0
    edge_data = defaultdict(list)
    for s, p, o in test_graph.triples((None, None, None)):
        s = s.n3()
        s = s.replace('<','')
        s = s.replace('>','')
        o = o.n3()
        o = o.replace('<','')
        o = o.replace('>','')
        try:
            src, dst = nodes_dict[s], nodes_dict[o]
            edge_data['edge_index'].append([src, dst])
        except:
            i+=1
    edge_index = torch.tensor(edge_data['edge_index']).reshape(2,-1)
    
    hits1, hits10 = eval_hits(edge_index=edge_index,
                              tail_pred=1,
                              output=output_owl2vec,
                              max_num=100)
    print(f'Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')
    print()

**Experiments**

In [5]:
for experiment in experiments: 
    file_name = experiment['file_name']
    format_ = experiment['format_']
    add_noise = experiment['add_noise']

    train_graph, test_graph, valid_graph = split_ontology(file_name=file_name, format_=format_, train_ratio=0.8, test_ratio=0.2, add_noise=add_noise)
    owl2vec_model = owl2vec_fit(file_name=file_name, embed_dim=200, load=False)
    owl2vec_eval(owl2vec_model, test_graph)

Triplets found in family.owl: 5017
Train Triplets found: 4013
Test Triplets found: 1004
Valid Triplets found: 0
Hits@1: 0.034, Hits@10: 0.184

Triplets found in family.owl: 5017
Triplets found in family_noisy_gnn_100.owl: 800
Train Triplets found: 4813
Test Triplets found: 1004
Valid Triplets found: 0
Hits@1: 0.004, Hits@10: 0.108

Triplets found in family.owl: 5017
Triplets found in family_noisy_gnn_1000.owl: 7997
Train Triplets found: 12010
Test Triplets found: 1004
Valid Triplets found: 0
Hits@1: 0.030, Hits@10: 0.155

Triplets found in family.owl: 5017
Triplets found in family_noisy_random_100.owl: 778
Train Triplets found: 4791
Test Triplets found: 1004
Valid Triplets found: 0
Hits@1: 0.045, Hits@10: 0.208

Triplets found in family.owl: 5017
Triplets found in family_noisy_random_1000.owl: 7841
Train Triplets found: 11854
Test Triplets found: 1004
Valid Triplets found: 0
Hits@1: 0.029, Hits@10: 0.181

