In [1]:
from collections import defaultdict
import torch

import mowl
mowl.init_jvm('10g')
from mowl.datasets import PathDataset
from mowl.projection import OWL2VecStarProjector
from mowl.walking import DeepWalk

from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

from src.gnn import *
from src.utils import *

In [2]:
experiments1, experiments2, experiments3 = get_experimets()

**Fit**

In [3]:
def owl2vec_fit(file_name, embed_dim, run):
    dataset = PathDataset(ontology_path=f'datasets/bin/{file_name}_train.owl',
                          testing_path=f'datasets/bin/{file_name}_test.owl',
                          validation_path=f'datasets/bin/{file_name}_val.owl')
    if run:
        projector = OWL2VecStarProjector(bidirectional_taxonomy=True)
        edges = projector.project(dataset.ontology)
        
        walker = DeepWalk(num_walks=20,
                          walk_length=20,
                          alpha=0.1,
                          workers=4)         
        walks = walker.walk(edges)
        sentences = LineSentence(walker.outfile)
        model = Word2Vec(sentences, vector_size=embed_dim, epochs=500, window=5, min_count=1, workers=4)
        model.save(f'models/owl2vec_{file_name}.model')
        #triples_factory = Edge.as_pykeen(edges, create_inverse_triples=True)
        #trans_e = TransE(triples_factory=triples_factory, embedding_dim=embed_dim, random_seed=42)
        #model = KGEModel(triples_factory, trans_e, epochs=500, batch_size=32)
        #model.train()
    else:
        model = Word2Vec.load(f'models/owl2vec_{file_name}.model')
    return model

**Eval**

In [4]:
def owl2vec_eval(owl2vec_model, test_graph):
    vectors = owl2vec_model.wv
    words = list(owl2vec_model.wv.key_to_index)
    output_owl2vec = torch.tensor(vectors[words])
    
    nodes = list(set(words))
    nodes_dict = {node: i for i, node in enumerate(nodes)}
    
    i=0
    edge_data = defaultdict(list)
    for s, p, o in test_graph.triples((None, None, None)):
        s = s.n3()
        s = s.replace('<','')
        s = s.replace('>','')
        o = o.n3()
        o = o.replace('<','')
        o = o.replace('>','')
        try:
            src, dst = nodes_dict[s], nodes_dict[o]
            edge_data['edge_index'].append([src, dst])
        except:
            i+=1
    edge_index = torch.tensor(edge_data['edge_index']).reshape(2,-1)
    
    mrr, hits5, hits10 = eval_hits(edge_index=edge_index,
                                   tail_pred=1,
                                   output=output_owl2vec,
                                   max_num=100)
    print(f'MRR: {mrr:.3f}, Hits@5: {hits5:.3f}, Hits@10: {hits10:.3f}')

**Experiments**

In [5]:
for experiment in experiments3: 
    dataset_name = experiment['dataset_name']
    file_name = experiment['file_name']
    format_ = experiment['format_']
    add_noise = experiment['add_noise']

    train_graph, valid_graph, test_graph, test_membership_graph, test_subsumption_graph = split_ontology(dataset_name=dataset_name, 
                                                                                                         file_name=file_name, 
                                                                                                         format_=format_, 
                                                                                                         train_ratio=1, 
                                                                                                         add_noise=add_noise)
    owl2vec_model = owl2vec_fit(file_name=file_name, embed_dim=200, run=True)
    print('Membership:')
    owl2vec_eval(owl2vec_model, test_membership_graph)
    print('Subsumption:')
    owl2vec_eval(owl2vec_model, test_subsumption_graph)
    print()

Triplets found in OWL2DL-1.owl: 55215
Train Triplets found: 55215
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 82704 words, keeping 3447 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 164118 words, keeping 3928 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 245890 words, keeping 3928 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 328848 words, keeping 3928 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 409776 words, keeping 3928 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 492024 words, keeping 3928 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 574784 words, keeping 3928 word types
INFO:gensim.models.word2vec:collected 3928 word types from a corpus of 625322 raw words and 76320 s

Membership:
MRR: 0.069, Hits@5: 0.072, Hits@10: 0.152
Subsumption:
MRR: 0.021, Hits@5: 0.000, Hits@10: 0.000

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_gnn_100.owl: 4802
Train Triplets found: 60017
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 155486 words, keeping 3572 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 313672 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 471368 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 627980 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 784286 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 944632 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 1102196 words, keeping 3929 word types
INFO:gensim.models.word2vec:collected 3929 word types from a corpus of 1200924 raw words and 7634

Membership:
MRR: 0.088, Hits@5: 0.106, Hits@10: 0.279
Subsumption:
MRR: 0.020, Hits@5: 0.000, Hits@10: 0.000

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_gnn_1000.owl: 48050
Train Triplets found: 103265
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 184418 words, keeping 3691 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 370310 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 555828 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 743450 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 931654 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 1117106 words, keeping 3929 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 1302628 words, keeping 3929 word types
INFO:gensim.models.word2vec:collected 3929 word types from a corpus of 1420300 raw words and 763

Membership:
MRR: 0.061, Hits@5: 0.061, Hits@10: 0.108
Subsumption:
MRR: 0.088, Hits@5: 0.333, Hits@10: 0.444

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_random_100.owl: 4761
Train Triplets found: 59976
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 117420 words, keeping 3643 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 236242 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 355950 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 473154 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 591996 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 712654 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 828214 words, keeping 3933 word types
INFO:gensim.models.word2vec:collected 3933 word types from a corpus of 905414 raw words and 76420 

Membership:
MRR: 0.058, Hits@5: 0.068, Hits@10: 0.121
Subsumption:
MRR: 0.024, Hits@5: 0.000, Hits@10: 0.000

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_random_1000.owl: 47661
Train Triplets found: 102876
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 256530 words, keeping 3933 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 516660 words, keeping 3934 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 771864 words, keeping 3934 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 1027844 words, keeping 3934 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 1283168 words, keeping 3934 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 1539310 words, keeping 3934 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 1794688 words, keeping 3934 word types
INFO:gensim.models.word2vec:collected 3934 word types from a corpus of 1962808 raw words and 7

Membership:
MRR: 0.054, Hits@5: 0.057, Hits@10: 0.110
Subsumption:
MRR: 0.034, Hits@5: 0.000, Hits@10: 0.000

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_disjoint_100.owl: 6800
Train Triplets found: 62015
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 85466 words, keeping 3479 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 174522 words, keeping 3968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 260046 words, keeping 3968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 345978 words, keeping 3968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 434352 words, keeping 3968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 520500 words, keeping 3968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 606704 words, keeping 3968 word types
INFO:gensim.models.word2vec:collected 3968 word types from a corpus of 669480 raw words and 77120 s

Membership:
MRR: 0.068, Hits@5: 0.070, Hits@10: 0.130
Subsumption:
MRR: 0.022, Hits@5: 0.000, Hits@10: 0.000

Triplets found in OWL2DL-1.owl: 55215
Triplets found in OWL2DL-1_noisy_disjoint_1000.owl: 68000
Train Triplets found: 122434
Test Triplets (Membership) found: 7527
Test Triplets (Subsumption) found: 40


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 63690 words, keeping 3694 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 128246 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 192694 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 256750 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 320980 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 384842 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 450540 words, keeping 5044 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 514916 words, keeping 5044 word

Membership:
MRR: 0.054, Hits@5: 0.047, Hits@10: 0.087
Subsumption:
MRR: 0.032, Hits@5: 0.000, Hits@10: 0.111

