In [1]:
from collections import defaultdict
import torch

import mowl
mowl.init_jvm('10g')
from mowl.datasets import PathDataset
from mowl.projection import OWL2VecStarProjector
from mowl.walking import DeepWalk

from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

from src.gnn import *
from src.utils import *

In [2]:
experiments = [{'file_name' : 'family',
                'format_' : None,
                'add_noise': False},
               {'file_name' : 'family_noisy_gnn_100',
                'format_' : None,
                'add_noise': True},
               {'file_name' : 'family_noisy_gnn_1000',
                'format_' : None,
                'add_noise': True},
               #{'file_name' : 'family_noisy_gnn_10000',
               # 'format_' : None,
               # 'add_noise': True},
               #{'file_name' : 'family_noisy_gnn_100000',
               # 'format_' : None,
               # 'add_noise': True},
               {'file_name' : 'family_noisy_random_100',
                'format_' : None,
                'add_noise': True},
               {'file_name' : 'family_noisy_random_1000',
                'format_' : None,
                'add_noise': True}#,
               #{'file_name' : 'family_noisy_random_10000',
               # 'format_' : None,
               # 'add_noise': True},
               #{'file_name' : 'family_noisy_random_100000',
               # 'format_' : None,
               # 'add_noise': True}
              ]

**Fit**

In [3]:
def owl2vec_fit(file_name, embed_dim, run):
    dataset = PathDataset(ontology_path=f'datasets/bin/{file_name}_train.owl',
                          testing_path=f'datasets/bin/{file_name}_test.owl')
    if run:
        projector = OWL2VecStarProjector(bidirectional_taxonomy=True)
        edges = projector.project(dataset.ontology)
        
        walker = DeepWalk(num_walks=20,
                          walk_length=20,
                          alpha=0.1,
                          workers=4)         
        walks = walker.walk(edges)
        sentences = LineSentence(walker.outfile)
        model = Word2Vec(sentences, vector_size=embed_dim, epochs=500, window=5, min_count=1, workers=4)
        model.save(f'models/owl2vec_{file_name}.model')
        
        #triples_factory = Edge.as_pykeen(edges, create_inverse_triples=True)
        #trans_e = TransE(triples_factory=triples_factory, embedding_dim=embed_dim, random_seed=42)
        #model = KGEModel(triples_factory, trans_e, epochs=500, batch_size=32)
        #model.train()
    else:
        model = Word2Vec.load(f'models/owl2vec_{file_name}.model')
    return model

**Eval**

In [4]:
def owl2vec_eval(owl2vec_model, test_graph):
    vectors = owl2vec_model.wv
    words = list(owl2vec_model.wv.key_to_index)
    output_owl2vec = torch.tensor(vectors[words])
    
    nodes = list(set(words))
    nodes_dict = {node: i for i, node in enumerate(nodes)}
    
    i=0
    edge_data = defaultdict(list)
    for s, p, o in test_graph.triples((None, None, None)):
        s = s.n3()
        s = s.replace('<','')
        s = s.replace('>','')
        o = o.n3()
        o = o.replace('<','')
        o = o.replace('>','')
        try:
            src, dst = nodes_dict[s], nodes_dict[o]
            edge_data['edge_index'].append([src, dst])
        except:
            i+=1
    edge_index = torch.tensor(edge_data['edge_index']).reshape(2,-1)
    
    mrr, hits5, hits10 = eval_hits(edge_index=edge_index,
                                   tail_pred=1,
                                   output=output_owl2vec,
                                   max_num=100)
    print(f'MRR: {mrr:.3f}, Hits@5: {hits5:.3f}, Hits@10: {hits10:.3f}')

**Experiments**

In [5]:
for experiment in experiments: 
    file_name = experiment['file_name']
    format_ = experiment['format_']
    add_noise = experiment['add_noise']

    train_graph, valid_graph, test_graph, test_membership_graph, test_subsumption_graph, test_link_prediction_graph = split_ontology(file_name=file_name, 
                                                                                                                                     format_=format_, 
                                                                                                                                     train_ratio=1, 
                                                                                                                                     add_noise=add_noise)
    owl2vec_model = owl2vec_fit(file_name=file_name, embed_dim=200, run=True)
    print('Membership:')
    owl2vec_eval(owl2vec_model, test_membership_graph)
    print('Subsumption:')
    owl2vec_eval(owl2vec_model, test_subsumption_graph)
    print('Link Prediction:')
    owl2vec_eval(owl2vec_model, test_link_prediction_graph)
    print()

Triplets found in family.owl: 5017
Train Triplets found: 5017
Valid Triplets found: 0
Test Triplets found: 3891
Test Triplets (Membership) found: 1721
Test Triplets (Subsumption) found: 116
Test Triplets (Link Prediction) found: 2054


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 152078 words, keeping 648 word types
INFO:gensim.models.word2vec:collected 648 word types from a corpus of 172486 raw words and 11280 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 648 unique words (100.00% of original 648, drops 0)', 'datetime': '2024-05-06T17:40:10.574992', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 172486 word corpus (100.00% of original 172486, drops 0)', 'datetime': '2024-05-06T17:40:10.575990', 'gensim': '4.3.1', 'python': '3.8.16 (default

Membership:
MRR: 0.063, Hits@5: 0.078, Hits@10: 0.146
Subsumption:
MRR: 0.036, Hits@5: 0.035, Hits@10: 0.096
Link Prediction:
MRR: 0.043, Hits@5: 0.056, Hits@10: 0.056

Triplets found in family.owl: 5017
Triplets found in family_noisy_gnn_100.owl: 800
Train Triplets found: 5817
Valid Triplets found: 0
Test Triplets found: 3891
Test Triplets (Membership) found: 1721
Test Triplets (Subsumption) found: 116
Test Triplets (Link Prediction) found: 2054


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 142268 words, keeping 649 word types
INFO:gensim.models.word2vec:collected 649 word types from a corpus of 160610 raw words and 11300 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 649 unique words (100.00% of original 649, drops 0)', 'datetime': '2024-05-06T17:41:13.892598', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 160610 word corpus (100.00% of original 160610, drops 0)', 'datetime': '2024-05-06T17:41:13.892598', 'gensim': '4.3.1', 'python': '3.8.16 (default

Membership:
MRR: 0.087, Hits@5: 0.138, Hits@10: 0.231
Subsumption:
MRR: 0.055, Hits@5: 0.088, Hits@10: 0.123
Link Prediction:
MRR: 0.031, Hits@5: 0.056, Hits@10: 0.111

Triplets found in family.owl: 5017
Triplets found in family_noisy_gnn_1000.owl: 7998
Train Triplets found: 13015
Valid Triplets found: 0
Test Triplets found: 3891
Test Triplets (Membership) found: 1721
Test Triplets (Subsumption) found: 116
Test Triplets (Link Prediction) found: 2054


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 226718 words, keeping 650 word types
INFO:gensim.models.word2vec:collected 650 word types from a corpus of 256164 raw words and 11320 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 650 unique words (100.00% of original 650, drops 0)', 'datetime': '2024-05-06T17:42:17.848735', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 256164 word corpus (100.00% of original 256164, drops 0)', 'datetime': '2024-05-06T17:42:17.848735', 'gensim': '4.3.1', 'python': '3.8.16 (default

Membership:
MRR: 0.101, Hits@5: 0.128, Hits@10: 0.206
Subsumption:
MRR: 0.051, Hits@5: 0.070, Hits@10: 0.114
Link Prediction:
MRR: 0.048, Hits@5: 0.056, Hits@10: 0.056

Triplets found in family.owl: 5017
Triplets found in family_noisy_random_100.owl: 783
Train Triplets found: 5800
Valid Triplets found: 0
Test Triplets found: 3891
Test Triplets (Membership) found: 1721
Test Triplets (Subsumption) found: 116
Test Triplets (Link Prediction) found: 2054


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 304984 words, keeping 650 word types
INFO:gensim.models.word2vec:collected 650 word types from a corpus of 344562 raw words and 11320 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 650 unique words (100.00% of original 650, drops 0)', 'datetime': '2024-05-06T17:43:36.560264', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 344562 word corpus (100.00% of original 344562, drops 0)', 'datetime': '2024-05-06T17:43:36.561248', 'gensim': '4.3.1', 'python': '3.8.16 (default

Membership:
MRR: 0.077, Hits@5: 0.106, Hits@10: 0.194
Subsumption:
MRR: 0.038, Hits@5: 0.035, Hits@10: 0.114
Link Prediction:
MRR: 0.036, Hits@5: 0.056, Hits@10: 0.056

Triplets found in family.owl: 5017
Triplets found in family_noisy_random_1000.owl: 7839
Train Triplets found: 12856
Valid Triplets found: 0
Test Triplets found: 3891
Test Triplets (Membership) found: 1721
Test Triplets (Subsumption) found: 116
Test Triplets (Link Prediction) found: 2054


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 390000 words, keeping 650 word types
INFO:gensim.models.word2vec:collected 650 word types from a corpus of 441480 raw words and 11320 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 650 unique words (100.00% of original 650, drops 0)', 'datetime': '2024-05-06T17:45:14.336953', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 441480 word corpus (100.00% of original 441480, drops 0)', 'datetime': '2024-05-06T17:45:14.336953', 'gensim': '4.3.1', 'python': '3.8.16 (default

Membership:
MRR: 0.079, Hits@5: 0.092, Hits@10: 0.167
Subsumption:
MRR: 0.043, Hits@5: 0.061, Hits@10: 0.105
Link Prediction:
MRR: 0.036, Hits@5: 0.056, Hits@10: 0.111

