In [1]:
import numpy as np
import pandas as pd
import os
import click as ck

import torch

from collections import defaultdict

import mowl
mowl.init_jvm('10g')

from mowl.datasets import PathDataset
from mowl.corpus import extract_and_save_axiom_corpus
from mowl.owlapi import OWLAPIAdapter, OWLObjectPropertyAssertionAxiom
from mowl.reasoning import MOWLReasoner
from mowl.projection import OWL2VecStarProjector
from mowl.projection.edge import Edge
from mowl.walking import DeepWalk, Node2Vec
from mowl.kge import KGEModel

from pykeen.models import TransE

from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

import rdflib

from org.semanticweb.elk.owlapi import ElkReasonerFactory
from java.util import HashSet

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve, auc, matthews_corrcoef

import random
import operator

**TransE**

In [41]:
dataset = PathDataset('../datasets/family.owl')
projector = OWL2VecStarProjector(bidirectional_taxonomy=True, 
                                 include_literals=False,
                                 only_taxonomy =True)
edges = projector.project(dataset.ontology)
triples_factory = Edge.as_pykeen(edges, create_inverse_triples = False)
pk_model = TransE(triples_factory = triples_factory, embedding_dim = 128, random_seed = 42)
model = KGEModel(triples_factory, pk_model, epochs = 10, batch_size = 32)
model.train()
ent_embs = model.class_embeddings_dict
rel_embs = model.object_property_embeddings_dict

Training epochs on cpu:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/49 [00:00<?, ?batch/s]

In [55]:
len(ent_embs.keys())

564

**Owl2Vec**

In [2]:
def owl2vec_fit(owl_file, embed_dim, load):
    dataset = PathDataset(f'../datasets/{owl_file}.owl')
    if not load:
        projector = OWL2VecStarProjector(bidirectional_taxonomy=True)
        edges = projector.project(dataset.ontology)
        walker = DeepWalk(num_walks=20,
                          walk_length=20,
                          alpha=0.1,
                          workers=4)         
        walks = walker.walk(edges)
        sentences = LineSentence(walker.outfile)
        model = Word2Vec(sentences, vector_size=embed_dim, epochs=10, window=5, min_count=1, workers=4)
        model.save(f'../models/owl2vec.model')
    else:
        model = Word2Vec.load(f'../models/owl2vec.model')
    return model

In [3]:
model = owl2vec_fit('family', 128, False)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 152980 words, keeping 648 word types
INFO:gensim.models.word2vec:collected 648 word types from a corpus of 173510 raw words and 11280 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 648 unique words (100.00% of original 648, drops 0)', 'datetime': '2024-03-12T09:51:27.751726', 'gensim': '4.3.1', 'python': '3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 173510 word corpus (100.00% of original 173510, drops 0)', 'datetime': '2024-03-12T09:51:27.752723', 'gensim': '4.3.1', 'python': '3.8.16 (default

In [59]:
###output = node embeddings, we can compute hits@k
###split train test

In [86]:
dataset.testing

In [61]:


node_embed_owl2vec = {}
for word in words:
    node_embed_owl2vec[word] = vectors[word]

In [65]:
len(node_embed_owl2vec.keys())

1

In [69]:
len(np.unique(words))

648

In [17]:
############################################################################

In [18]:
vectors = model.wv
words = list(model.wv.key_to_index)
output_owl2vec = torch.tensor(vectors[words])

In [19]:
g = rdflib.Graph()
g.parse('datasets/family.owl')

nodes = list(set(words))
nodes_dict = {node: i for i, node in enumerate(nodes)}
nodes_dict_rev = {value: key for key, value in nodes_dict.items()}

In [20]:
i=0
edge_data = defaultdict(list)
for s, p, o in g.triples((None, None, None)):
    s = s.n3()
    s = s.replace('<','')
    s = s.replace('>','')
    o = o.n3()
    o = o.replace('<','')
    o = o.replace('>','')
    try:
        src, dst = nodes_dict[s], nodes_dict[o]
        edge_data['edge_index'].append([src, dst])
    except:
        i+=1
edge_index = torch.tensor(edge_data['edge_index'])

In [21]:
hits1, hits10 = eval_hits(edge_index=edge_index,
                          tail_pred=1,
                          output=output_owl2vec,
                          max_num=edge_index.size(1))
print(f'Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')

Hits@1: 0.000, Hits@10: 1.000


In [29]:
split_ontology('datasets/family.owl', 0.8)