In [4]:
import pandas as pd
import numpy as np
from tqdm import trange, tqdm
import random
from collections import defaultdict
import torch
from torch.utils.data import DataLoader
from torch import nn
import rdflib

import mowl
mowl.init_jvm('10g')
from mowl.datasets import PathDataset, Dataset
from mowl.base_models import EmbeddingELModel
from mowl.models import ELEmbeddings
from mowl.nn import ELEmModule, ELBoxModule, BoxSquaredELModule

from src.utils import *

In [5]:
file_name = 'family'
format_ = None
train_graph, test_graph, valid_graph = split_ontology(file_name=file_name, format_=format_, train_ratio=0.8, test_ratio=0.2)

dataset = PathDataset(ontology_path=f'datasets/bin/{file_name}_train.owl',
                      testing_path=f'datasets/bin/{file_name}_test.owl',
                      validation_path=f'datasets/bin/{file_name}_test.owl')

Triplets found: 5017
Train Triplets found: 4013
Test Triplets found: 1004
Valid Triplets found: 0


In [6]:
train_ont = preprocess_ontology_el(dataset.ontology)
test_ont = preprocess_ontology_el(dataset.testing)
valid_ont = preprocess_ontology_el(dataset.validation)

dataset = Dataset(train_ont, testing=test_ont, validation=valid_ont)

In [57]:
class ElModel(ELEmbeddings):
    def __init__(self, dataset, module_name, dim, batch_size, epochs, learning_rate, model_filepath, device):
        self.module_name = module_name
        self.dim = dim
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.device = device
        self.nb_classes = len(dataset.classes)
        self.nb_roles = len(dataset.object_properties)
        
        if self.module_name == "elem":
            self.module = ELEmModule(self.nb_classes, self.nb_roles, embed_dim = self.dim)
        elif self.module_name == "elbox":
            self.module = ELBoxModule(self.nb_classes, self.nb_roles, embed_dim = self.dim)
        elif self.module_name == "box2el":
            self.module = BoxSquaredELModule(self.nb_classes, self.nb_roles, embed_dim = self.dim)
    
        super().__init__(dataset=dataset, embed_dim=dim, batch_size=batch_size, model_filepath=model_filepath)
                                                                                            
    def _train(self):
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.module.parameters(), lr=self.learning_rate)

        training_datasets = {k: v.data for k, v in self.training_datasets.items()}
        prots = [self.class_index_dict[p] for p in self.dataset.evaluation_classes.as_str]
        
        for epoch in trange(self.epochs):
            self.module.train()

            train_loss = 0
            loss = 0
            for gci_name, gci_dataset in training_datasets.items():
                if len(gci_dataset) == 0:
                    continue
                dst = self.module(gci_dataset, gci_name)
                mse_loss = criterion(dst, torch.zeros(dst.shape, requires_grad=False).to(self.device))
                loss += mse_loss

                if gci_name == "gci2":
                    gci_batch = gci_dataset
                    idxs_for_negs = np.random.choice(prots, size=len(gci_batch), replace=True)
                    rand_prot_ids = torch.tensor(idxs_for_negs).to(self.device)
                    neg_data = torch.cat([gci_batch[:, :2], rand_prot_ids.unsqueeze(1)], dim=1)
                    dst = self.module(neg_data, gci_name, neg=True)
                    mse_loss = criterion(dst, torch.ones(dst.shape, requires_grad=False).to(self.device))
                    loss += mse_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().item()
        
    def tbox_forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def abox_forward(self, ind_idxs):
        class_embed = self.module.class_center if self.module_name == "box2el" else self.module.class_embed
        all_class_embed = class_embed.weight
        ind_embed = self.ind_embeddings(ind_idxs)

        membership = torch.mm(ind_embed, all_class_embed.t())

        if self.module_name == "elem":
            rad_embed = self.el_module.class_rad.weight
            rad_embed = th.abs(rad_embed).view(1, -1)
            membership = membership + rad_embed
        elif self.module_name in ["elbox", "box2el"]:
            offset_embed = self.el_module.class_offset.weight
            offset_embed = torch.abs(offset_embed).mean(dim=1).view(1, -1)
            membership = membership + offset_embed

        return membership

In [58]:
elbox = ElModel(dataset, module_name='elbox', dim=200, batch_size=4096*8, epochs=500, learning_rate=0.001, 
                model_filepath=f'C:\\Users\\julie\\github\\PhD\\NoisyBench\\models\\elbox_{file_name}.pt' ,device='cpu')

In [160]:
elbox._train()

INFO:root:Reverse translation. Ignoring axiom: RangeAxiom*(66 64)
INFO:root:'uk.ac.manchester.cs.owl.owlapi.OWLObjectPropertyRa' object has no attribute 'getSubClass'
INFO:root:Reverse translation. Ignoring axiom: SubClassOf*(ObjectSomeValuesFrom(107 74) 70)
INFO:root:de.tudresden.inf.lat.jcel.owlapi.translator.TranslationException: The translation map is incomplete. Item id was not found: '70'.
INFO:root:Reverse translation. Ignoring axiom: RangeAxiom*(69 64)
INFO:root:'uk.ac.manchester.cs.owl.owlapi.OWLObjectPropertyRa' object has no attribute 'getSubClass'
INFO:root:Reverse translation. Ignoring axiom: RangeAxiom*(74 64)
INFO:root:'uk.ac.manchester.cs.owl.owlapi.OWLObjectPropertyRa' object has no attribute 'getSubClass'
INFO:root:Reverse translation. Ignoring axiom: RangeAxiom*(77 64)
INFO:root:'uk.ac.manchester.cs.owl.owlapi.OWLObjectPropertyRa' object has no attribute 'getSubClass'
INFO:root:Reverse translation. Ignoring axiom: RangeAxiom*(78 64)
INFO:root:'uk.ac.manchester.cs.owl

In [20]:
from mowl.utils.data import FastTensorDataLoader

In [21]:
all_tail_ids = torch.arange(len(dataset.classes))

In [31]:
#subsumption: class is subclass of class
all_head_ids = torch.arange(len(dataset.classes))
ds = elbox.testing_datasets["gci0"][:]
sub_class = ds[:, 0]
super_class = ds[:, 1]
eval_dl = FastTensorDataLoader(sub_class, super_class, batch_size=4096, shuffle=False)

In [29]:
mean_rank, filtered_mean_rank = 0, 0
ranks, filtered_ranks = dict(), dict()
rank_vals = []
filtered_rank_vals = []
mrr, filtered_mrr = 0, 0
hits_at_1, fhits_at_1 = 0, 0
hits_at_3, fhits_at_3 = 0, 0
hits_at_10, fhits_at_10 = 0, 0
hits_at_100, fhits_at_100 = 0, 0

In [43]:
heads = sub_class
tails = super_class

In [None]:
num_heads = len(heads)
tail_ids = torch.arange(len(dataset.classes))

heads = heads
heads = heads.repeat(len(tail_ids), 1).T
heads = heads.reshape(-1)
eval_tails = tail_ids.repeat(num_heads)

data = torch.stack((heads, eval_tails), dim=1)

In [60]:
elbox.module.tbox_forward(dataset, "gci0")

AttributeError: 'ELEmModule' object has no attribute 'tbox_forward'