In [None]:
%matplotlib inline

# Model-theoretic ontology embedding methods
## EL Embeddings

This example corresponds to the paper [EL Embeddings: Geometric Construction of Models for the Description Logic EL++](https://www.ijcai.org/proceedings/2019/845).

The idea of this paper is to embed $\mathcal{EL}^{++}$ ontologies by modeling ontology classes as $n$-dimensional balls ($n$-balls) and ontology object properties as transformations of those $n$-balls. For each of the normal forms, a distance function is defined that serves as the loss function in the optimization framework.


Let's first import everything that will be needed throughout the example:



In [None]:
import mowl
mowl.init_jvm("10g")
import torch as th

In [None]:
from mowl.models.elembeddings.module import ELEmModule
from mowl.base_models.elmodel import EmbeddingELModel

The EL-Embeddings model maps ontology classes, object properties and operators into a
geometric model. The $\mathcal{EL}$ description logic is expressed using the
following General Concept Inclusions (GCIs):

\begin{align}
   C &\sqsubseteq D & (\text{GCI 0}) \\
   C_1 \sqcap C_2 &\sqsubseteq D & (\text{GCI 1}) \\
   C &\sqsubseteq \exists R. D & (\text{GCI 2})\\
   \exists R. C &\sqsubseteq D & (\text{GCI 3})\\
   C &\sqsubseteq \bot & (\text{GCI BOT 0}) \\
   C_1 \sqcap C_2 &\sqsubseteq \bot & (\text{GCI BOT 1}) \\
   \exists R. C &\sqsubseteq \bot & (\text{GCI BOT 3})
\end{align}

where $C, C_1, C_2, D$ are ontology classes and $R$ is an ontology object property.



## EL-Embeddings (PyTorch) module.

EL-Embeddings defines a geometric modelling for all the GCIs in the EL language.
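The implementation of the EL-Embeddings module can be found in the class `mowl.nn.el.elem.module.ELEmModule` (note that this notebook imports `ELEmModule` from `mowl.models.elembeddings.module`; use whichever path your installed mOWL version provides).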

## EL-Embeddings model

The module `mowl.nn.el.elem.module.ELEmModule` is used by the model `mowl.models.elembeddings.model.ELEmbeddings`.
As the use case for this example, we consider a biological problem:
protein-protein interactions. Given two proteins $p_1, p_2$, the phenomenon
"$p_1$ interacts with $p_2$" is encoded using GCI 2 as:

\begin{align}p_1 \sqsubseteq \exists\, interacts\_with. p_2\end{align}

For that, we can use the `mowl.models.elembeddings.examples.model_ppi.ELEmPPI` model, which is used here with the `mowl.datasets.builtin.PPIYeastSlimDataset` dataset.



## Model and Training Strategy

In [None]:
import torch
from torch import nn
from tqdm import trange
import numpy as np

class ELEmbeddings(EmbeddingELModel):
    """EL-Embeddings model trained with full-batch Adam updates.

    Wraps an ``ELEmModule`` sized after the class and object-property indices
    of the dataset and optimizes it on the GCI datasets exposed through
    ``training_datasets``.

    :param dataset: dataset forwarded to the ``EmbeddingELModel`` constructor.
    :param embed_dim: embedding dimension passed to ``ELEmModule``.
    :param margin: margin passed to ``ELEmModule``.
    :param reg_norm: stored on the instance; not otherwise used by this class.
    :param learning_rate: learning rate of the Adam optimizer.
    :param epochs: number of training epochs.
    :param batch_size: forwarded to the base class. ``train`` feeds every GCI
        dataset whole (``gci_dataset[:]``), so it does not mini-batch here.
    :param model_filepath: forwarded to the base class; ``train`` saves the
        module's ``state_dict`` to ``self.model_filepath`` after every epoch.
    :param device: device the ``ELEmModule`` is moved to.
    """

    def __init__(self,
                 dataset,
                 embed_dim=50,
                 margin=0,
                 reg_norm=1,
                 learning_rate=0.001,
                 epochs=1000,
                 batch_size=4096 * 8,
                 model_filepath=None,
                 device='cpu'
                 ):
        super().__init__(dataset, batch_size, extended=True, model_filepath=model_filepath)

        self.embed_dim = embed_dim
        self.margin = margin
        self.reg_norm = reg_norm
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.device = device
        self._loaded = False
        self._loaded_eval = False
        # NOTE(review): this overrides the ``extended=True`` passed to the base
        # constructor above -- confirm that this is intentional.
        self.extended = False
        self.init_model()

    def init_model(self):
        """Create the ``ELEmModule`` and move it to ``self.device``."""
        self.model = ELEmModule(
            len(self.class_index_dict),  # number of ontology classes
            len(self.object_property_index_dict),  # number of ontology object properties
            embed_dim=self.embed_dim,
            margin=self.margin
        ).to(self.device)

    def train(self, checkpoint=1):
        """Optimize the module for ``self.epochs`` epochs.

        Each epoch sums the mean loss of every non-empty GCI dataset, performs
        one Adam step and saves the module's ``state_dict`` to
        ``self.model_filepath``.

        :param checkpoint: print the training loss every ``checkpoint`` epochs.
            ``None`` or a non-positive value disables the progress message.
        :raises ValueError: if every dataset in ``training_datasets`` is empty,
            since there is then no loss to back-propagate.
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        report_progress = checkpoint is not None and checkpoint > 0

        for epoch in trange(self.epochs):
            self.model.train()

            # Notice how we use the ``training_datasets`` variable directly
            # and every element of it is a pair (GCI name, GCI tensor data).
            gci_losses = [
                torch.mean(self.model(gci_dataset[:], gci_name))
                for gci_name, gci_dataset in self.training_datasets.items()
                if len(gci_dataset) > 0
            ]
            if not gci_losses:
                raise ValueError(
                    "Cannot train: every GCI dataset in `training_datasets` is empty."
                )
            # ``sum`` starts from the int 0 and adds the terms in dataset order,
            # which yields a tensor as soon as the first term is added.
            loss = sum(gci_losses)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = loss.detach().item()
            torch.save(self.model.state_dict(), self.model_filepath)
            if report_progress and (epoch + 1) % checkpoint == 0:
                print(f'\nEpoch {epoch}: Train loss: {train_loss:.4f}')

Create the dataset

In [None]:
from mowl.datasets import PathDataset

# Build a mOWL dataset from the ontology file ``family.owl``. The path is
# relative, so the file must be reachable from the notebook's working directory.
family_dataset = PathDataset('family.owl')

In [None]:
# Seed PyTorch's global RNG before the model is built so that repeated runs
# (Restart & Run All) start from the same state.
# NOTE(review): assumes ELEmModule draws its initial parameters from torch's
# global RNG -- confirm.
torch.manual_seed(42)

# Two-dimensional embeddings so that every class can be drawn as a circle.
elembeddings = ELEmbeddings(family_dataset,
                     embed_dim=2,
                     margin=0.1,
                     reg_norm=1,
                     learning_rate=0.01,
                     epochs=1000,
                     batch_size=2,
                     model_filepath=None,
                     device='cpu')

# Report the training loss once every 100 epochs.
elembeddings.train(checkpoint=100)

Extract embeddings

In [None]:
# Class centers: the weight matrix of the class embedding layer, as a NumPy array.
embeds = elembeddings.model.class_embed.weight.detach().cpu().numpy()
# Class radii: absolute value of the radius weights, since the raw parameter may be negative.
rs = np.absolute(elembeddings.model.class_rad.weight.detach().cpu().numpy())
# Class names in the iteration order of the class index dictionary.
classes = [*elembeddings.class_index_dict.keys()]
rs, embeds

Plot embeddings

In [None]:
import matplotlib.pyplot as plt

# Keep only the last path segment of every class name as a short display label.
classes = [item.split('/')[-1] for item in classes]
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
fig, ax = plt.subplots()
# Equal scaling on both axes so that the balls are drawn as circles.
ax.axis('equal')
ax.set_xlim(-5, 4)
ax.set_ylim(-3, 4)
ax.set_title('EL Embeddings: ontology classes as 2-D balls')
ax.set_xlabel('dimension 1')
ax.set_ylabel('dimension 2')
for i in range(embeds.shape[0]):
    # Skip labels ending in 'hing' (presumably owl:Thing / owl:Nothing).
    if classes[i].endswith('hing'):
        continue
    x, y = embeds[i, 0], embeds[i, 1]
    # ``rs[i]`` may be a one-element array rather than a scalar; reduce it to a
    # plain float before using it as a radius and in the label offset.
    r = float(np.ravel(rs[i])[0])
    color = colors[i % len(colors)]
    ax.add_artist(plt.Circle(
        (x, y), r, fill=False, edgecolor=color, label=classes[i]))
    # Put the class label just above the top of its circle.
    ax.annotate(classes[i], xy=(x, y + r + 0.03), fontsize=10, ha="center", color=color)
ax.grid(True)
# Render the figure explicitly instead of echoing the ``plt`` module object.
plt.show()

## Training the model



In [None]:
from mowl.datasets.builtin import PPIYeastSlimDataset
from mowl.models.elembeddings.examples.model_ppi import ELEmPPI

# Instantiate the built-in yeast protein-protein interaction dataset.
dataset = PPIYeastSlimDataset()

In [None]:
# Configure the PPI example model on the yeast dataset (epochs=20, CPU).
model = ELEmPPI(
    dataset,
    embed_dim=30,
    margin=0.1,
    reg_norm=1,
    learning_rate=0.001,
    epochs=20,
    batch_size=4096,
    model_filepath=None,
    device='cpu',
)

# Run the model's own training loop.
model.train()

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [05:36<00:00, 16.83s/it]


1

## Evaluating the model

Now it is time to evaluate the embeddings. For this, we use the
`ModelRankBasedEvaluator` class (`mowl.evaluation.ModelRankBasedEvaluator`).



In [None]:
from mowl.evaluation.rank_based import ModelRankBasedEvaluator

# Evaluation needs no gradients, so everything runs under ``no_grad``.
with th.no_grad():
    # Presumably restores the checkpoint written during training -- confirm.
    model.load_best_model()
    evaluator = ModelRankBasedEvaluator(model, device="cpu", eval_method=model.eval_method)

    # ``show=True`` asks the evaluator to print the metrics.
    evaluator.evaluate(show=True)

['http://4932.Q0010', 'http://4932.Q0017', 'http://4932.Q0032', 'http://4932.Q0045', 'http://4932.Q0050', 'http://4932.Q0055', 'http://4932.Q0060', 'http://4932.Q0065', 'http://4932.Q0070', 'http://4932.Q0075', 'http://4932.Q0080', 'http://4932.Q0085', 'http://4932.Q0092', 'http://4932.Q0105', 'http://4932.Q0110', 'http://4932.Q0115', 'http://4932.Q0120', 'http://4932.Q0130', 'http://4932.Q0140', 'http://4932.Q0142', 'http://4932.Q0143', 'http://4932.Q0160', 'http://4932.Q0182', 'http://4932.Q0250', 'http://4932.Q0255', 'http://4932.Q0275', 'http://4932.Q0297', 'http://4932.YAL001C', 'http://4932.YAL002W', 'http://4932.YAL003W', 'http://4932.YAL004W', 'http://4932.YAL005C', 'http://4932.YAL007C', 'http://4932.YAL008W', 'http://4932.YAL009W', 'http://4932.YAL010C', 'http://4932.YAL011W', 'http://4932.YAL012W', 'http://4932.YAL013W', 'http://4932.YAL014C', 'http://4932.YAL015C', 'http://4932.YAL016C-B', 'http://4932.YAL016W', 'http://4932.YAL017W', 'http://4932.YAL018C', 'http://4932.YAL

INFO:root:Training scores created
100%|████████████████████████████████████████████████████████████████████████████| 12040/12040 [02:49<00:00, 71.14it/s]

Hits@1:   0.00 Filtered:   0.00
Hits@10:  0.00 Filtered:   0.00
Hits@100: 0.02 Filtered:   0.02
MR:       2865.44 Filtered: 2810.56
AUC:      0.53 Filtered:   0.53
Evaluation finished. Access the results using the "metrics" attribute.



