# mOWL: Python library for machine learning with ontologies

## Ontology creation:
To get started, you can install mOWL using pip:

In [None]:
!pip install mowl-borg==0.1.1 pystow==0.4.3

mOWL interfaces with the OWL API. For this, we first need to start the Java Virtual Machine (JVM):

In [None]:
# Start the JVM through mOWL; the `java.*` / `org.*` imports of the next cell come after this call.
# "10g" is presumably the amount of memory given to the JVM — TODO confirm against the mOWL docs.
import mowl
mowl.init_jvm("10g")

In [None]:
import os
from java.util import HashSet
from mowl.owlapi import OWLAPIAdapter
from org.semanticweb.owlapi.model import IRI

## Let's create our first ontology

In [None]:
# `adapter` is the factory used in the following cells for every class, property,
# individual and axiom.
adapter = OWLAPIAdapter()
# New ontology identified by the given IRI; axioms are added to it further down.
ontology = adapter.create_ontology("http://mowl/family")

## Class names

In [None]:
# One OWL class per family concept; every IRI is "http://" followed by the concept name.
# The two tuples below are aligned position by position.
(
    male, female, parent, person, mother, father,
    sibling, brother, sister, son, daughter,
) = (
    adapter.create_class(f"http://{concept}")
    for concept in (
        "Male", "Female", "Parent", "Person", "Mother", "Father",
        "Sibling", "Brother", "Sister", "Son", "Daughter",
    )
)

## Role names

In [None]:
# Object properties (roles). Only `has_child` is referenced by the axioms built in
# the visible cells of this notebook.
has_child = adapter.create_object_property("http://hasChild")
has_parent = adapter.create_object_property("http://hasParent")

## Individual names

In [None]:
# Named individuals; every IRI is "http://" followed by the person's name.
John, Jane, Robert, Melissa = (
    adapter.create_individual(f"http://{person_name}")
    for person_name in ("John", "Jane", "Robert", "Melissa")
)

## Axioms

Let's create some axioms of the form $A \sqsubseteq B$.

In [None]:
# Java HashSet that accumulates the axioms; the following cells keep adding to it.
axioms = HashSet()
# Atomic subsumptions: create_subclass_of(sub_class, super_class).
axioms.add(adapter.create_subclass_of(male, person))
axioms.add(adapter.create_subclass_of(female, person))
axioms.add(adapter.create_subclass_of(parent, person))
axioms.add(adapter.create_subclass_of(mother, female))
axioms.add(adapter.create_subclass_of(father, male))

Now, let's create some axioms of the form $A \sqcap B \sqsubseteq C$.

In [None]:
# Parent ⊓ Male ⊑ Father
parent_and_male = adapter.create_object_intersection_of(parent, male)
axioms.add(adapter.create_subclass_of(parent_and_male, father))
# Parent ⊓ Female ⊑ Mother
parent_and_female = adapter.create_object_intersection_of(parent, female)
axioms.add(adapter.create_subclass_of(parent_and_female, mother))

Now some axioms of the form $A \sqcup B \equiv C$.

In [None]:
# Male ⊔ Female ≡ Person
male_or_female = adapter.create_object_union_of(male, female)
axioms.add(adapter.create_equivalent_classes(male_or_female, person))

One axiom of the form $\neg A \equiv  B$.

In [None]:
# ¬Male ≡ Female
not_male = adapter.create_complement_of(male)
axioms.add(adapter.create_equivalent_classes(not_male, female))

One axiom of the form $A \sqsubseteq \exists R.B$.

In [None]:
# Parent ⊑ ∃hasChild.Person
has_child_person = adapter.create_object_some_values_from(has_child, person)
axioms.add(adapter.create_subclass_of(parent, has_child_person))

And finally, some assertion axioms of the form $C(a)$ and $R(a,b)$.

In [None]:
# Class assertions C(a): give each individual a type.
axioms.add(adapter.create_class_assertion(father, John))
axioms.add(adapter.create_class_assertion(mother, Jane))
axioms.add(adapter.create_class_assertion(male, Robert))
axioms.add(adapter.create_class_assertion(female, Melissa))
# Role assertions R(a, b): John and Jane each have Robert and Melissa as children.
axioms.add(adapter.create_object_property_assertion(has_child, John, Robert))
axioms.add(adapter.create_object_property_assertion(has_child, Jane, Robert))
axioms.add(adapter.create_object_property_assertion(has_child, John, Melissa))
axioms.add(adapter.create_object_property_assertion(has_child, Jane, Melissa))
# Add every axiom collected so far to the ontology.
adapter.owl_manager.addAxioms(ontology, axioms)

In [None]:
# Absolute path (resolved against the current working directory) where the ontology is saved.
# A plain string literal is enough here: the former f-string had no placeholders.
ont_file = os.path.abspath('family.owl')

In [None]:
# Serialize the ontology to `ont_file`; the target IRI is the path prefixed with 'file://'.
# NOTE(review): this presumably relies on a POSIX-style absolute path (leading "/") — confirm on Windows.
adapter.owl_manager.saveOntology(ontology, IRI.create('file://'+ont_file))

# Ontology projections into graphs

Ontologies are formed by a TBox, an ABox and an RBox. A knowledge
graph can be easily extracted from the ABox and the RBox. However, many
approaches have been developed to encode a graph representation of the
TBox, which is composed of (complex) concept descriptions. In mOWL, we
provide some methods that perform ontology projection into graphs:

- **Taxonomy projection**: the projection of axioms of the form $A
\sqsubseteq B$ as edges $(A, subclassof, B)$.

- **Taxonomy + relations**: the projection of axioms of the form $A
\sqsubseteq B$ and $A \sqsubseteq \exists R.B$ as edges $(A,
subclassof, B)$ and $(A, R, B)$, respectively.

- **DL2Vec projection**

- **OWL2Vec projection**

In [None]:
from mowl.projection import TaxonomyProjector, TaxonomyWithRelationsProjector, DL2VecProjector, OWL2VecStarProjector

In [None]:
# Disabled alternative: use the built-in FamilyDataset instead of a file on disk.
#from mowl.datasets.builtin import FamilyDataset
from mowl.datasets import PathDataset
#dataset = FamilyDataset()
# Load "family.owl" by relative path (the file name used when saving the ontology above).
dataset = PathDataset("family.owl")
# Project the loaded ontology with the taxonomy projector.
edges = TaxonomyProjector().project(dataset.ontology)

In [None]:
def nx_network(edges):
    """Draw the given edges as a directed graph and show the figure.

    Each endpoint is shortened to the text after its last "/" and that short
    name is used both as the node and as its label. Only ``src`` and ``dst``
    are read from an edge.

    :param edges: iterable of objects exposing string attributes ``src`` and ``dst``
    """
    import matplotlib.pyplot as plt
    import networkx as nx

    def short_name(iri):
        # Keep only what follows the last "/" of the identifier.
        return iri.rsplit("/", 1)[-1]

    graph = nx.DiGraph()
    graph.add_edges_from((short_name(e.src), short_name(e.dst)) for e in edges)

    plt.figure(figsize=(5, 5))
    layout = nx.spring_layout(graph)
    # Custom colours and sizes; every node is labelled with its own name.
    draw_options = dict(edge_color='black', width=1, linewidths=1,
                        node_size=500, node_color='cyan', alpha=0.9)
    nx.draw(graph, layout, labels={n: n for n in graph.nodes()}, **draw_options)
    plt.show()

In [None]:
nx_network(edges)

In [None]:
# DL2Vec projection of the family ontology, including its individuals.
dl2vec_proj = DL2VecProjector(bidirectional_taxonomy=True)
d2v_edges = dl2vec_proj.project(dataset.ontology, with_individuals=True)

In [None]:
nx_network(d2v_edges)

In [None]:
# OWL2Vec* projection of the same ontology; `o2v_edges` feeds the random walks below.
owl2vec_proj = OWL2VecStarProjector(bidirectional_taxonomy=True)
o2v_edges = owl2vec_proj.project(dataset.ontology)

In [None]:
nx_network(o2v_edges)

# Random-walk-based embeddings of ontologies

After generating the graph, we can embed it in different ways. Two approaches are supported in mOWL:
- Embeddings based on random walks
- Embeddings based on KGE

Let's try the approach with random walks.

In [None]:
from mowl.walking.deepwalk.model import DeepWalk
# Random-walk generator; the first three arguments are passed positionally.
walker =  DeepWalk(
             10, # num_walks
             4, # walk_length
             0.1, # alpha
             outfile = "walks_dw.txt", # file the walks are saved to
             workers = 4)
# Walk over the OWL2Vec* edges computed in the previous section.
walker.walk(o2v_edges)

## Process the walks using Word2Vec

In [None]:
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

# Read the walks back from the file the walker wrote, one sentence per line.
walk_corpus_file = walker.outfile
sentences = LineSentence(walk_corpus_file)

# 15-dimensional embeddings. NOTE(review): the keyword is presumably `size` on older
# gensim releases — confirm against the installed version.
w2v_model = Word2Vec(sentences, vector_size = 15) #vector_size/size

In [None]:
w2v_model.wv.most_similar("http://Jane")

## Task 🚧

We will add a new axiom to the ontology: $hasChild(John, Bob)$. We know that John and Jane already have two children together. Can we infer that Bob is also related to Jane?

In [None]:
# New individual and the single new assertion hasChild(John, Bob).
Bob = adapter.create_individual("http://Bob")
axioms.add(adapter.create_object_property_assertion(has_child, John, Bob))
# `axioms` still holds every axiom created earlier, so the whole set is passed again.
adapter.owl_manager.addAxioms(ontology, axioms)
# Save the extended ontology under a new file name (plain literal: the former
# f-string had no placeholders).
ont_file = os.path.abspath('family2.owl')
adapter.owl_manager.saveOntology(ontology, IRI.create('file://'+ont_file))

In [None]:
# Reload the extended ontology saved as "family2.owl" and project it into a graph.
dataset = PathDataset("family2.owl")
projector = OWL2VecStarProjector(bidirectional_taxonomy=True) # YOUR CODE HERE
new_edges = projector.project(dataset.ontology)

In [None]:
# Same pipeline as before on the new edges. The outfile name "walks_dw.txt" is the
# same one used by the earlier DeepWalk cell.
walker =  DeepWalk(
             10, #number of walks,
             5, #walk length,
             0.1, #alpha: restart parameter
             outfile = "walks_dw.txt", # /optional/path/to/save/walks,
             workers = 4)

walker.walk(new_edges)
walk_corpus_file = walker.outfile
sentences = LineSentence(walk_corpus_file)
# Rebinds `w2v_model` to a model with 5-dimensional vectors trained on the new walks.
w2v_model = Word2Vec(sentences, vector_size=5)

In [None]:
w2v_model.wv.most_similar("http://Jane")

# Syntactic embeddings of ontologies

Syntactic embedding uses the syntax of axioms to generate sentences out of them. mOWL provides methods to generate text sentences from the axioms and/or the annotations in the ontology. The syntax chosen to generate the sentences is [Manchester Syntax](https://www.w3.org/2007/OWL/draft/ED-owl2-manchester-syntax-20081128/).

In [None]:
# Same JVM start-up call as at the top of the notebook, repeated so that this section
# can be run on its own.
import mowl
mowl.init_jvm("10g")

We import our `Family Ontology` and the method `extract_axiom_corpus`, which extracts the axioms from the ontology and generates sentences in *Manchester Syntax*.

In [None]:
from mowl.corpus import extract_axiom_corpus
from mowl.datasets import PathDataset
# Load the family ontology from disk and turn its axioms into a corpus of sentences.
dataset = PathDataset("family.owl")
corpus = extract_axiom_corpus(dataset.ontology)
# Number of sentences in the corpus (displayed as the cell output).
len(corpus)

Let's see the corpus generated:

In [None]:
for s in corpus[:10]:
    print(s)

Now it is possible to input this corpus in a model like Word2Vec, which will generate numerical representations for our vocabulary. We will use the `gensim` library to do this.

In [None]:
from gensim.models import Word2Vec

# Tokenize each sentence on single spaces.
sentences = [s.split(" ") for s in corpus]
# min_count = 0 is passed so that no token is dropped for being infrequent.
w2v = Word2Vec(sentences, epochs=200, vector_size = 50, min_count = 0)

Finally, we can provide a visual representation of the entities. We will use a modified version of TSNE, which is implemented here:

In [None]:
from mowl.visualization import TSNE as MTSNE
from sklearn.manifold import TSNE as SKTSNE
import numpy as np
import matplotlib.pyplot as plt

class TSNE(MTSNE):
    """Variant of the mOWL ``TSNE`` plot with configurable perplexity and marker size.

    The attributes ``embeddings``, ``embedding_idx_dict``, ``labels`` and
    ``class_color_dict`` read below are expected to be provided by the base
    class constructor (not visible here).

    :param perplexity: perplexity forwarded to :class:`sklearn.manifold.TSNE`. Defaults to 5.
    :type perplexity: int, optional
    :param thickness: default marker size used by :meth:`show`. Defaults to 50.
    :type thickness: int, optional
    """

    def __init__(self, *args, perplexity=5, thickness = 50,  **kwargs):
        super().__init__(*args, **kwargs)

        self.perplexity = perplexity
        self.thickness = thickness

    def generate_points(self, epochs, workers=1, verbose=0):
        """This method will call the :meth:`sklearn.manifold.TSNE.fit_transform`
        method to generate the points for the plot.

        The 2-D points are stored in ``self.points`` and grouped per label in
        ``self.plot_data`` as ``label -> (xs, ys)``.

        :param epochs: Number of epochs to run the TSNE algorithm
        :type epochs: int
        :param workers: Number of workers to use for parallel processing. Defaults to 1.
        :type workers: int, optional
        :param verbose: Verbosity level. Defaults to 0.
        :type verbose: int, optional
        """
        # Imported locally: this class is defined before the notebook imports
        # `warnings`, so the complex-array branch would otherwise hit a NameError.
        import warnings

        points = np.array(list(self.embeddings.values()))
        if np.iscomplexobj(points):
            if verbose:
                warnings.warn("Complex numpy array detected. Only real part will be considered",
                              UserWarning)
            points = points.real

        # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
        # confirm against the installed version.
        reducer = SKTSNE(n_components=2, verbose=verbose, n_iter=epochs, n_jobs=workers,
                         perplexity=self.perplexity)
        self.points = reducer.fit_transform(points)
        self.plot_data = {}

        for name, idx in self.embedding_idx_dict.items():
            label = self.labels[name]
            x, y = tuple(self.points[idx])

            xs, ys = self.plot_data.setdefault(label, ([], []))
            xs.append(x)
            ys.append(y)

    def show(self, thickness = None):
        """ This method will call the :meth:`matplotlib.pyplot.show` method to show the plot.

        :param thickness: marker size; falls back to ``self.thickness`` when ``None``.
        :type thickness: int, optional
        """
        if thickness is None:
            thickness = self.thickness

        # The axes must exist whether or not a thickness was passed; creating them
        # only in the `None` branch made `show(thickness=...)` fail with NameError.
        _, ax = plt.subplots(figsize=(15, 15))

        for label, (xs, ys) in self.plot_data.items():
            color = self.class_color_dict[label]
            ax.scatter(xs, ys, color=color, label=label, s=thickness)
            # Write the label next to the first point of the group.
            ax.text(xs[0]+0.5, ys[0]+0.5, label, fontsize=12)

        # Legend and grid are set once, and only when something was drawn.
        if self.plot_data:
            ax.legend()
            ax.grid(True)

        plt.show()


In [None]:
#from scripts.tsne import TSNE

vectors = w2v.wv
vocab_dict = vectors.key_to_index
# Keep only the tokens that start with "http://"; the label is the text after the last "/".
name_to_label = {c: c.split("/")[-1] for c in vocab_dict if str(c).startswith("http://")}
# Embedding of each kept token.
name_to_emb = {c: vectors[[c]][0] for c in name_to_label}

tsne = TSNE(name_to_emb, name_to_label)
# 500 epochs on 4 workers.
tsne.generate_points(500, workers=4)

In [None]:
tsne.show(thickness=300)

In [None]:
# built-in imports
import sys
import torch as th
import logging
import numpy as np
import pickle as pkl
from mowl.visualization.base import TSNE
import matplotlib.pyplot as plt
from mowl.projection.edge import Edge
from mowl.datasets.builtin import GDADataset, GDAHumanDataset, GDAMouseDataset
from pykeen.models import TransE,ConvE,DistMult,TransR,TransD
from mowl.projection.dl2vec.model import DL2VecProjector
from mowl.kge import KGEModel
from mowl.evaluation.rank_based import EmbeddingsRankBasedEvaluator
from mowl.evaluation.base import TranslationalScore, CosineSimilarity
from mowl.projection.factory import projector_factory, PARSING_METHODS
from mowl.walking import DeepWalk
from gensim.models.word2vec import LineSentence
import gensim
from gensim.models import Word2Vec
from mowl.evaluation.rank_based import EmbeddingsRankBasedEvaluator
from mowl.evaluation.base import CosineSimilarity
from mowl.projection import TaxonomyWithRelationsProjector
from mowl.projection.edge import Edge

import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
warnings.filterwarnings(action='ignore',category=FutureWarning,module='gensim')

# Dataset

Use the Built-in Dataset

In [None]:
dataset = GDAMouseDataset()

The dataset will be downloaded to a folder named `gda_mouse`, which contains the training, validation and testing ontologies.

In [None]:
! ls gda_mouse/

# Graph-based embeddings

 ### Example for two methods: DL2vec and Owl2vec* methods

<font color='blue'><font size="4">1) DL2vec Prediction Method </font></font>


1. **Projecting the ontology**
- Project the ontology using the DL2Vec Projector class, with the specific rules used to project the ontology.
- The outcome of the projection algorithm is an edgelist.



In [None]:
from mowl.projection.dl2vec.model import DL2VecProjector
# The positional True is presumably `bidirectional_taxonomy` (the keyword used in the
# family example) — TODO confirm against the DL2VecProjector signature.
projector = DL2VecProjector(True)
# Edge lists of the training ontology and of the testing ontology.
train_edges = projector.project(dataset.ontology)
test_edges = projector.project(dataset.testing)

2. **Generating random walks**
- The random walks are generated using the DeepWalk.


In [None]:
walker = DeepWalk(10, # number of walks per node
                  10, # walk length
                  0.1, # restart probability
                  workers=4, outfile = 'walk',seed=40) # 4 threads, walks saved to 'walk', fixed seed

walks = walker.walk(train_edges)
# The corpus is read back from the file written by the walker.
walks_file = walker.outfile
sentences = LineSentence(walks_file)

3. **Training the Word2Vec model**
- To train the Word2Vec model, we rely on the Gensim library

In [None]:
model = Word2Vec(sentences, vector_size=100, epochs = 15, window=5, min_count=1, workers=10)

4. **Evaluating the embeddings**
- We are going to evaluate the plausibility of an association gene-disease with a gene against all possible diseases and check the rank of the true disease association using CosineSimilarity.


In [None]:
genes, diseases = dataset.evaluation_classes
# NOTE(review): `projector` is built here but not used in this cell; the evaluator below
# runs on `test_edges` / `train_edges` from the DL2Vec projection. The OPA2Vec evaluation
# cell projects dataset.ontology / dataset.testing with this projector instead — confirm
# which edge lists are intended.
projector = TaxonomyWithRelationsProjector(taxonomy=False,
                                           relations=["http://is_associated_with"])

vectors = model.wv
evaluator = EmbeddingsRankBasedEvaluator(
    vectors,
    test_edges,
    CosineSimilarity,
    training_set=train_edges,
    head_entities = genes.as_str,
    tail_entities = diseases.as_str,
    device = 'cpu')


evaluator.evaluate(show=True)

In [None]:
# Split the embedding vocabulary into diseases (keys containing 'OMIM') and genes
# (keys whose text after the first 7 characters — presumably the "http://" prefix —
# is numeric). Both lists keep the vocabulary order.
human_disease = [key for key in vectors.index_to_key if 'OMIM' in key]
mouse_genes = [key for key in vectors.index_to_key if key[7:].isnumeric()]

print(f'Number of the disease is {len(human_disease)}, and number of genes is {len(mouse_genes)}')

In [None]:
human_disease[:10]

In [None]:
mouse_genes[:10]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Collect the embeddings in the same order as the key lists, so row i / column j of
# the similarity matrix correspond to human_disease[i] / mouse_genes[j].
human_disease_vectors = [vectors[key] for key in human_disease]
mouse_genes_vectors = [vectors[key] for key in mouse_genes]

similarity = cosine_similarity(np.array(human_disease_vectors), np.array(mouse_genes_vectors))

print(f"The dimentions of this matrix is {similarity.shape}")

## Evaluating the predictions to find the most similar genes:

In [None]:
def find_similar_genes(disease_id, top_k, disease_genes_similarity_matrix, disease_keys, gene_keys):
    """Return up to ``top_k`` gene keys most similar to ``disease_id``, best first.

    :param disease_id: key of the disease; must be an element of ``disease_keys``
    :param top_k: maximum number of genes to return; a value larger than the number
        of genes returns every gene, a value below 1 returns an empty list
    :param disease_genes_similarity_matrix: 2-D array indexed as ``[disease, gene]``
    :param disease_keys: list of disease keys, aligned with the matrix rows
    :param gene_keys: sequence of gene keys, aligned with the matrix columns
    :return: list of gene keys ordered by decreasing similarity
    :raises ValueError: if ``disease_id`` is not in ``disease_keys``
    """
    disease_index = disease_keys.index(disease_id)
    # argsort is ascending, so flip it to put the highest similarity first.
    ranking = np.flip(np.argsort(disease_genes_similarity_matrix[disease_index]))
    # Slicing (rather than indexing over range(top_k)) keeps an oversized top_k from
    # raising IndexError; max() keeps the empty result for non-positive values.
    return [gene_keys[i] for i in ranking[:max(top_k, 0)]]

#associations from the file MGI_DO.rpt


#DOID:0080449	developmental and epileptic encephalopathy 16	OMIM:615338	mouse, laboratory	10090	Tbc1d24	224617	MGI:2443456
disease_id = 'http://OMIM_615338'
# Five best-ranked genes for this disease.
top_k = find_similar_genes(disease_id, 5 ,similarity, human_disease, mouse_genes )
# split("/")[2] is the part after "http://", i.e. the bare OMIM identifier.
print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')


#DOID:0080436	developmental and epileptic encephalopathy 4	OMIM:612164	human	9606	STXBP1	6812
#DOID:0060309	syndromic X-linked intellectual disability		human	9606	HNRNPH2	3188
#HNRNPH2	3188	Hnrnph2	MGI:1201779	MP:0001186, MP:0005386, MP:0010771
#disease_id = 'http://OMIM_612164'
#top_k = find_similar_genes(disease_id, 5 ,similarity, human_disease, mouse_genes )
#print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')


#OMIM_181500 : schizophrenia : DOID:5419	OMIM:181500	mouse, laboratory	10090	Magi2	50791	MGI:1354953
#disease_id = 'http://OMIM_181500'
#top_k = find_similar_genes(disease_id, 5 ,similarity, human_disease, mouse_genes )
#print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')


#OMIM_615643 : neurodegeneration with brain iron accumulation 6
#DOID:0110740	OMIM:615643	mouse, laboratory	10090	Coasy	71743	MGI:1918993
#disease_id = 'http://OMIM_615643'
#top_k = find_similar_genes(disease_id, 5 ,similarity, human_disease, mouse_genes )
#print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')

In [None]:
def find_similar_genes(disease_id, top_k, disease_genes_similarity_matrix, disease_keys, gene_keys):
    """Return up to ``top_k`` gene keys most similar to ``disease_id``, best first.

    :param disease_id: key of the disease; must be an element of ``disease_keys``
    :param top_k: maximum number of genes to return; a value larger than the number
        of genes returns every gene, a value below 1 returns an empty list
    :param disease_genes_similarity_matrix: 2-D array indexed as ``[disease, gene]``
    :param disease_keys: list of disease keys, aligned with the matrix rows
    :param gene_keys: sequence of gene keys, aligned with the matrix columns
    :return: list of gene keys ordered by decreasing similarity
    :raises ValueError: if ``disease_id`` is not in ``disease_keys``
    """
    disease_index = disease_keys.index(disease_id)
    # argsort is ascending, so flip it to put the highest similarity first.
    ranking = np.flip(np.argsort(disease_genes_similarity_matrix[disease_index]))
    # Slicing (rather than indexing over range(top_k)) keeps an oversized top_k from
    # raising IndexError; max() keeps the empty result for non-positive values.
    return [gene_keys[i] for i in ranking[:max(top_k, 0)]]

disease_id = 'http://OMIM_615643'
# Five best-ranked genes for this disease.
top_k = find_similar_genes(disease_id, 5 ,similarity, human_disease, mouse_genes )
# split("/")[2] is the part after "http://", i.e. the bare OMIM identifier.
print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')

-------------------------------------

# **Task 1 :**

<div class="alert alert-block alert-success">

<font size="4">
    Predict the <font color='SteelBlue'>top 10 similar genes</font> to
    diabetes mellitus disease OMIM ID: <font color='Tomato'>http://OMIM_608036</font>
    using <font color='red'>OWL2vec*</font> prediction method

</font>

</div>


<div class="alert alert-block alert-info">
<b>Tip:</b> Follow the <b>TODO</b> instructions to modify the script. The rest stays the same: you just need to run the cells to execute the code.
</div>


----

<font color='blue'><font size="4">2) OWL2vec* Prediction Method </font></font>


1. **Projecting the ontology**
- Project the ontology using the OWL2Vec* Projector class, with the specific rules used to project the ontology.
- The outcome of the projection algorithm is an edgelist.


In [None]:
# Exercise template: complete the import on the next line before running this cell.
from mowl.projection import #TODO: import the appropriate function (refer to https://mowl.readthedocs.io/en/latest/api/projection/index.html)
dataset = GDAMouseDataset()
projector = OWL2VecStarProjector(True)
# Edge lists of the training ontology and of the testing ontology.
train_edges = projector.project(dataset.ontology)
test_edges = projector.project(dataset.testing)

2. **Generating random walks**
- The random walks are generated using the DeepWalk.


In [None]:
# Exercise template: fill in every blank marked TODO before running this cell.
walker = DeepWalk( ,#TODO: add the number of walks per node
                   ,#TODO: add the walk length
                  workers=4, # number of threads
                  outfile = , #TODO: add the name of the output file for the walks
                  seed=40) #fix the random seed

walks = walker.walk(train_edges)
# The corpus is read back from the file written by the walker.
walks_file = walker.outfile
sentences = LineSentence(walks_file)

3. **Training the Word2Vec model**
- To train the Word2Vec model, we rely on the Gensim library

In [None]:
# Exercise template: fill in the two blanks marked TODO before running this cell.
model = Word2Vec(sentences,
                 vector_size= , #TODO: add the size of the vector
                 epochs = ,     #TODO: update the number of training epochs
                 window=5, min_count=1, workers=10)

4. **Evaluating the embeddings**
- We are going to evaluate the plausibility of an association gene-disease with a gene against all possible diseases and check the rank of the true disease association using CosineSimilarity.


In [None]:
genes, diseases = dataset.evaluation_classes
# NOTE(review): `projector` is built here but not used in this cell; the evaluator below
# runs on the `test_edges` / `train_edges` of the OWL2Vec* projection — confirm intent.
projector = TaxonomyWithRelationsProjector(taxonomy=False,
                                           relations=["http://is_associated_with"])

vectors = model.wv
evaluator = EmbeddingsRankBasedEvaluator(
    vectors,
    test_edges,
    CosineSimilarity,
    training_set=train_edges,
    head_entities = genes.as_str,
    tail_entities = diseases.as_str,
    # Use the GPU when there is one, otherwise fall back to the CPU like the other
    # evaluation cells, so the cell also runs on machines without CUDA.
    device = 'cuda' if th.cuda.is_available() else 'cpu')

evaluator.evaluate(show=True)

In [None]:
# Split the embedding vocabulary into diseases (keys containing 'OMIM') and genes
# (keys whose text after the first 7 characters — presumably the "http://" prefix —
# is numeric). Both lists keep the vocabulary order.
human_disease = [key for key in vectors.index_to_key if 'OMIM' in key]
mouse_genes = [key for key in vectors.index_to_key if key[7:].isnumeric()]

print(f'Number of the disease is {len(human_disease)}, and number of genes is {len(mouse_genes)}')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Collect the embeddings in the same order as the key lists, so row i / column j of
# the similarity matrix correspond to human_disease[i] / mouse_genes[j].
human_disease_vectors = [vectors[key] for key in human_disease]
mouse_genes_vectors = [vectors[key] for key in mouse_genes]

similarity = cosine_similarity(np.array(human_disease_vectors), np.array(mouse_genes_vectors))

print("the dimentions of this matrix is ", similarity.shape)

## Evaluating the predictions to find the most similar genes:

In [None]:
def find_similar_genes(disease_id, top_k, disease_genes_similarity_matrix, disease_keys, gene_keys):
    """Return up to ``top_k`` gene keys most similar to ``disease_id``, best first.

    :param disease_id: key of the disease; must be an element of ``disease_keys``
    :param top_k: maximum number of genes to return; a value larger than the number
        of genes returns every gene, a value below 1 returns an empty list
    :param disease_genes_similarity_matrix: 2-D array indexed as ``[disease, gene]``
    :param disease_keys: list of disease keys, aligned with the matrix rows
    :param gene_keys: sequence of gene keys, aligned with the matrix columns
    :return: list of gene keys ordered by decreasing similarity
    :raises ValueError: if ``disease_id`` is not in ``disease_keys``
    """
    disease_index = disease_keys.index(disease_id)
    # argsort is ascending, so flip it to put the highest similarity first.
    ranking = np.flip(np.argsort(disease_genes_similarity_matrix[disease_index]))
    # Slicing (rather than indexing over range(top_k)) keeps an oversized top_k from
    # raising IndexError; max() keeps the empty result for non-positive values.
    return [gene_keys[i] for i in ranking[:max(top_k, 0)]]



# Exercise template: fill in every blank marked TODO before running this cell.
disease_id = #TODO: write the disease OMIM ID

number_of_genes =  #TODO: number of genes to be ranked

top_k = find_similar_genes( , #TODO: disease OMIM ID
                            , #TODO: number of genes
                           similarity,
                           human_disease,
                           mouse_genes)

# Header line, then one line per gene with its 1-based rank.
print(f'The top {number_of_genes} most similar gene to disease {disease_id.split("/")[2]} are:')

for rank, ranked_gene in enumerate(top_k, start=1):
    print(f" Gene in Rank {rank} is : {ranked_gene}")

----------------------------------------

# Syntactic embeddings

In [None]:
from mowl.corpus import extract_and_save_axiom_corpus
from mowl.owlapi import OWLAPIAdapter
from mowl.reasoning import MOWLReasoner
from org.semanticweb.elk.owlapi import ElkReasonerFactory
from java.util import HashSet
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
import os

<font color='blue'><font size="4">1) Onto2Vec Prediction Method </font></font>


This example corresponds to the paper **Onto2Vec: joint vector-based representation of biological entities and their ontology-based annotations**.

This method is an approach to learn numerical representations (embeddings) of (biomedical) ontologies by representing ontology axioms as text sequences and applying an unsupervised learning algorithm such as Word2Vec. Onto2Vec uses an ontology reasoner to infer new axioms as a preprocessing step. The algorithm is tested on the protein-protein interaction task.

For this algorithm, we need three components:

1. The reasoner

2. The corpus generator

3. The Word2Vec model


**1) Inferring new axioms**

- Onto2Vec uses an ontology reasoner to infer new axioms as a preprocessing step. In the original paper, the authors used the HermiT reasoner. For this example, we use the ELK reasoner.


In [None]:
# ELK reasoner over the dataset ontology, wrapped in mOWL's reasoner helper.
reasoner_factory = ElkReasonerFactory()
reasoner = reasoner_factory.createReasoner(dataset.ontology)
mowl_reasoner = MOWLReasoner(reasoner)

In [None]:
# The reasoner is wrapped in the MOWLReasoner class (mowl.reasoning.base.MOWLReasoner)
# in order to use the shortcuts that mOWL provides, such as:
#
# - inferring subclass axioms
# - inferring equivalent class axioms
# - inferring disjoint axioms (not applicable for this example since we use ELK reasoner)

# Infer axioms for every class in the signature of the ontology.
classes = dataset.ontology.getClassesInSignature()
subclass_axioms = mowl_reasoner.infer_subclass_axioms(classes)
equivalent_class_axioms = mowl_reasoner.infer_equivalent_class_axioms(classes)

In [None]:
# We can now add the inferred axioms to the ontology.

adapter = OWLAPIAdapter()
manager = adapter.owl_manager

# Gather both kinds of inferred axioms in one Java set ...
axioms = HashSet()
axioms.addAll(subclass_axioms)
axioms.addAll(equivalent_class_axioms)

# ... and add them to the dataset ontology (the ontology object itself is extended).
manager.addAxioms(dataset.ontology, axioms)

**2- The corpus generator**

In [None]:
extract_and_save_axiom_corpus(dataset.ontology, "onto2vec_corpus.txt")

**3- Generating the corpus and training the model**
- Now that we have an extended ontology, we can generate the corpus out of it. After that, we can train the Word2Vec model.


In [None]:
# Read the corpus saved in the previous cell, one sentence per line.
sentences = LineSentence("onto2vec_corpus.txt")
# 5-dimensional embeddings, 10 epochs, context window of 2.
model_onto = Word2Vec(sentences, vector_size=5, epochs=10, window=2, min_count=1, workers=4)

In [None]:
# Cleaning up memory
# os.remove("onto2vec_corpus.txt")

## Evaluating the embeddings
- We are going to evaluate the plausibility of an association gene-disease with a gene against all possible diseases and check the rank of the true disease association using CosineSimilarity.


In [None]:
genes, diseases = dataset.evaluation_classes
projector = TaxonomyWithRelationsProjector(taxonomy=False,
                                           relations=["http://is_associated_with"])

# Onto2Vec has no graph projection of its own, so project the evaluation edges here
# (as the OPA2Vec evaluation cell does) instead of depending on `train_edges` /
# `test_edges` left over from the graph-based section.
eval_train_edges = projector.project(dataset.ontology)
eval_test_edges = projector.project(dataset.testing)

vectors = model_onto.wv
evaluator = EmbeddingsRankBasedEvaluator(
    vectors,
    eval_test_edges,
    CosineSimilarity,
    training_set=eval_train_edges,
    head_entities = genes.as_str,
    tail_entities = diseases.as_str,
    device = 'cpu')

evaluator.evaluate(show=True)

In [None]:
# Diseases are the keys containing 'OMIM'; genes are the keys whose text after the
# first 7 characters is numeric. Both lists keep the vocabulary order.
human_disease = [key for key in vectors.index_to_key if 'OMIM' in key]
mouse_genes = [key for key in vectors.index_to_key if key[7:].isnumeric()]

# Embeddings in the same order as the key lists (rows: diseases, columns: genes).
human_disease_vectors = [vectors[key] for key in human_disease]
mouse_genes_vectors = [vectors[key] for key in mouse_genes]

similarity = cosine_similarity(np.array(human_disease_vectors), np.array(mouse_genes_vectors))

# Ten best-ranked genes for one example disease.
disease_id = 'http://OMIM_612164'
top_k = find_similar_genes(disease_id, 10, similarity, human_disease, mouse_genes)
print(f'The most similar gene to disease {disease_id.split("/")[2]} are: {top_k}')

----

# **Task 2 :**

<div class="alert alert-block alert-success">

<font size="4">
    Predict the <font color='SteelBlue'>top 5 similar genes</font> to
    diabetes mellitus disease OMIM ID: <font color='Tomato'>http://OMIM_608036</font>
    using <font color='red'>OPA2Vec</font> prediction method

</font>

</div>

<div class="alert alert-block alert-info">
<b>Tip:</b> Follow the <b>TODO</b> instructions to modify the script. The rest stays the same: you just need to run the cells to execute the code.
</div>


---

<font color='blue'><font size="4">2) OPA2Vec Prediction Method </font></font>


This example corresponds to the paper **OPA2Vec: combining formal and informal content of biomedical ontologies to improve similarity-based prediction**.

This method is an extension of **Onto2Vec** that, apart from formal knowledge (i.e. axioms), also uses informal knowledge such as entity metadata (i.e. synonyms, definitions, etc.).

For this algorithm, we need four components:

1.  The reasoner

2. The corpus generator

<font color='red'>3. The annotations generator</font>

4. The Word2Vec model

In [None]:
from mowl.corpus import extract_and_save_axiom_corpus
from mowl.owlapi import OWLAPIAdapter
from mowl.reasoning import MOWLReasoner
from org.semanticweb.elk.owlapi import ElkReasonerFactory
from java.util import HashSet
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
import os
from sklearn.metrics.pairwise import cosine_similarity

from mowl.corpus import extract_and_save_axiom_corpus, extract_and_save_annotation_corpus
# OPA2Vec use annotation so we need to import extract_and_save_annotation_corpus


**1) Inferring new axioms**

- OPA2Vec uses an ontology reasoner to infer new axioms as a preprocessing step. In the original paper, the authors used the HermiT reasoner. For this example, we use the ELK reasoner.



In [None]:
# ELK reasoner over the dataset ontology, wrapped in mOWL's reasoner helper.
reasoner_factory = ElkReasonerFactory()
reasoner = reasoner_factory.createReasoner(dataset.ontology)
mowl_reasoner = MOWLReasoner(reasoner)

We wrap the reasoner into the **MOWLReasoner** class in order to use some shortcuts the mOWL provides such as:

- inferring subclass axioms

- inferring equivalent class axioms

- inferring disjoint axioms (not applicable for this example since we use ELK reasoner)


In [None]:
# Infer subclass and equivalent-class axioms for every class in the ontology signature.
classes = dataset.ontology.getClassesInSignature()
subclass_axioms = mowl_reasoner.infer_subclass_axioms(classes)
equivalent_class_axioms = mowl_reasoner.infer_equivalent_class_axioms(classes)

We can now add the inferred axioms to the ontology:

**2- The corpus generator**

In [None]:
adapter = OWLAPIAdapter()
manager = adapter.owl_manager

# Gather both kinds of inferred axioms in one Java set ...
axioms = HashSet()
axioms.addAll(subclass_axioms)
axioms.addAll(equivalent_class_axioms)

# ... and add them to the dataset ontology (the ontology object itself is extended).
manager.addAxioms(dataset.ontology, axioms)

In [None]:
extract_and_save_axiom_corpus(dataset.ontology, "opa2vec_corpus.txt")

<font color='red'> **3- The annotations generator**</font>

In [None]:
#TODO : extract and save the annotations (dataset.ontology, "opa2vec_corpus.txt", mode="a")

**4- Generating the corpus and training the model**
- Now that we have an extended ontology, we can generate the corpus out of it. After that, we can train the Word2Vec model.



In [None]:
# Read the corpus file written by the previous cells, one sentence per line.
sentences = LineSentence("opa2vec_corpus.txt")

# Exercise template: fill in the two blanks marked TODO before running this cell.
model = Word2Vec(sentences,
                 vector_size= , #TODO: add the size of the vector
                 epochs = ,     #TODO: update the number of training epochs
                 window=5, min_count=1, workers=10)

## Evaluating the embeddings

In [None]:
genes, diseases = dataset.evaluation_classes
# Projector restricted to the "http://is_associated_with" relation, taxonomy edges disabled.
projector = TaxonomyWithRelationsProjector(taxonomy=False,
                                           relations=["http://is_associated_with"])

# Edges of the training ontology (passed as `training_set`) and of the testing
# ontology (the edges being ranked).
eval_train_edges = projector.project(dataset.ontology)
eval_test_edges = projector.project(dataset.testing)

vectors = model.wv
evaluator = EmbeddingsRankBasedEvaluator(
    vectors,
    eval_test_edges,
    CosineSimilarity,
    training_set=eval_train_edges,
    head_entities = genes.as_str,
    tail_entities = diseases.as_str,
    device = 'cpu')

evaluator.evaluate(show=True)

In [None]:
def find_similar_genes(disease_id, top_k, disease_genes_similarity_matrix, disease_keys, gene_keys):
    """Return up to ``top_k`` gene keys most similar to ``disease_id``, best first.

    :param disease_id: key of the disease; must be an element of ``disease_keys``
    :param top_k: maximum number of genes to return; a value larger than the number
        of genes returns every gene, a value below 1 returns an empty list
    :param disease_genes_similarity_matrix: 2-D array indexed as ``[disease, gene]``
    :param disease_keys: list of disease keys, aligned with the matrix rows
    :param gene_keys: sequence of gene keys, aligned with the matrix columns
    :return: list of gene keys ordered by decreasing similarity
    :raises ValueError: if ``disease_id`` is not in ``disease_keys``
    """
    disease_index = disease_keys.index(disease_id)
    # argsort is ascending, so flip it to put the highest similarity first.
    ranking = np.flip(np.argsort(disease_genes_similarity_matrix[disease_index]))
    # Slicing (rather than indexing over range(top_k)) keeps an oversized top_k from
    # raising IndexError; max() keeps the empty result for non-positive values.
    return [gene_keys[i] for i in ranking[:max(top_k, 0)]]


vectors = model.wv

# Diseases are the keys containing 'OMIM'; genes are the keys that contain "http://"
# and whose text after the first 7 characters is numeric. Vocabulary order is kept.
human_disease = [key for key in vectors.index_to_key if 'OMIM' in key]
mouse_genes = [
    key for key in vectors.index_to_key
    if 'http://' in key and key[7:].isnumeric()
]

# Embeddings in the same order as the key lists (rows: diseases, columns: genes).
human_disease_vectors = [vectors[key] for key in human_disease]
mouse_genes_vectors = [vectors[key] for key in mouse_genes]

similarity = cosine_similarity(np.array(human_disease_vectors), np.array(mouse_genes_vectors))

In [None]:
# Exercise template: fill in every blank marked TODO before running this cell.
disease_id =  #TODO: write the disease OMIM ID

number_of_genes = #TODO: number of genes to be ranked

top_k = find_similar_genes( , #TODO: disease OMIM ID
                            , #TODO: number of genes
                           similarity,
                           human_disease,
                           mouse_genes)

# Header line, then one line per gene with its 1-based rank.
print(f'The top {number_of_genes} most similar gene to disease {disease_id.split("/")[2]} are:')

for rank, ranked_gene in enumerate(top_k, start=1):
    print(f" Gene in Rank {rank} is : {ranked_gene}")

------

# Ontologies and Text-mining

Genes and diseases were extracted from text using exact matching. Refer to the notebook [Ontologies_and_text_mining](https://github.com/bio-ontology-research-group/mowl-tutorial/blob/main/notebooks/02_Ontologies_and_text_mining.ipynb) to see how the model was trained, and to the presentation [Ontologies and Text-mining](https://github.com/bio-ontology-research-group/mowl-tutorial/blob/main/slides/Ontologies%20and%20text%20mining.pdf) for more details.


In [None]:
from gensim.models import Word2Vec
import pickle as pkl
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Load the saved Word2Vec model, then split its vocabulary into disease and
# gene entities and compare their vectors.
word2vec_file = 'w2v_model/wv_model'
vectors = Word2Vec.load(word2vec_file)

# Disease entities: every vocabulary key that contains 'OMIM_'.
human_disease = [entity for entity in vectors.wv.index_to_key if 'OMIM_' in entity]

# Gene entities: keys of the form 'http://' followed only by numeric characters.
mouse_genes = [
    entity
    for entity in vectors.wv.index_to_key
    if entity.startswith('http://') and entity[7:].isnumeric()
]

# Embeddings in the same order as the two key lists above.
human_disease_vectors = [vectors.wv[name] for name in human_disease]
mouse_genes_vectors = [vectors.wv[name] for name in mouse_genes]

# Rows follow `human_disease`, columns follow `mouse_genes`.
similarity = cosine_similarity(
    np.array(human_disease_vectors),
    np.array(mouse_genes_vectors),
)

## Evaluating the predictions to find the most similar genes:

In [None]:
# Evaluating the similarity

def find_similar_genes(disease_id, top_k, disease_genes_similarity_matrix, disease_keys, gene_keys):
    """Return the genes most similar to a disease, best match first.

    Args:
        disease_id: Identifier of the query disease; it is looked up with
            ``disease_keys.index``.
        top_k: Maximum number of genes to return. A value larger than the
            number of genes returns every gene; a value <= 0 returns ``[]``.
        disease_genes_similarity_matrix: Similarity scores indexed as
            ``[disease_index][gene_index]``; rows follow the order of
            ``disease_keys`` and columns the order of ``gene_keys``.
        disease_keys: Sequence of disease identifiers.
        gene_keys: Sequence of gene identifiers.

    Returns:
        A list with at most ``top_k`` entries of ``gene_keys``, ordered by
        decreasing similarity to ``disease_id``.

    Raises:
        ValueError: If ``disease_keys`` is a list and ``disease_id`` is not
            one of its elements.
    """
    disease_index = disease_keys.index(disease_id)
    # argsort is ascending, so reverse it to put the highest similarity first.
    ranked_gene_indices = np.argsort(disease_genes_similarity_matrix[disease_index])[::-1]
    # Slicing (instead of indexing range(top_k)) avoids an IndexError when
    # fewer than top_k genes are available.
    return [gene_keys[i] for i in ranked_gene_indices[:max(top_k, 0)]]

# Query disease: OMIM 114500 (colorectal cancer).
# Source annotation: DOID:9256 | colorectal cancer | OMIM:114500 | human | 9606 | KDR | 3791
disease_id = 'http://OMIM_114500'

# Ten best-matching genes for the query disease.
top_k = find_similar_genes(
    disease_id,
    10,
    similarity,
    human_disease,
    mouse_genes,
)
print(f'The most similar genes to disease {disease_id.split("/")[2]} are:')
# Ranks are printed starting from 1.
for rank, gene in enumerate(top_k, start=1):
    print(rank, gene)

# The linked genes at rank 7



# Model-theoretic ontology embedding methods

## EL-Embeddings

Import the mOWL EL-Embeddings module (`ELEmModule`) and the base class for EL embedding models (`EmbeddingELModel`):

In [None]:
from mowl.models.elembeddings.module import ELEmModule
from mowl.base_models.elmodel import EmbeddingELModel

Define the model and training strategy

In [None]:
import torch
from torch import nn
from tqdm import trange
import numpy as np

class ELEmbeddings(EmbeddingELModel):
    """EL-Embeddings model with a simple full-dataset training loop.

    Wraps an ``ELEmModule`` (built in :meth:`init_model`) and optimises it
    with Adam in :meth:`train`.

    Args:
        dataset: Dataset forwarded to ``EmbeddingELModel``.
        embed_dim: Embedding dimension passed to ``ELEmModule``.
        margin: Margin passed to ``ELEmModule``.
        reg_norm: Stored on the instance; not used by the code in this class.
        learning_rate: Learning rate of the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Forwarded to ``EmbeddingELModel``; :meth:`train` itself
            feeds every GCI dataset whole.
        model_filepath: Forwarded to ``EmbeddingELModel``; :meth:`train`
            saves the weights to ``self.model_filepath``.
        device: Device the module is moved to (e.g. ``'cpu'``).
    """

    def __init__(self,
                 dataset,
                 embed_dim=50,
                 margin=0,
                 reg_norm=1,
                 learning_rate=0.001,
                 epochs=1000,
                 batch_size=4096 * 8,
                 model_filepath=None,
                 device='cpu'
                 ):
        super().__init__(dataset, batch_size, extended=True, model_filepath=model_filepath)

        self.embed_dim = embed_dim
        self.margin = margin
        self.reg_norm = reg_norm
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.device = device
        self._loaded = False
        self._loaded_eval = False
        # NOTE(review): the base class is initialised with extended=True but
        # the attribute is reset to False here; confirm this is intended.
        self.extended = False
        self.init_model()

    def init_model(self):
        """Create the ``ELEmModule`` and move it to ``self.device``."""
        self.model = ELEmModule(
            len(self.class_index_dict),  # number of ontology classes
            len(self.object_property_index_dict),  # number of ontology object properties
            embed_dim=self.embed_dim,
            margin=self.margin
        ).to(self.device)

    def train(self, checkpoint=1):
        """Train the module and save its weights after every epoch.

        Args:
            checkpoint: The training loss is printed every ``checkpoint``
                epochs.

        Raises:
            ValueError: If every GCI training dataset is empty, because
                there is then no loss to back-propagate.
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        for epoch in trange(self.epochs):
            self.model.train()

            # ``training_datasets`` is used directly: each item is a pair
            # (GCI name, GCI tensor data). Every non-empty dataset is passed
            # whole and contributes the mean of its loss values.
            gci_losses = [
                torch.mean(self.model(gci_dataset[:], gci_name))
                for gci_name, gci_dataset in self.training_datasets.items()
                if len(gci_dataset) > 0
            ]
            if not gci_losses:
                # Without this guard the loss would be the int 0 and
                # ``backward()`` would fail with an unrelated AttributeError.
                raise ValueError(
                    "No training data: every GCI dataset is empty, "
                    "so there is no loss to optimise."
                )
            loss = sum(gci_losses)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = loss.detach().item()
            torch.save(self.model.state_dict(), self.model_filepath)
            if (epoch + 1) % checkpoint == 0:
                print(f'\nEpoch {epoch}: Train loss: {train_loss:4f}')

Create the dataset from the `family.owl` ontology file

In [None]:
from mowl.datasets import PathDataset

# Build a mOWL dataset from the local file 'family.owl'
# (presumably the family ontology created earlier in this notebook; TODO confirm).
family_dataset = PathDataset('family.owl')

Train the model

In [None]:
# Instantiate the model on the family dataset. embed_dim=2 gives each class
# two coordinates, which is what the plotting cell further down reads.
elembeddings = ELEmbeddings(family_dataset,
                     embed_dim=2,
                     margin=0.1,
                     reg_norm=1,
                     learning_rate=0.01,
                     epochs=1000,
                     batch_size=2,
                     model_filepath=None,
                     device='cpu')

# Train, printing the training loss every 100 epochs.
elembeddings.train(checkpoint=100)

Extract embeddings

In [None]:
# Weights of the module's `class_embed` layer as a NumPy array
# (presumably one row per ontology class; TODO confirm).
embeds = elembeddings.model.class_embed.weight.cpu().detach().numpy()
# Weights of the `class_rad` layer; the absolute value makes them
# non-negative. They are used as circle radii in the plotting cell below.
rs = np.abs(elembeddings.model.class_rad.weight.cpu().detach().numpy())
# Class names from the class -> index mapping
# (assumed to be in the same order as the rows of `embeds`; TODO confirm).
classes = list(elembeddings.class_index_dict.keys())
# Last expression of the cell: the notebook displays both arrays.
rs, embeds

Plot embeddings

In [None]:
import matplotlib.pyplot as plt

# Keep only the last "/"-separated segment of each class name as its label.
classes = [item.split('/')[-1] for item in classes]
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
fig, ax = plt.subplots()
plt.axis('equal')
ax.set_xlim(-5, 4)
ax.set_ylim(-3, 4)
# Draw every class as a circle: centre from `embeds`, radius from `rs`.
for i, (name, center, radius) in enumerate(zip(classes, embeds, rs)):
    # Skip labels ending in 'hing' (presumably owl:Thing and owl:Nothing).
    if name.endswith('hing'):
        continue
    x, y = center[0], center[1]
    # Cycle through the palette; the index counts skipped classes too.
    color = colors[i % len(colors)]
    ax.add_artist(plt.Circle((x, y), radius, fill=False, edgecolor=color, label=name))
    # Put the label just above the top of the circle.
    ax.annotate(name, xy=(x, y + radius + 0.03), fontsize=10, ha="center", color=color)
ax.grid(True)
# Render the figure.
plt.show()