# OWL2Vec*

In [None]:
# Extend the import path with the directory three levels up
# (presumably the local mOWL checkout root — confirm).
import sys
sys.path.append("../../../")

import torch as th
import logging

# The JVM is started here, before the mowl submodules below are imported.
# NOTE(review): "2g" is presumably the JVM memory limit, and the ordering
# (init_jvm before the other mowl imports) looks deliberate — confirm
# against the mOWL documentation before reordering these lines.
import mowl
mowl.init_jvm("2g")
from mowl.datasets.ppi_yeast import PPIYeastSlimDataset
from mowl.datasets.base import PathDataset

from mowl.embeddings.graph_based.owl2vec.model import OWL2VecStar
from gensim.models import Word2Vec
import pickle as pkl
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)

## OWL2Vec*

## OWL2Vec* with DeepWalk

In [None]:
# Load the ontology from a local OWL file; the two remaining positional
# arguments are passed as None.
# Alternative input: PPIYeastSlimDataset() can be substituted here.
dataset = PathDataset("data/mp.owl", None, None)

# Build the OWL2Vec* model. The second positional argument is the path that
# is later handed to Word2Vec.load. The keyword groups below are organised by
# parameter name only — see the OWL2VecStar documentation for exact semantics.
model = OWL2VecStar(
    dataset,
    "data/owl2vec_deepwalk",
    # Ontology projection options.
    bidirectional_taxonomy=True,
    only_taxonomy=True,
    include_literals=False,
    # Random-walk options.
    walking_method="deepwalk",
    num_walks=32,
    walk_length=50,
    alpha=0,
    walks_outfile="data/walks_qwer",
    # Embedding options.
    vector_size=100,
    window=5,
    wv_epochs=5,
    workers=16,
)

In [None]:
# Train the model built in the previous cell.
# NOTE(review): presumably this writes the Word2Vec model to the output path
# given to the constructor, since a later cell loads from that path — confirm.
model.train()

In [None]:
# Load the gensim Word2Vec model from the same path that was passed as the
# output file when constructing the OWL2VecStar model.
word2VecModel = Word2Vec.load("data/owl2vec_deepwalk")

In [None]:
# Keep a handle on the model's word vectors; later cells index it by
# entity name.
vectors = word2VecModel.wv

In [None]:
# Map every model entity whose name starts with "4932" to its trained vector.
# NOTE(review): "4932" is presumably the yeast taxon prefix used for protein
# identifiers — confirm against the dataset.
dl2vec_deepwalk_embeddings = {
    entity: vectors[entity]
    for entity in model.entities
    if entity.startswith("4932")
}

# Persist the selected embeddings as a pickle for later reuse.
with open("data/protDl2vecD_emb", "wb") as out_handle:
    pkl.dump(dl2vec_deepwalk_embeddings, out_handle)

In [None]:
# Build a mapping from identifier to the value in column 4 of a tab-separated
# file (presumably protein ID -> EC number, going by the file and variable
# names — confirm).
ec_numbers = {}
with open('data/yeast_ec.tab') as ec_table:
    next(ec_table)  # skip the first line (presumably a header)
    for row in ec_table:
        fields = row.strip().split('\t')
        # Ignore rows with fewer than five columns or an empty ID column.
        if len(fields) < 5 or not fields[3]:
            continue
        # Column 3 may hold several ';'-separated IDs; only the first is used.
        primary_id = fields[3].split(';')[0]
        ec_numbers[primary_id] = fields[4]

In [None]:
# Restrict the embeddings to the identifiers that also appear in ec_numbers,
# keeping the iteration order of ec_numbers.
ec_dict = {
    prot: dl2vec_deepwalk_embeddings[prot]
    for prot in ec_numbers
    if prot in dl2vec_deepwalk_embeddings
}

# Stack the vectors into a (num_proteins, vector_size) float32 matrix and
# remember which identifier each row belongs to.
size = model.vector_size
embeds = np.zeros((len(ec_dict), size), dtype=np.float32)
nodemap = {}
for row, (prot, vec) in enumerate(ec_dict.items()):
    embeds[row, :] = vec
    nodemap[row] = prot


# Project the embedding matrix down to two dimensions with t-SNE.
X = TSNE(n_components=2, verbose=1, n_iter=5000, n_jobs=8).fit_transform(embeds)

In [None]:
# Group the 2-D points by the part of the EC string before the first '.'.
# Each entry holds [x_coordinates, y_coordinates]. The '0' entry is created
# up front and is never drawn by the plotting loop below.
classes = {'0': [[], []]}
for row, prot in nodemap.items():
    if prot not in ec_numbers:
        continue
    top_level = ec_numbers[prot].split('.')[0]
    xs, ys = classes.setdefault(top_level, [[], []])
    xs.append(X[row, 0])
    ys.append(X[row, 1])

# One rainbow colour is generated per entry in `classes`; colours are drawn
# from the iterator only for the classes that are actually plotted.
colors = iter(plt.cm.rainbow(np.linspace(0, 1, len(classes))))
fig, ax = plt.subplots(figsize=(20, 20))

for label, (xs, ys) in classes.items():
    if label != '0':
        ax.scatter(xs, ys, color=next(colors), label=label)

ax.legend()
ax.grid(True)
plt.savefig('data/fig_dl2vec_deepwalk.jpg')
plt.show()

## DL2Vec with Node2Vec

In [None]:
# DL2Vec is not imported by the notebook's import cell, so the original cell
# failed with NameError. Import it here, after the JVM has been started.
# NOTE(review): the module path mirrors the OWL2VecStar import used at the top
# of the notebook — confirm it against the installed mOWL version.
from mowl.embeddings.graph_based.dl2vec.model import DL2Vec

# Switch to the yeast protein-protein interaction dataset for this section.
dataset = PPIYeastSlimDataset()

# Build the DL2Vec model with node2vec walks. The second positional argument
# is the path that is later handed to Word2Vec.load.
model = DL2Vec(
    dataset,
    "data/dl2vec_node2vec",
    bidirectional_taxonomy=True,
    # Random-walk options (p and q are passed alongside the node2vec method).
    walking_method="node2vec",
    walk_length=50,
    num_walks=32,
    p=10,
    q=0.1,
    # Embedding options.
    vector_size=100,
    window=5,
    num_procs=16,
)

In [None]:
# Train the model built in the previous cell.
# NOTE(review): presumably this writes the Word2Vec model to the output path
# given to the constructor, since the next cell loads from that path — confirm.
model.train()

In [None]:
# Load the Word2Vec model from the output path given to the model constructor
# and keep a handle on its word vectors.
word2VecModel = Word2Vec.load("data/dl2vec_node2vec")
vectors = word2VecModel.wv

# Map every model entity whose name starts with "4932" to its trained vector.
# NOTE(review): "4932" is presumably the yeast taxon prefix used for protein
# identifiers — confirm against the dataset.
dl2vec_node2vec_embeddings = {
    entity: vectors[entity]
    for entity in model.entities
    if entity.startswith("4932")
}

# Persist the selected embeddings as a pickle for later reuse.
with open("data/protDl2vecN_emb", "wb") as out_handle:
    pkl.dump(dl2vec_node2vec_embeddings, out_handle)

In [None]:
# Restrict the node2vec embeddings to the identifiers that also appear in
# ec_numbers, keeping the iteration order of ec_numbers.
ec_dict = {
    prot: dl2vec_node2vec_embeddings[prot]
    for prot in ec_numbers
    if prot in dl2vec_node2vec_embeddings
}

# Stack the vectors into a (num_proteins, vector_size) float32 matrix and
# remember which identifier each row belongs to.
size = model.vector_size
embeds = np.zeros((len(ec_dict), size), dtype=np.float32)
nodemap = {}
for row, (prot, vec) in enumerate(ec_dict.items()):
    embeds[row, :] = vec
    nodemap[row] = prot


# Project the embedding matrix down to two dimensions with t-SNE.
XN = TSNE(n_components=2, verbose=1, n_iter=5000, n_jobs=8).fit_transform(embeds)

In [None]:
# Group the 2-D points by the part of the EC string before the first '.'.
# Each entry holds [x_coordinates, y_coordinates]. The '0' entry is created
# up front and is never drawn by the plotting loop below.
classes = {'0': [[], []]}
for row, prot in nodemap.items():
    if prot not in ec_numbers:
        continue
    top_level = ec_numbers[prot].split('.')[0]
    xs, ys = classes.setdefault(top_level, [[], []])
    xs.append(XN[row, 0])
    ys.append(XN[row, 1])

# One rainbow colour is generated per entry in `classes`; colours are drawn
# from the iterator only for the classes that are actually plotted.
colors = iter(plt.cm.rainbow(np.linspace(0, 1, len(classes))))
fig, ax = plt.subplots(figsize=(20, 20))

for label, (xs, ys) in classes.items():
    if label != '0':
        ax.scatter(xs, ys, color=next(colors), label=label)

ax.legend()
ax.grid(True)
plt.savefig('data/fig_dl2vec_node2vec.jpg')

plt.show()