## Loading the graphs
We need to load the graphs and redo the training/test split exactly as we did when generating the embeddings, in order to retrieve the labels for the embeddings.

In [2]:
!pip install ensmallen_graph==0.3.6

Collecting ensmallen_graph==0.3.6
  Using cached https://files.pythonhosted.org/packages/99/52/8d632ec5f850824a4e9c062bcb899062396831434319ac548681fa753561/ensmallen_graph-0.3.6-cp37-cp37m-macosx_10_7_x86_64.whl
Collecting toml~=0.10.0 (from ensmallen_graph==0.3.6)
  Using cached https://files.pythonhosted.org/packages/9f/e1/1b40b80f2e1663a6b9f497123c11d7d988c0919abbf3c3f2688e448c5363/toml-0.10.1-py2.py3-none-any.whl
Installing collected packages: toml, ensmallen-graph
Successfully installed ensmallen-graph-0.3.6 toml-0.10.1
[33mYou are using pip version 19.0.3, however version 20.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
# Local directory into which the knowledge-graph archive is downloaded and unpacked.
graph_data_dir = "data"

In [5]:
# Get the graphs from Zenodo. This Zenodo upload also contains embeddings from an unrelated experiment.
# We are using a different (better) set of embeddings, so the embeddings bundled in this archive are
# not used; the ones for this notebook are downloaded separately in "Loading the embeddings".

import os
import shutil
import subprocess
import urllib.request  # a bare `import urllib` does not guarantee the `request` submodule is loaded

graph_archive_url = "https://zenodo.org/record/4011267/files/kg-covid-19-skipgram-aug-2020.tar.gz"
graph_archive_path = graph_data_dir + "/kg-covid-19-skipgram-aug-2020.tar.gz"

os.makedirs(graph_data_dir, exist_ok=True)
if not os.path.exists(graph_archive_path):
    # Stream into a temporary ".part" file and rename only on success, so that an interrupted
    # download is never mistaken for a complete archive by the existence check above.
    partial_path = graph_archive_path + ".part"
    with urllib.request.urlopen(graph_archive_url) as response, \
            open(partial_path, 'wb') as out_file:
        # Copies chunk by chunk instead of holding the whole archive in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_path, graph_archive_path)

# Unpack next to the archive. An argument list avoids shell parsing of the paths.
# The return code is the value displayed by the cell (0 on success).
subprocess.call(["tar", "-xvzf", graph_archive_path, "-C", graph_data_dir])

0

In [6]:
%%time
from ensmallen_graph import EnsmallenGraph

graph = EnsmallenGraph.from_csv(
    edge_path= graph_data_dir + "/merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False,
    default_edge_type="biolink:association",
    node_path= graph_data_dir + "/merged-kg_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_edges=True,
    ignore_duplicated_nodes=True,
    force_conversion_to_undirected=True
)

In [7]:
# Summary statistics of the full graph (node/edge counts, degree statistics, density, ...).
graph.report()

{'singleton_nodes': '8223',
 'selfloops_rate': '0.000015391581103247148',
 'nodes_number': '375365',
 'bidirectional_rate': '1',
 'strongly_connected_components_number': '8976',
 'is_directed': 'false',
 'degrees_median': '6',
 'edges_number': '30861027',
 'degrees_mean': '82.21604837957722',
 'unique_edge_types_number': '0',
 'traps_rate': '0.021906677500566116',
 'connected_components_number': '8976',
 'degrees_max': '90378',
 'degrees_min': '0',
 'degrees_mode': '1',
 'density': '0.00021902960686152735',
 'unique_node_types_number': '36'}

In [8]:
# Split the edges of the full graph into a training graph and a validation graph.
# The reports of the two resulting graphs show ~80% of the edges in `training`,
# which matches the 0.8 argument.
# NOTE(review): 42 is presumably the random seed of the split — confirm against the
# ensmallen_graph API; it must equal the value used when the embeddings were generated.
training, validation = graph.connected_holdout(42, 0.8)

In [9]:
# Summary statistics of the training split.
training.report()

{'edges_number': '24688822',
 'traps_rate': '0.021933318236916067',
 'connected_components_number': '8976',
 'unique_edge_types_number': '0',
 'degrees_mean': '65.77283976929122',
 'singleton_nodes': '8233',
 'strongly_connected_components_number': '8976',
 'degrees_min': '0',
 'nodes_number': '375365',
 'degrees_median': '5',
 'degrees_max': '71988',
 'bidirectional_rate': '1',
 'selfloops_rate': '0.000014743514291609376',
 'density': '0.0001752236883281372',
 'degrees_mode': '1',
 'is_directed': 'false',
 'unique_node_types_number': '36'}

In [10]:
# Summary statistics of the validation split.
validation.report()

{'degrees_max': '18390',
 'unique_node_types_number': '36',
 'singleton_nodes': '158280',
 'degrees_min': '0',
 'connected_components_number': '162705',
 'edges_number': '6172205',
 'nodes_number': '375365',
 'degrees_mode': '0',
 'is_directed': 'false',
 'degrees_median': '1',
 'traps_rate': '0.42166957494705154',
 'selfloops_rate': '0.00001798384855979346',
 'strongly_connected_components_number': '162705',
 'unique_edge_types_number': '0',
 'degrees_mean': '16.44320861028599',
 'bidirectional_rate': '1',
 'density': '0.000043805918533390134'}

The following checks are not strictly necessary, but are offered as sanity checks:

In [11]:
# Optional sanity checks on the training/validation holdout.

# The full graph must compare greater than each split
# (presumably a strict-superset test — confirm against the ensmallen_graph API).
for split in (training, validation):
    assert graph > split

# The union of the two splits and the full graph must contain each other.
assert (training + validation).contains(graph)
assert graph.contains(training + validation)

# The two splits must not overlap, checked in both directions.
assert not (training.overlaps(validation) or validation.overlaps(training))

## Loading the embeddings

In [None]:
# Download the SkipGram node embeddings used in this notebook from Zenodo:
# https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy?download=1
import os
import shutil
import urllib.request

embedding_url = "https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy"
embedding_dir = "link_prediction_experiment_embeddings"
embedding_file = os.path.join(embedding_dir, "SkipGram_embedding.npy")
os.makedirs(embedding_dir, exist_ok=True)

# Skip the download when the file is already present, so re-running the notebook is cheap.
if not os.path.exists(embedding_file):
    # Stream into a ".part" file and rename on success so that an interrupted download
    # does not leave a truncated .npy file under the final name.
    partial_path = embedding_file + ".part"
    with urllib.request.urlopen(embedding_url) as response, \
            open(partial_path, 'wb') as out_file:
        # Copies chunk by chunk instead of holding the whole file in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_path, embedding_file)

In [None]:
import numpy as np
# Path of the embedding file written by the download cell (restated here as a literal).
embedding_file = "link_prediction_experiment_embeddings/SkipGram_embedding.npy"
# Load the saved embedding array; later cells assert it has one entry per node of `training`.
embeddings = np.load(embedding_file)

In [None]:
# Node names as a plain Python list, so that `.index()` can be used to look up a node's position.
# NOTE(review): assumes position i in this list corresponds to row i of `embeddings` — confirm.
node_names = list(np.array(training.nodes_reverse_mapping))

In [None]:
# There must be exactly one embedding per node name.
assert len(training.nodes_reverse_mapping) == len(embeddings)

In [None]:
# There must be exactly one node-type entry per embedding.
assert len(training.node_types) == len(embeddings)

In [None]:
# Node name under which SARS-CoV-2 is looked up in the graph (a ChEMBL target identifier).
sars_cov_2_name = 'CHEMBL.TARGET:CHEMBL4303835'

In [None]:
# Position of the SARS-CoV-2 node in `node_names`; raises ValueError if the name is absent.
sars_cov_2_idx = node_names.index(sars_cov_2_name)

In [None]:
# Positions and names of every node whose type id equals that of 'biolink:Drug'.
drug_type_id = training.node_types_reverse_mapping.index('biolink:Drug')
drug_mask = training.node_types == drug_type_id
drug_idx = list(np.where(drug_mask)[0])
drug_names = [node_names[node_position] for node_position in drug_idx]

In [None]:
# Positions and names of every node whose type id equals that of 'biolink:ChemicalSubstance'.
chem_substance_type_id = training.node_types_reverse_mapping.index('biolink:ChemicalSubstance')
chem_substance_mask = training.node_types == chem_substance_type_id
chem_substance_idx = list(np.where(chem_substance_mask)[0])
chem_substance_names = [node_names[node_position] for node_position in chem_substance_idx]

In [None]:
import re  # was missing: the original cell raised NameError on `re.compile`

# Select every node whose name matches the ChEMBL compound pattern.
# NOTE: the value is used as a regular expression and searched anywhere in the node name
# (not only at its start), and its '.' is an unescaped wildcard. Both are kept unchanged
# so that the set of selected nodes stays the same.
chembl_prefix = 'CHEMBL.COMPOUND'
chembl_pattern = re.compile(chembl_prefix)  # compiled once instead of once per node

# A single pass over the names; the names are derived from the indices so the two lists line up.
chembl_idx = [index for index, name in enumerate(node_names) if chembl_pattern.search(name)]
chembl_names = [node_names[index] for index in chembl_idx]

## Loading trained MLP models

In [None]:
import os
import shutil
import urllib.request

from tqdm import tqdm

# Download the trained MLP link-prediction models (one .h5 file per edge-embedding method) from Zenodo.
model_dir = "link_prediction_experiment_models"
os.makedirs(model_dir, exist_ok=True)

h5_files = ["SkipGram_weightedL2_finalized_model.h5", "SkipGram_weightedL1_finalized_model.h5", \
        "SkipGram_hadamard_finalized_model.h5", "SkipGram_average_finalized_model.h5"]
base_url = "https://zenodo.org/record/4031401/files/"

for file in tqdm(h5_files):
    model_path = os.path.join(model_dir, file)
    # Skip files that are already present so re-running the notebook is cheap.
    if os.path.exists(model_path):
        continue
    # Stream into a ".part" file and rename on success so that an interrupted download
    # does not leave a truncated model under the final name.
    partial_path = model_path + ".part"
    with urllib.request.urlopen(base_url + file) as response, \
            open(partial_path, 'wb') as out_file:
        # Copies chunk by chunk instead of holding the whole file in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_path, model_path)

In [None]:
import tensorflow as tf

# `mlp_model` is a (method name, Keras model) pair: element 0 is later passed to
# GraphTransformer as the edge-embedding method, element 1 is the model loaded from disk.
edge_method_name = "average"
average_keras_model = tf.keras.models.load_model(
    'link_prediction_experiment_models/SkipGram_average_finalized_model.h5'
)
mlp_model = (edge_method_name, average_keras_model)

In [None]:
# Embedding entry at the SARS-CoV-2 node's position.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

In [None]:
from embiggen import GraphTransformer, EdgeTransformer

# The first element of `mlp_model` names the edge-embedding method; it must be one
# that EdgeTransformer knows about.
edge_embedding_method = mlp_model[0]
assert edge_embedding_method in EdgeTransformer.methods

# Fit the transformer on the node embeddings, then turn every edge of the training graph
# into an edge embedding.
transformer = GraphTransformer(edge_embedding_method)
transformer.fit(embeddings)
train_edges = transformer.transform(training)

# One edge embedding per training edge.
assert len(train_edges) == training.get_edges_number()

In [None]:
import numpy as np

# let's try to predict a link that should exist in training graph
# example SARS-CoV-2 -> ChEMBL compound edge (which should be positive)
# NOTE(review): chembl_idx[0] is simply the first ChEMBL compound node; this cell does not
# check that it is actually linked to SARS-CoV-2 in the training graph — confirm.
example_chembl_edge = train_edges[training.get_edge_id(sars_cov_2_idx, chembl_idx[0])]

# Keras `predict` treats the first axis of its input as the batch axis, so a single edge
# embedding has to be passed as a batch of one. np.atleast_2d turns a 1-D vector into
# shape (1, d) and leaves an already 2-D input unchanged.
# (The bare `.shape` / `.__class__` expressions were removed: they were evaluated and
# discarded, since a cell only displays its last expression.)
mlp_model[1].predict(np.atleast_2d(example_chembl_edge))