## Loading the graphs
We load the graph as a weighted undirected graph.

To load the graph we use [Ensmallen](https://github.com/LucaCappelletti94/ensmallen_graph), the sister package of Embiggen. Ensmallen is a Rust library with Python bindings that handles the processing of graph files and the preprocessing of data for quickly training embedding models.

In [1]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

In [2]:
# Directory where the graph archive is downloaded and unpacked, and from which the edge/node lists are read.
graph_data_dir = "link_prediction_experiment_graph"

In [3]:
# Get the graphs: download the archive from Zenodo (only once) and unpack it into graph_data_dir.

import os
import shutil
import tarfile
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

graph_archive_url = "https://zenodo.org/record/4011267/files/kg-covid-19-skipgram-aug-2020.tar.gz"
graph_archive_path = os.path.join(graph_data_dir, "kg-covid-19-skipgram-aug-2020.tar.gz")

os.makedirs(graph_data_dir, exist_ok=True)
if not os.path.exists(graph_archive_path):
    # Stream to a temporary name and rename only once the transfer has finished,
    # so an interrupted download is never mistaken for a complete archive on the
    # next run. Streaming also avoids holding the whole archive in memory.
    graph_archive_partial_path = graph_archive_path + ".part"
    with urllib.request.urlopen(graph_archive_url) as response, \
            open(graph_archive_partial_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    os.replace(graph_archive_partial_path, graph_archive_path)

# Unpack with the standard library instead of shelling out to `tar`: this works
# on platforms without a `tar` binary, needs no shell quoting of the paths, and
# raises on failure instead of returning an exit code that nobody checks.
with tarfile.open(graph_archive_path, "r:gz") as graph_archive:
    graph_archive.extractall(graph_data_dir)

0

In [4]:
%%time
from ensmallen_graph import EnsmallenGraph

graph = EnsmallenGraph.from_csv(
    edge_path= graph_data_dir + "/merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False,
    node_path= graph_data_dir + "/merged-kg_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_edges=True,
    ignore_duplicated_nodes=True,
    force_conversion_to_undirected=True
)

In [5]:
# Summary statistics of the full graph (node/edge counts, degree statistics, connected components, ...).
graph.report()

{'edges_number': '30861027',
 'is_directed': 'false',
 'unique_edge_types_number': '0',
 'traps_rate': '0.021906677500566116',
 'connected_components_number': '8976',
 'degrees_mean': '82.21604837957722',
 'degrees_mode': '1',
 'degrees_min': '0',
 'degrees_median': '6',
 'selfloops_rate': '0.000015391581103247148',
 'singleton_nodes': '8223',
 'density': '0.00021902960686152735',
 'strongly_connected_components_number': '8976',
 'unique_node_types_number': '36',
 'degrees_max': '90378',
 'nodes_number': '375365',
 'bidirectional_rate': '1'}

In [7]:
# Split the edges of the full graph into a training graph and a validation graph.
# NOTE(review): 42 is presumably the random seed and 0.8 the fraction of edges kept for training;
# the reports below show training holding 24688822 of the 30861027 edges and the same number of
# connected components (8976) as the full graph. Confirm the argument meanings against the Ensmallen docs.
training, validation = graph.connected_holdout(42, 0.8)

In [8]:
# Summary statistics of the training graph, for comparison with the full graph above.
training.report()

{'unique_edge_types_number': '0',
 'degrees_mean': '65.77283976929122',
 'is_directed': 'false',
 'selfloops_rate': '0.000014743514291609376',
 'unique_node_types_number': '36',
 'nodes_number': '375365',
 'degrees_min': '0',
 'edges_number': '24688822',
 'bidirectional_rate': '1',
 'traps_rate': '0.021933318236916067',
 'connected_components_number': '8976',
 'degrees_median': '5',
 'density': '0.0001752236883281372',
 'degrees_max': '71988',
 'singleton_nodes': '8233',
 'degrees_mode': '1',
 'strongly_connected_components_number': '8976'}

In [9]:
# Summary statistics of the validation graph, for comparison with the full graph above.
validation.report()

{'traps_rate': '0.42166957494705154',
 'density': '0.000043805918533390134',
 'unique_edge_types_number': '0',
 'edges_number': '6172205',
 'bidirectional_rate': '1',
 'degrees_mode': '0',
 'connected_components_number': '162705',
 'nodes_number': '375365',
 'degrees_max': '18390',
 'singleton_nodes': '158280',
 'selfloops_rate': '0.00001798384855979346',
 'degrees_min': '0',
 'unique_node_types_number': '36',
 'degrees_mean': '16.44320861028599',
 'strongly_connected_components_number': '162705',
 'degrees_median': '1',
 'is_directed': 'false'}

The following checks are not strictly necessary, but are offered as sanity checks:

In [10]:
# Each holdout part must compare as "less than" the full graph
# (presumably meaning it is strictly contained in it — confirm the operator semantics in Ensmallen).
assert graph > training
assert graph > validation
# Recombining the two parts must give back the full graph; containment is checked in both directions.
assert (training + validation).contains(graph)
assert graph.contains(training + validation)
# The two parts must not overlap (presumably: share no edges); checked in both directions.
assert not training.overlaps(validation)
assert not validation.overlaps(training)

## Loading the embeddings

In [11]:
# Download the pre-trained SkipGram node embeddings from Zenodo (only once).
# Browser link: https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy?download=1
import os
import shutil
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

embedding_url = "https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy"
embedding_dir = "link_prediction_experiment_embeddings"
embedding_file = os.path.join(embedding_dir, "SkipGram_embedding.npy")
os.makedirs(embedding_dir, exist_ok=True)

# Skip the transfer when the file is already present, as the graph download does.
if not os.path.exists(embedding_file):
    # Stream to a temporary name and rename only once the transfer has finished,
    # so an interrupted download is never mistaken for a complete file, and the
    # whole array is never buffered in memory.
    embedding_partial_file = embedding_file + ".part"
    with urllib.request.urlopen(embedding_url) as response, \
            open(embedding_partial_file, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    os.replace(embedding_partial_file, embedding_file)

In [12]:
import numpy as np
# Same location the download cell writes to, restated here as a literal path.
embedding_file = "link_prediction_experiment_embeddings/SkipGram_embedding.npy"
# Node embedding array; the checks below require one entry per node of the training graph.
embeddings = np.load(embedding_file)

In [13]:
# Node names as a plain list, so that a node's position can be looked up with list.index() below.
node_names = list(np.array(training.nodes_reverse_mapping))

In [14]:
# Sanity check: there must be exactly one embedding per node name.
assert len(training.nodes_reverse_mapping) == len(embeddings)

In [15]:
# Sanity check: there must be exactly one node type entry per embedding.
assert len(training.node_types) == len(embeddings)

In [16]:
# Name of the node used to represent SARS-CoV-2 (presumably its ChEMBL target identifier — confirm).
sars_cov_2_name = 'CHEMBL.TARGET:CHEMBL4303835'

In [17]:
# Position of the SARS-CoV-2 node in node_names; list.index() raises ValueError if the name is absent.
sars_cov_2_idx = node_names.index(sars_cov_2_name)

In [18]:
# Select every node whose type is 'biolink:Drug'.
# Node types are compared by the numeric id that the type name has in node_types_reverse_mapping.
drug_type_id = training.node_types_reverse_mapping.index('biolink:Drug')
drug_type_mask = training.node_types == drug_type_id
drug_idx = list(np.where(drug_type_mask)[0])
drug_names = [node_names[node_position] for node_position in drug_idx]

In [19]:
# Select every node whose type is 'biolink:ChemicalSubstance'.
# Node types are compared by the numeric id that the type name has in node_types_reverse_mapping.
chem_substance_type_id = training.node_types_reverse_mapping.index('biolink:ChemicalSubstance')
chem_substance_type_mask = training.node_types == chem_substance_type_id
chem_substance_idx = list(np.where(chem_substance_type_mask)[0])
chem_substance_names = [node_names[node_position] for node_position in chem_substance_idx]

In [20]:
import re
chembl_prefix = 'CHEMBL.COMPOUND'
# Compile the pattern once instead of once per node name. The prefix is escaped
# because its "." is a literal dot rather than a regex wildcard, and `match`
# (anchored at the start of the string) is used because this is a prefix test.
chembl_pattern = re.compile(re.escape(chembl_prefix))
# Positions (in node_names) and names of all nodes whose name starts with the ChEMBL compound prefix.
# A single scan of node_names keeps the two lists aligned by construction.
chembl_idx = [index for index, name in enumerate(node_names) if chembl_pattern.match(name)]
chembl_names = [node_names[index] for index in chembl_idx]

## Loading trained MLP models

In [21]:
import os
import shutil
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

from tqdm.auto import tqdm

# Download the trained MLP link-prediction models (one per edge embedding method) from Zenodo.
model_dir = "link_prediction_experiment_models"
os.makedirs(model_dir, exist_ok=True)

h5_files = [
    "SkipGram_weightedL2_finalized_model.h5",
    "SkipGram_weightedL1_finalized_model.h5",
    "SkipGram_hadamard_finalized_model.h5",
    "SkipGram_average_finalized_model.h5",
]
base_url = "https://zenodo.org/record/4031401/files/"

for file in tqdm(h5_files):
    model_path = os.path.join(model_dir, file)
    # Skip files that are already present, as the graph download does.
    if os.path.exists(model_path):
        continue
    # Stream to a temporary name and rename only once the transfer has finished,
    # so an interrupted download is never mistaken for a complete model file.
    model_partial_path = model_path + ".part"
    with urllib.request.urlopen(base_url + file) as response, \
            open(model_partial_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    os.replace(model_partial_path, model_path)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [22]:
import tensorflow as tf
# Pair of (edge embedding method label, Keras model loaded from the corresponding .h5 file);
# later cells read the label as mlp_model[0] and the model as mlp_model[1].
mlp_model = ("average", tf.keras.models.load_model('link_prediction_experiment_models/SkipGram_average_finalized_model.h5'))

In [23]:
# Embedding of the SARS-CoV-2 node, looked up by the node's position in node_names.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

In [24]:
# Display the edge embedding method label stored with the loaded model.
mlp_model[0]

'average'

In [27]:
from embiggen import GraphTransformer, EdgeTransformer

# The label stored with the model must be one of the edge embedding methods embiggen provides.
assert mlp_model[0] in EdgeTransformer.methods

# Turn node embeddings into edge embeddings, using the same method label that is stored with the model.
transformer = GraphTransformer(mlp_model[0])
transformer.fit(embeddings)
train_edges = transformer.transform(training)

# There must be exactly one edge embedding per edge of the training graph.
assert len(train_edges) == training.get_edges_number()

In [28]:
# Let's try to predict a link that should exist in the training graph:
# an example SARS-CoV-2 -> ChEMBL compound edge (which should be positive).
# The edge embedding is looked up by the id the edge has in the training graph.
example_chembl_edge = train_edges[training.get_edge_id(sars_cov_2_idx, chembl_idx[0])]
# The edge embedding is reshaped into a single-row batch before being passed to predict().
example_chembl_batch = example_chembl_edge.reshape(1, -1)
print(example_chembl_edge.shape, example_chembl_batch.shape)
mlp_model[1].predict(example_chembl_batch)

(100,) (1, 100)


array([[0.93010485]], dtype=float32)

In [None]:
# Value obtained from the prediction above in a previous run: array([[0.93010485]], dtype=float32)

In [None]:
# TODO: we probably need to build a new graph with a drug -> SARS-CoV-2 edge for every member of drug_idx