## Loading the graphs
We load the graph as a weighted undirected graph.

To load the graph we use [Ensmallen](https://github.com/LucaCappelletti94/ensmallen_graph), the sister package of Embiggen. Ensmallen is a Rust library with Python bindings that handles the processing of graph files and the preprocessing of data, so that embedding models can be trained quickly.

In [145]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

In [146]:
# Local directory where the knowledge-graph archive is downloaded and unpacked.
graph_data_dir = "link_prediction_experiment_graph"

In [147]:
# Get the graphs: fetch the KG-COVID-19 archive from Zenodo (only when it is
# not already on disk) and unpack it into `graph_data_dir`.

import os
import shutil
import subprocess
import urllib.request  # `import urllib` alone does not guarantee that `urllib.request` is loaded

graph_archive_name = "kg-covid-19-skipgram-aug-2020.tar.gz"
graph_archive_url = "https://zenodo.org/record/4011267/files/" + graph_archive_name
graph_archive_path = os.path.join(graph_data_dir, graph_archive_name)

os.makedirs(graph_data_dir, exist_ok=True)
if not os.path.exists(graph_archive_path):
    # Download under a temporary name and rename at the end, so that an
    # interrupted transfer is never mistaken for a complete archive on re-run.
    graph_archive_partial = graph_archive_path + ".part"
    with urllib.request.urlopen(graph_archive_url) as response, \
            open(graph_archive_partial, 'wb') as out_file:
        # Stream in chunks instead of holding the whole archive in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(graph_archive_partial, graph_archive_path)

# An argument list avoids building a shell command by string concatenation,
# and check=True raises CalledProcessError if tar fails instead of silently
# returning a non-zero status. The cell still displays the exit status (0).
subprocess.run(
    ["tar", "-xvzf", graph_archive_path, "-C", graph_data_dir],
    check=True,
).returncode

0

In [148]:
from ensmallen_graph import EnsmallenGraph

# Build the undirected knowledge graph from the edge and node TSV files
# found in `graph_data_dir`.
graph = EnsmallenGraph.from_csv(
    # --- edge list ---
    edge_path=f"{graph_data_dir}/merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    default_edge_type="biolink:association",
    ignore_duplicated_edges=True,
    # --- node list ---
    node_path=f"{graph_data_dir}/merged-kg_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_nodes=True,
    # --- directedness ---
    directed=False,
    force_conversion_to_undirected=True,
)

In [149]:
# Summary statistics of the full graph (node and edge counts, degree statistics, connected components).
graph.report()

{'degrees_mode': '1',
 'singleton_nodes': '8223',
 'nodes_number': '375365',
 'strongly_connected_components_number': '8976',
 'unique_node_types_number': '36',
 'degrees_max': '90378',
 'degrees_median': '6',
 'bidirectional_rate': '1',
 'unique_edge_types_number': '0',
 'traps_rate': '0.021906677500566116',
 'connected_components_number': '8976',
 'degrees_mean': '82.21604837957722',
 'edges_number': '30861027',
 'density': '0.00021902960686152735',
 'selfloops_rate': '0.000015391581103247148',
 'is_directed': 'false',
 'degrees_min': '0'}

In [150]:
# Split the edges of the full graph into a training graph and a validation graph.
# 0.8 is the training fraction: the reports below show ~80% of the edges in
# `training` and ~20% in `validation`.
# 42 is presumably the random seed of the split — TODO confirm against the Ensmallen docs.
# NOTE(review): the reports show the same number of connected components (8976)
# for the full graph and for `training`, which is presumably what "connected"
# in the method name refers to — confirm.
training, validation = graph.connected_holdout(42, 0.8)

In [151]:
# Statistics of the training split: same node count as the full graph, roughly 80% of its edges.
training.report()

{'strongly_connected_components_number': '8976',
 'nodes_number': '375365',
 'traps_rate': '0.021933318236916067',
 'degrees_median': '5',
 'edges_number': '24688822',
 'degrees_min': '0',
 'degrees_max': '71988',
 'singleton_nodes': '8233',
 'unique_node_types_number': '36',
 'connected_components_number': '8976',
 'degrees_mean': '65.77283976929122',
 'density': '0.0001752236883281372',
 'selfloops_rate': '0.000014743514291609376',
 'bidirectional_rate': '1',
 'degrees_mode': '1',
 'is_directed': 'false',
 'unique_edge_types_number': '0'}

In [152]:
# Statistics of the validation split: it keeps every node but only ~20% of the
# edges, so a large share of its nodes are singletons.
validation.report()

{'strongly_connected_components_number': '162705',
 'selfloops_rate': '0.00001798384855979346',
 'connected_components_number': '162705',
 'unique_edge_types_number': '0',
 'degrees_max': '18390',
 'degrees_median': '1',
 'is_directed': 'false',
 'degrees_mode': '0',
 'traps_rate': '0.42166957494705154',
 'degrees_min': '0',
 'unique_node_types_number': '36',
 'nodes_number': '375365',
 'degrees_mean': '16.44320861028599',
 'singleton_nodes': '158280',
 'edges_number': '6172205',
 'bidirectional_rate': '1',
 'density': '0.000043805918533390134'}

The following checks are not strictly necessary, but are offered as sanity checks:

In [153]:
# The operators and methods used below are provided by the Ensmallen graph type.
# NOTE(review): `>` presumably means "strictly contains" — confirm against the Ensmallen docs.

# Each split is expected to be strictly smaller than the full graph ...
assert graph > training
assert graph > validation
# ... the two splits added together are expected to match the full graph
# (containment is checked in both directions) ...
assert (training + validation).contains(graph)
assert graph.contains(training + validation)
# ... and the two splits are expected not to overlap (checked in both directions).
assert not training.overlaps(validation)
assert not validation.overlaps(training)

## Loading the embeddings

In [160]:
# Download the SkipGram node embeddings from Zenodo, unless a copy is already on disk.
# https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy?download=1
import os
import shutil
import urllib.request  # `import urllib` alone does not guarantee that `urllib.request` is loaded

embedding_dir = "link_prediction_experiment_embeddings"
embedding_file = os.path.join(embedding_dir, "SkipGram_embedding.npy")
embedding_url = "https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy"
os.makedirs(embedding_dir, exist_ok=True)

# Skip the download when the file is already present, like the graph download does.
if not os.path.exists(embedding_file):
    # Write to a temporary name and rename at the end, so an interrupted
    # transfer never leaves a truncated file under the final name.
    embedding_partial = embedding_file + ".part"
    with urllib.request.urlopen(embedding_url) as response, \
            open(embedding_partial, 'wb') as out_file:
        # Stream in chunks instead of loading the whole file into memory.
        shutil.copyfileobj(response, out_file)
    os.replace(embedding_partial, embedding_file)

In [161]:
import os

import numpy as np

# Same file the download cell writes; the path is derived from `embedding_dir`
# rather than repeated as a literal, so the two cells cannot drift apart.
embedding_file = os.path.join(embedding_dir, "SkipGram_embedding.npy")
# NOTE(review): presumably one row per node, in the graph's node order — the
# asserts in the following cells only verify the number of rows.
embeddings = np.load(embedding_file)

In [162]:
# Node names as a plain list, so that a node's position can be found with
# `list.index` and positions can be mapped back to names.
# NOTE(review): assumes entry i of `nodes_reverse_mapping` is the name of node i — confirm.
node_names = list(np.array(training.nodes_reverse_mapping))

In [163]:
# Sanity check: there are as many embedding rows as node names in the graph.
assert len(training.nodes_reverse_mapping) == len(embeddings)

In [164]:
# Sanity check: there are as many node-type entries as embedding rows.
assert len(training.node_types) == len(embeddings)

In [165]:
# Identifier of the node used here for SARS-CoV-2 (an ID in the CHEMBL.TARGET namespace).
sars_cov_2_name = 'CHEMBL.TARGET:CHEMBL4303835'

In [166]:
# Position of the SARS-CoV-2 node in `node_names`; `list.index` raises
# ValueError if the node is not present.
sars_cov_2_idx = node_names.index(sars_cov_2_name)

In [167]:
# Position of 'biolink:Drug' in the node-type vocabulary; this is the value
# that `node_types` is compared against.
drug_type_id = training.node_types_reverse_mapping.index('biolink:Drug')

# Positions of every node carrying that type, followed by the matching names.
drug_idx = list(np.where(training.node_types == drug_type_id)[0])
drug_names = [node_names[position] for position in drug_idx]

In [168]:
# Position of 'biolink:ChemicalSubstance' in the node-type vocabulary; this is
# the value that `node_types` is compared against.
chem_substance_type_id = training.node_types_reverse_mapping.index('biolink:ChemicalSubstance')

# Positions of every node carrying that type, followed by the matching names.
chem_substance_idx = list(np.where(training.node_types == chem_substance_type_id)[0])
chem_substance_names = [node_names[position] for position in chem_substance_idx]

In [169]:
# Select the ChEMBL compound nodes by the prefix of their identifier.
# A literal `str.startswith` test is used instead of a regex search: the value
# is a prefix, and as a regex its '.' would match any character and the search
# would match anywhere in the name. This also avoids depending on `re`.
chembl_prefix = 'CHEMBL.COMPOUND'
# Positions are collected in a single pass; names are derived from them so the
# two lists are aligned by construction.
chembl_idx = [index for index, name in enumerate(node_names) if name.startswith(chembl_prefix)]
chembl_names = [node_names[index] for index in chembl_idx]

## Loading trained MLP models

In [170]:
import os
import shutil
import urllib.request  # `import urllib` alone does not guarantee that `urllib.request` is loaded

from tqdm import tqdm

# Download the trained MLP link-prediction models (one per edge-embedding
# method) from Zenodo into `model_dir`, skipping files already on disk.
model_dir = "link_prediction_experiment_models"
os.makedirs(model_dir, exist_ok=True)

h5_files = [
    "SkipGram_weightedL2_finalized_model.h5",
    "SkipGram_weightedL1_finalized_model.h5",
    "SkipGram_hadamard_finalized_model.h5",
    "SkipGram_average_finalized_model.h5",
]
base_url = "https://zenodo.org/record/4031401/files/"

for file in tqdm(h5_files):
    model_path = os.path.join(model_dir, file)
    if os.path.exists(model_path):
        continue  # already downloaded on a previous run
    # Write to a temporary name and rename at the end, so an interrupted
    # transfer never leaves a truncated model under the final name.
    model_partial = model_path + ".part"
    with urllib.request.urlopen(base_url + file) as response, \
            open(model_partial, 'wb') as out_file:
        # Stream in chunks instead of loading the whole file into memory.
        shutil.copyfileobj(response, out_file)
    os.replace(model_partial, model_path)

100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


In [171]:
import os

import tensorflow as tf

# Pair of (edge-embedding method name, trained Keras model). The model file is
# located through `model_dir`, the directory the download cell writes to,
# instead of repeating that directory as a literal.
mlp_model = (
    "average",
    tf.keras.models.load_model(os.path.join(model_dir, "SkipGram_average_finalized_model.h5")),
)

In [172]:
# Embedding of the SARS-CoV-2 node: the entry of `embeddings` at that node's position.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

In [173]:
from embiggen import GraphTransformer, EdgeTransformer

# The label stored alongside the model names the edge-embedding method; it
# must be one of the methods EdgeTransformer supports.
edge_method = mlp_model[0]
assert edge_method in EdgeTransformer.methods

# Fit the transformer on the node embeddings, then turn every edge of the
# training graph into an edge embedding using that method.
transformer = GraphTransformer(edge_method)
transformer.fit(embeddings)
train_edges = transformer.transform(training)

# There must be one edge embedding per edge of the training graph.
assert training.get_edges_number() == len(train_edges)

In [193]:
# Let's try to predict a link that should exist in the training graph:
# an example SARS-CoV-2 -> ChEMBL compound edge (which should be positive).
# NOTE(review): `chembl_idx[0]` is simply the first ChEMBL compound in
# `node_names`; nothing in this cell checks that an edge between it and the
# SARS-CoV-2 node is actually present in `training` — confirm how
# `get_edge_id` behaves for a pair that has no edge.
example_chembl_edge = train_edges[training.get_edge_id(sars_cov_2_idx, chembl_idx[0])]
example_chembl_edge.shape  # not displayed: a cell only shows its last expression
example_chembl_edge.__class__
# TODO: the prediction call itself is still disabled.
#mlp_model[1].predict(example_chembl_edge)

numpy.ndarray