## Applying classifiers to make drug predictions

Here we apply the embeddings generated in the notebook:
`Graph embedding using SkipGram 20201012 homogeneous graph training.ipynb`
md5 hash: 261f9f7b0137263728c292a1a878d7baf3f875f3

These embeddings were used to train the link classifiers in this notebook:
`Link prediction 20201012 homogeneous graph.ipynb`
md5 hash: 3a13cb16b3db2a53917f7d25e16313b0fb3d411d

kg-covid-19:
version 20201012

ensmallen-graph
Version: 0.4.4 (upgraded from 0.4.3 to get an updated version that provides `get_edge_id()` and a few other methods; otherwise 0.4.3 and 0.4.4 should behave the same)

embiggen
Version: 0.6.0

In [1]:
import os

# --- experiment identity -----------------------------------------------------
w2v = "SkipGram"
exp_name = "80_20_kg_covid_19_20201012_training_test_epoch_500_delta_0.0001"
s3_path = "s3://kg-hub-public-data/embeddings/20201012/"  # keep trailing slash

# --- local download directories ----------------------------------------------
base_dl_dir = "downloaded_data"
graph_data_dir = os.path.join(base_dl_dir, "kg-covid-19-20201012")
embedding_data_dir = os.path.join(base_dl_dir, "embeddings-20201012")
classifier_data_dir = os.path.join(base_dl_dir, "classifiers-20201012")

# --- graph stuff -------------------------------------------------------------
# Path components are passed separately so os.path.join picks the separator.
graph_out_file = os.path.join(graph_data_dir, "kg-covid-19.tar.gz")
nodes_file = os.path.join(graph_data_dir, "merged-kg_nodes.tsv")
edges_file = os.path.join(graph_data_dir, "merged-kg_edges.tsv")
sorted_edges_file = os.path.join(graph_data_dir, "merged-kg_edges_SORTED.tsv")
graph_tar_url = "https://kg-hub.berkeleybop.io/kg-covid-19/20201012/kg-covid-19.tar.gz"

# --- embeddings URLs ---------------------------------------------------------
# URLs are built by plain string concatenation: os.path.join is for filesystem
# paths and would insert backslashes on Windows.
base_kghub_url = "http://kg-hub.berkeleybop.io/"  # keep trailing slash
embeddings_base_url = f"{base_kghub_url}embeddings/20201012/"
embeddings_url = f"{embeddings_base_url}{w2v}_{exp_name}_embedding.npy"
embedding_file = os.path.join(embedding_data_dir, "SkipGram_embedding.npy")

# --- classifier URLs ---------------------------------------------------------
# Remote name pattern: <w2v>_<exp_name>_<edge method>_finalized_model.h5
classifier_base_url = f"{embeddings_base_url}{w2v}_{exp_name}_"
classifier_edge_models_to_use = 'average'
classifier_edge_models = ['average', 'hadamard', 'weightedL1', 'weightedL2']
classifier_suffix = '_finalized_model.h5'
classifier_urls = [f"{classifier_base_url}{m}{classifier_suffix}" for m in classifier_edge_models]
# Local file names mirror the remote ones (including the "_" before the edge
# method), so a downloaded model can be matched to its URL by name.
classifier_files = [
    os.path.join(classifier_data_dir, f"{w2v}_{exp_name}_{m}{classifier_suffix}")
    for m in classifier_edge_models
]

# --- params ------------------------------------------------------------------
seed = 42
train_percentage = 0.8
patience = 5

In [2]:
from pkg_resources import get_distribution

# Exact package versions this notebook expects; the cell fails on the first mismatch.
# ensmallen-graph was upgraded from 0.4.3 to a version that has get_edge_id()
# and other methods.
for package_name, expected_version in (
    ("ensmallen-graph", '0.4.4'),
    ("embiggen", '0.6.0'),
    ("tensorflow", '2.3.0'),
):
    assert get_distribution(package_name).version == expected_version

In [3]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

## Retrieve and load graph

In [4]:
# get the graphs, if necessary

import os
import shutil
import tarfile
import urllib.request  # `import urllib` alone does not load the `request` submodule

os.makedirs(graph_data_dir, exist_ok=True)

if not os.path.exists(nodes_file) or not os.path.exists(edges_file):
    # Stream the archive to disk in chunks instead of reading it all into memory.
    with urllib.request.urlopen(graph_tar_url) as response, \
            open(graph_out_file, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    # Extract with the standard library rather than shelling out to `tar`:
    # no string-built shell command, and a failed extraction raises instead of
    # being silently ignored.
    # NOTE(review): extractall() trusts the member paths in the archive; this is
    # acceptable only because the archive comes from the project's own server.
    with tarfile.open(graph_out_file, mode="r:gz") as archive:
        archive.extractall(path=graph_data_dir)

In [5]:
%%time
from ensmallen_graph import EnsmallenGraph

# Only needed once: when the sorted edge file is missing, load the graph from
# the raw node/edge TSVs and dump its edges to `sorted_edges_file`.
if not os.path.exists(sorted_edges_file):
    # Column names shared by the loader and the dumper.
    edge_column_names = dict(sources_column="subject", destinations_column="object")

    graph = EnsmallenGraph.from_unsorted_csv(
        edge_path=edges_file,
        directed=False,
        node_path=nodes_file,
        nodes_column='id',
        node_types_column='category',
        default_node_type='biolink:NamedThing',
        **edge_column_names
    )

    graph.dump_edges(sorted_edges_file, **edge_column_names)

CPU times: user 1.91 ms, sys: 2.07 ms, total: 3.98 ms
Wall time: 6.92 ms


In [6]:
from ensmallen_graph import EnsmallenGraph

# Load the graph from the sorted edge list written by the previous cell
# (or by an earlier run), together with the node list.
graph = EnsmallenGraph.from_sorted_csv(
    edge_path=sorted_edges_file,
    node_path=nodes_file,
    directed=False,
    sources_column="subject",
    destinations_column="object",
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing',
    # Both counts should be equal to or greater than the actual numbers.
    nodes_number=500_000,
    edges_number=42_949_369
)

# Last expression of the cell, so the report dict is displayed.
graph.report()

{'self_loops_number': '480',
 'directed': 'false',
 'has_edge_types': 'false',
 'self_loops_rate': '0.000015373068830289457',
 'degree_mean': '69.73158748096104',
 'unique_node_types_number': '42',
 'edges_number': '31223434',
 'has_weights': 'false',
 'has_node_types': 'true',
 'singletons': '8355',
 'density': '0.010673045865565448',
 'unique_edge_types_number': '0',
 'nodes_number': '447766'}

## Make same holdouts as before

In [7]:
%%time
# Split the graph into training / validation edge sets using the same
# train fraction and seed as configured at the top of the notebook.
pos_training, pos_validation = graph.connected_holdout(
    random_state=seed,
    train_size=train_percentage,
)

CPU times: user 40.5 s, sys: 350 ms, total: 40.9 s
Wall time: 40.7 s


The following checks are not strictly necessary, but are offered as sanity checks:

In [8]:
%%time
# Optional sanity checks on the holdout split. They are slow (see the timing
# below), so they can be switched off by setting the flag to False.
coherence_check = True
if coherence_check:
    # Both parts must be sub-graphs of the full graph.
    assert graph.contains(pos_training), "training graph is not contained in the full graph"
    assert graph.contains(pos_validation), "validation graph is not contained in the full graph"

    # The union is needed by two checks; build it once instead of once per assert.
    holdout_union = pos_training | pos_validation
    # Containment in both directions: the two parts together are exactly the full graph.
    assert holdout_union.contains(graph), "training + validation do not cover the full graph"
    assert graph.contains(holdout_union), "training + validation contain edges missing from the full graph"

    # The two parts must not share anything (checked in both directions).
    assert not pos_training.overlaps(pos_validation), "training overlaps validation"
    assert not pos_validation.overlaps(pos_training), "validation overlaps training"

CPU times: user 21min 11s, sys: 5.91 s, total: 21min 17s
Wall time: 4min 30s


## Loading the embeddings

In [9]:
import shutil
import urllib.request  # `import urllib` alone does not load the `request` submodule

os.makedirs(embedding_data_dir, exist_ok=True)

if not os.path.exists(embedding_file):
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted download would leave a truncated file that the existence
    # check above treats as a complete embedding on the next run.
    partial_file = embedding_file + ".part"
    with urllib.request.urlopen(embeddings_url) as response, \
            open(partial_file, 'wb') as out_file:
        # Stream in chunks instead of holding the whole file in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_file, embedding_file)

In [10]:
import numpy as np

# Read the downloaded node embeddings from the local .npy file.
embeddings = np.load(file=embedding_file)

#### More coherence checks

In [11]:
# The embeddings must have one entry per node name and per node type
# of the training graph.
embedding_count = len(embeddings)
assert embedding_count == len(pos_training.get_nodes_reverse_mapping())
assert embedding_count == len(pos_training.get_node_types())

In [12]:
# Node identifiers (CURIEs) as a plain list, so `.index()` lookups work.
# NOTE(review): presumably position i holds the name of node id i — confirm.
node_name_array = np.array(pos_training.get_nodes_reverse_mapping())
node_curies = list(node_name_array)

In [13]:
# Position of the SARS-CoV-2 node in `node_curies`;
# `.index()` raises ValueError if the CURIE is not present.
sars_cov_2_curie = "NCBITaxon:2697049"
sars_cov_2_idx = node_curies.index(sars_cov_2_curie)

In [14]:
# Numeric code of the 'biolink:Drug' category in the node-type mapping.
drug_type_code = pos_training.get_node_types_reverse_mapping().index('biolink:Drug')
# Indices of all nodes whose type equals that code, and their CURIEs.
drug_type_mask = pos_training.get_node_types() == drug_type_code
drug_idx = list(np.where(drug_type_mask)[0])
drug_names = [node_curies[node_position] for node_position in drug_idx]

In [15]:
# Numeric code of the 'biolink:ChemicalSubstance' category in the node-type mapping.
chem_substance_type_code = pos_training.get_node_types_reverse_mapping().index('biolink:ChemicalSubstance')
# Indices of all nodes whose type equals that code, and their CURIEs.
chem_substance_mask = pos_training.get_node_types() == chem_substance_type_code
chem_substance_idx = list(np.where(chem_substance_mask)[0])
chem_substance_names = [node_curies[node_position] for node_position in chem_substance_idx]

In [16]:
import re

# Select the nodes whose CURIE contains the ChEMBL compound prefix.
chembl_prefix = 'CHEMBL.COMPOUND'
# Compile once (not once per node) and escape the prefix so the "." is matched
# literally instead of as "any character".
chembl_pattern = re.compile(re.escape(chembl_prefix))
# Single pass over the node list; names are then derived from the indices so
# the two lists cannot get out of step.
chembl_idx = [index for index, curie in enumerate(node_curies) if chembl_pattern.search(curie)]
chembl_names = [node_curies[index] for index in chembl_idx]

## Loading trained MLP models

In [17]:
import shutil
import urllib.request  # `import urllib` alone does not load the `request` submodule

os.makedirs(classifier_data_dir, exist_ok=True)

# Fetch every classifier model that is not already on disk, in line with how
# the graph and embedding downloads are guarded.
for url, classifier_file in zip(classifier_urls, classifier_files):
    if os.path.exists(classifier_file):
        continue
    # Download under a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated model behind.
    partial_file = classifier_file + ".part"
    with urllib.request.urlopen(url) as response, \
            open(partial_file, 'wb') as out_file:
        # Stream in chunks instead of holding the whole file in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_file, classifier_file)

In [18]:
import tensorflow as tf

# Pick the model file that corresponds to the configured edge-embedding method
# and load it. `mlp_model` is a pair: (edge-embedding method name, Keras model).
selected_model_position = classifier_edge_models.index(classifier_edge_models_to_use)
selected_model_file = classifier_files[selected_model_position]
mlp_model = (
    classifier_edge_models_to_use,
    tf.keras.models.load_model(selected_model_file),
)

In [19]:
# Entry of `embeddings` at the SARS-CoV-2 node index (looked up from its CURIE earlier).
sars_cov_2_emb = embeddings[sars_cov_2_idx]

In [20]:
# Report which edge-embedding method (first element of `mlp_model`) is in use.
print(f"using {mlp_model[0]} model for edge embeddings")

using average model for edge embeddings


In [21]:
# Exploratory: list the attributes and methods available on the graph object
# (the output includes `get_edge_id`, which a later cell relies on).
dir(graph)

['__and__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'adamic_adar_index',
 'complete_walks',
 'connected_components_number',
 'connected_holdout',
 'contains',
 'cooccurence_matrix',
 'degree',
 'degrees',
 'degrees_mean',
 'degrees_median',
 'degrees_mode',
 'degrees_product',
 'disable_fast_walk',
 'drop_edge_types',
 'drop_node_types',
 'drop_weights',
 'dump_edges',
 'dump_nodes',
 'edge_types_subgraph',
 'enable_fast_walk',
 'from_sorted_csv',
 'from_unsorted_csv',
 'get_dense_node_mapping',
 'get_destination_names',
 'get_destinations',
 'get_edge_id',
 'get_edge_id_string',
 'get_edge_type',
 'get_edge_type_counts

In [None]:
from embiggen import GraphTransformer, EdgeTransformer

# The edge-embedding method is the first element of `mlp_model`; it must be one
# of the methods EdgeTransformer supports.
edge_embedding_method = mlp_model[0]
assert edge_embedding_method in EdgeTransformer.methods

# Turn every edge of the training graph into an edge embedding.
transformer = GraphTransformer(edge_embedding_method)
transformer.fit(embeddings)
train_edges = transformer.transform(pos_training)

# One edge embedding per training edge.
assert len(train_edges) == pos_training.get_edges_number()

In [None]:
# Let's try to predict a link that should exist in the training graph:
# an example SARS-CoV-2 -> ChEMBL compound edge (which should be positive).
example_chembl_idx = chembl_idx[0]
# Fail with a clear message if the example edge is not in the training graph.
assert pos_training.has_edge(sars_cov_2_idx, example_chembl_idx), \
    "example SARS-CoV-2 -> ChEMBL edge is not present in the training graph"
example_chembl_edge = train_edges[pos_training.get_edge_id(sars_cov_2_idx, example_chembl_idx)]
# predict() is given a batch containing this single edge embedding.
# Last expression of the cell, so the prediction is displayed.
mlp_model[1].predict(example_chembl_edge.reshape(1, -1))

In [None]:
# prepare source (drugs) and destination (SARS-CoV-2) numpy arrays
import logging
import numpy as np

# Keep only the drugs that are not already linked to SARS-CoV-2 (in either
# direction) in the training graph; those are the candidates to score.
unlinked_drug_ids = []
for drug_id in drug_idx:
    already_linked = (
        pos_training.has_edge(drug_id, sars_cov_2_idx)
        or pos_training.has_edge(sars_cov_2_idx, drug_id)
    )
    if already_linked:
        logging.warning("Not using edge %s %s which was present in training graph", drug_id, sars_cov_2_idx)
        continue
    unlinked_drug_ids.append(drug_id)

drug_idx_wo_exist_links = np.asarray(unlinked_drug_ids)
# Destination array: the SARS-CoV-2 index repeated once per candidate drug.
sars_cov_2_dest = np.repeat(sars_cov_2_idx, repeats=len(drug_idx_wo_exist_links))

In [None]:
# Make an edge transformer for drug -> SARS-CoV-2 edge for every member of drug_idx

from embiggen import GraphTransformer, EdgeTransformer
# The edge-embedding method is the first element of `mlp_model`; it must be one
# of the methods EdgeTransformer supports.
edge_embedding_method = mlp_model[0]
assert edge_embedding_method in EdgeTransformer.methods

edge_transformer = EdgeTransformer(edge_embedding_method)
# Sources and destinations are paired element-wise, so their shapes must match.
assert drug_idx_wo_exist_links.shape == sars_cov_2_dest.shape
edge_transformer.fit(embeddings)
drug_edges = edge_transformer.transform(
    sources=drug_idx_wo_exist_links,
    destinations=sars_cov_2_dest,
)
# One edge embedding per candidate drug.
assert len(drug_edges) == len(drug_idx_wo_exist_links)

# Model output for every candidate drug -> SARS-CoV-2 edge.
# NOTE(review): despite the name, this is presumably a model score rather than
# a statistical p-value — confirm.
drug_prediction_pval = mlp_model[1].predict(drug_edges)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Histogram of the model outputs for all candidate drug edges.
fig, ax = plt.subplots()
ax.hist(drug_prediction_pval, bins=30, density=False)  # `density=False` would make counts
ax.set_ylabel('counts')
ax.set_xlabel('pval');

In [None]:
# Positions of the predictions, ordered from highest to lowest value.
sort_idx = np.argsort(drug_prediction_pval, axis=0)[::-1]

# Fetch the id -> name mapping once instead of on every loop iteration.
node_names = pos_training.get_nodes_reverse_mapping()
# Flatten the predictions so each lookup yields a scalar that `%f` can format.
prediction_scores = np.ravel(drug_prediction_pval)

with open("drug_sars_cov2_link_prediction.tsv", "w") as out:
    out.write("graph_id\tCURIE\tpval\n")
    for idx in sort_idx.ravel(order='F'):
        graph_id = int(drug_idx_wo_exist_links[idx])
        # Rows are tab-separated so they line up with the header of the .tsv.
        line = "%i\t%s\t%f\n" % (graph_id, node_names[graph_id], prediction_scores[idx])
        print(line, end="")  # `line` already ends with a newline
        out.write(line)