## Loading the graphs
We load the graph as a weighted undirected graph.

To load the graph we use [Ensmallen](https://github.com/LucaCappelletti94/ensmallen_graph), the sister package of Embiggen. Ensmallen is a Rust library with Python bindings that handles the processing of graph files and the preprocessing of data for quickly training embedding models.

In [1]:
# Directory into which the knowledge-graph archive is downloaded and unpacked.
graph_data_dir = "graph"

In [2]:
import os
import shutil
import subprocess
# `import urllib` alone does not guarantee that the `request` submodule is loaded.
import urllib.request

# Download and unpack the KG-COVID-19 graph archive unless both the edge and
# node TSV files are already present in `graph_data_dir`.
graph_archive_url = "https://zenodo.org/record/4011267/files/kg-covid-19-skipgram-aug-2020.tar.gz"
graph_archive_path = os.path.join(graph_data_dir, "kg-covid-19-skipgram-aug-2020.tar.gz")

os.makedirs(graph_data_dir, exist_ok=True)
if not os.path.exists(os.path.join(graph_data_dir, "merged-kg_edges.tsv")) or \
    not os.path.exists(os.path.join(graph_data_dir, "merged-kg_nodes.tsv")):
    with urllib.request.urlopen(graph_archive_url) as response, \
        open(graph_archive_path, 'wb') as out_file:
        # Stream to disk instead of holding the whole archive in memory.
        shutil.copyfileobj(response, out_file)

    # Extract only after a fresh download; `check=True` raises
    # CalledProcessError if tar fails instead of silently returning a status.
    subprocess.run(["tar", "-xzf", graph_archive_path, "-C", graph_data_dir], check=True)

0

In [5]:
from ensmallen_graph import EnsmallenGraph

graph = EnsmallenGraph.from_csv(
    # Edge file and the columns read from it.
    edge_path=os.path.join(graph_data_dir, "merged-kg_edges.tsv"),
    sources_column="subject",
    destinations_column="object",
    edge_types_column="edge_label",
    default_edge_type="biolink:association",
    ignore_duplicated_edges=True,
    # Node file and the columns read from it.
    node_path=os.path.join(graph_data_dir, "merged-kg_nodes.tsv"),
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_nodes=True,
    # Request an undirected graph.
    # NOTE(review): presumably `force_conversion_to_undirected` adds the
    # missing reverse edges — confirm against the Ensmallen documentation.
    directed=False,
    force_conversion_to_undirected=True,
)

In [6]:
# Summary statistics of the full graph (node/edge counts, degrees, density, components).
graph.report()

{'connected_components_number': '8976',
 'degrees_max': '90378',
 'degrees_min': '0',
 'nodes_number': '375365',
 'singleton_nodes': '8223',
 'traps_rate': '0.021906677500566116',
 'degrees_mode': '1',
 'strongly_connected_components_number': '8976',
 'degrees_mean': '82.22343319169342',
 'unique_edge_types_number': '211',
 'edges_number': '30863799',
 'density': '0.00021904928054478553',
 'bidirectional_rate': '1',
 'selfloops_rate': '0.00001539019872440201',
 'is_directed': 'false',
 'unique_node_types_number': '36',
 'degrees_median': '6'}

In [7]:
# Split the graph's edges into a training graph and a validation graph.
# NOTE(review): the positional arguments are presumably (random seed,
# training fraction) — confirm against the `connected_holdout` documentation.
training, validation = graph.connected_holdout(42, 0.8)

In [8]:
# Summary statistics of the training graph, for comparison with the full graph.
training.report()

{'traps_rate': '0.021927990089646077',
 'strongly_connected_components_number': '8976',
 'degrees_median': '5',
 'nodes_number': '375365',
 'edges_number': '24691040',
 'singleton_nodes': '8231',
 'degrees_mean': '65.77874868461365',
 'unique_edge_types_number': '211',
 'connected_components_number': '8976',
 'unique_node_types_number': '36',
 'selfloops_rate': '0.000015066194052579398',
 'density': '0.0001752394301136591',
 'degrees_max': '71815',
 'is_directed': 'false',
 'degrees_mode': '1',
 'bidirectional_rate': '1',
 'degrees_min': '0'}

In [9]:
# Summary statistics of the validation graph, for comparison with the full graph.
validation.report()

{'density': '0.00004380985043112644',
 'degrees_mean': '16.444684507079774',
 'nodes_number': '375365',
 'degrees_mode': '0',
 'traps_rate': '0.42099290024376274',
 'bidirectional_rate': '1',
 'unique_node_types_number': '36',
 'connected_components_number': '162436',
 'selfloops_rate': '0.000016686217621650222',
 'strongly_connected_components_number': '162436',
 'degrees_max': '18563',
 'degrees_min': '0',
 'edges_number': '6172759',
 'unique_edge_types_number': '211',
 'singleton_nodes': '158026',
 'degrees_median': '1',
 'is_directed': 'false'}

The following checks are not strictly necessary, but are offered as sanity checks:

In [10]:
# Both holdout graphs are expected to be contained in the full graph.
# NOTE(review): presumably `>` on EnsmallenGraph is a strict-containment
# test — confirm against the Ensmallen documentation.
assert graph > training
assert graph > validation

## Loading the embeddings

In [13]:
import shutil
import urllib.request  # do not rely on another cell having loaded the submodule

# Source of the pre-computed SkipGram embedding:
# https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy?download=1
embedding_dir = "embeddings"
embedding_file = os.path.join(embedding_dir, "SkipGram_embedding.npy")
os.makedirs(embedding_dir, exist_ok=True)

if not os.path.exists(embedding_file):
    # Download to a temporary name and rename on success, so an interrupted
    # download never leaves a truncated file that the guard above would accept.
    partial_embedding_file = embedding_file + ".part"
    with urllib.request.urlopen("https://zenodo.org/record/4019808/files/SkipGram_80_20_training_test_epoch_500_delta_0.0001_embedding.npy") as response, \
        open(partial_embedding_file, 'wb') as out_file:
        # Stream to disk instead of holding the whole file in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_embedding_file, embedding_file)

In [14]:
import numpy as np

# Load the downloaded SkipGram embedding array; later cells index it by node position.
embeddings = np.load(embedding_file)

In [15]:
# Node names as a list, so a node's position can be found with `.index()`.
# NOTE(review): assumes position i in this list corresponds to embeddings[i] — confirm.
node_names = list(np.array(training.nodes_reverse_mapping))

In [16]:
# Sanity check: the embedding has exactly one row per node of the training graph.
assert len(training.nodes_reverse_mapping) == len(embeddings)

In [17]:
## Rank ChEMBL antivirals by cosine sim to SARS-CoV-2

In [18]:
# Node name used for SARS-CoV-2 in this notebook (a CHEMBL.TARGET identifier).
sars_cov_2_name = 'CHEMBL.TARGET:CHEMBL4303835'

In [19]:
# Position of the SARS-CoV-2 node in `node_names`; used below to pick its
# embedding row. `.index()` raises ValueError if the name is absent.
sars_cov_2_idx = node_names.index(sars_cov_2_name)

In [38]:
import shutil
import urllib.request  # do not rely on another cell having loaded the submodule

compounds_dir = "compounds"
compounds_file = os.path.join(compounds_dir, "sars_cov_2_compounds.txt")
# The raw ChEMBL node table is written to, and read back from, this one path.
# (Previously it was written as "nodes.txt" but read as "nodes.tsv".)
chembl_nodes_file = os.path.join(compounds_dir, "nodes.tsv")
os.makedirs(compounds_dir, exist_ok=True)

if not os.path.exists(compounds_file):
    with urllib.request.urlopen("https://kg-hub.berkeleybop.io/kg-covid-19/20201001/transformed/ChEMBL/nodes.tsv") as response, \
        open(chembl_nodes_file, 'wb') as out_file:
        # Stream to disk instead of holding the whole table in memory.
        shutil.copyfileobj(response, out_file)

    # Pure-Python replacement for
    #   cut -f 11 nodes.tsv | grep CHEMBL.COMPOUND: | sort | uniq
    # It runs only after the download file has been closed (and so flushed).
    compound_ids = set()
    with open(chembl_nodes_file, encoding="utf-8", errors="replace") as nodes_tsv:
        for row in nodes_tsv:
            columns = row.rstrip("\n").split("\t")
            # Column 11 (index 10) holds the value of interest; skip short rows.
            if len(columns) >= 11 and "CHEMBL.COMPOUND:" in columns[10]:
                compound_ids.add(columns[10])

    # One compound identifier per line, sorted and de-duplicated.
    with open(compounds_file, "w", encoding="utf-8") as compounds_out:
        compounds_out.writelines(compound_id + "\n" for compound_id in sorted(compound_ids))

In [None]:
# Read the compound list, one name per line.
with open(compounds_file) as compounds_fh:
    compounds_text = compounds_fh.read()
chembl_antiviral_names = compounds_text.splitlines()

In [40]:
# Map every node name to its first position once, instead of scanning the
# whole node list with `.index()` for each compound.
node_position = {}
for position, name in enumerate(node_names):
    node_position.setdefault(name, position)

# Keep the ValueError that `list.index` would raise for an unknown name,
# but report every missing compound at once.
missing_compounds = [av for av in chembl_antiviral_names if av not in node_position]
if missing_compounds:
    raise ValueError(
        f"{len(missing_compounds)} compound(s) not found among the graph nodes, "
        f"e.g. {missing_compounds[:5]}"
    )

chembl_antiviral_idx = [node_position[av] for av in chembl_antiviral_names]

In [41]:
# NOTE(review): this cell repeats the previous cell verbatim; running it
# recomputes the same list.
chembl_antiviral_idx = [node_names.index(av) for av in chembl_antiviral_names]

In [42]:
# Embedding vectors of SARS-CoV-2 and of the first compound in the list.
sars_cov_2_emb = embeddings[sars_cov_2_idx]
antiviral_emb = embeddings[chembl_antiviral_idx[0]]

# Only a cell's final expression is rendered, so show both shapes together.
sars_cov_2_emb.shape, antiviral_emb.shape

(100,)

In [43]:
from scipy import spatial
# Spot check on one compound: cosine similarity = 1 - cosine distance.
1 - spatial.distance.cosine(antiviral_emb, sars_cov_2_emb)

0.9501006603240967

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Cosine similarity (1 - cosine distance) between the SARS-CoV-2 embedding
# and the embedding of every compound, in the order of `chembl_antiviral_idx`.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

chembl_antiviral_cosine_sim = [
    1 - spatial.distance.cosine(embeddings[compound_idx], sars_cov_2_emb)
    for compound_idx in tqdm(chembl_antiviral_idx)
]

100%|██████████| 6900/6900 [00:00<00:00, 7126.65it/s]


In [45]:
# [name, similarity] pairs ordered from most to least similar; the sort is
# keyed on similarity only, so ties keep their original relative order.
chembl_antiviral_names_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(chembl_antiviral_names, chembl_antiviral_cosine_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [46]:
import csv

# Persist the ranking as CSV rows of (compound name, cosine similarity).
with open("chembl_antiviral_names_ranked.csv", "w", newline="") as ranked_csv:
    csv.writer(ranked_csv).writerows(chembl_antiviral_names_ranked)

In [47]:
# Show only the head of the ranking; rendering the full list floods the
# notebook with output. The complete ranking is in the CSV written above.
chembl_antiviral_names_ranked[:20]

[['CHEMBL.COMPOUND:CHEMBL196', 0.9582940340042114],
 ['CHEMBL.COMPOUND:CHEMBL599', 0.9576287865638733],
 ['CHEMBL.COMPOUND:CHEMBL86', 0.95735102891922],
 ['CHEMBL.COMPOUND:CHEMBL123292', 0.9570654630661011],
 ['CHEMBL.COMPOUND:CHEMBL1451', 0.9558048844337463],
 ['CHEMBL.COMPOUND:CHEMBL1200907', 0.9555789232254028],
 ['CHEMBL.COMPOUND:CHEMBL33', 0.9554030895233154],
 ['CHEMBL.COMPOUND:CHEMBL660', 0.9553320407867432],
 ['CHEMBL.COMPOUND:CHEMBL575060', 0.9551365971565247],
 ['CHEMBL.COMPOUND:CHEMBL932', 0.9549105763435364],
 ['CHEMBL.COMPOUND:CHEMBL1496', 0.9549060463905334],
 ['CHEMBL.COMPOUND:CHEMBL1485', 0.9540911912918091],
 ['CHEMBL.COMPOUND:CHEMBL701', 0.9540061354637146],
 ['CHEMBL.COMPOUND:CHEMBL1201774', 0.9539457559585571],
 ['CHEMBL.COMPOUND:CHEMBL633', 0.9536402225494385],
 ['CHEMBL.COMPOUND:CHEMBL1046', 0.9536309838294983],
 ['CHEMBL.COMPOUND:CHEMBL477197', 0.9535964727401733],
 ['CHEMBL.COMPOUND:CHEMBL1747', 0.9535775184631348],
 ['CHEMBL.COMPOUND:CHEMBL50', 0.95353138446807

In [48]:
# Ten compounds most similar to SARS-CoV-2.
chembl_antiviral_names_ranked[:10]

[['CHEMBL.COMPOUND:CHEMBL196', 0.9582940340042114],
 ['CHEMBL.COMPOUND:CHEMBL599', 0.9576287865638733],
 ['CHEMBL.COMPOUND:CHEMBL86', 0.95735102891922],
 ['CHEMBL.COMPOUND:CHEMBL123292', 0.9570654630661011],
 ['CHEMBL.COMPOUND:CHEMBL1451', 0.9558048844337463],
 ['CHEMBL.COMPOUND:CHEMBL1200907', 0.9555789232254028],
 ['CHEMBL.COMPOUND:CHEMBL33', 0.9554030895233154],
 ['CHEMBL.COMPOUND:CHEMBL660', 0.9553320407867432],
 ['CHEMBL.COMPOUND:CHEMBL575060', 0.9551365971565247],
 ['CHEMBL.COMPOUND:CHEMBL932', 0.9549105763435364]]

In [49]:
## Rank all drugs by cosine sim to SARS-CoV-2

In [50]:
# Sanity check: there is one node-type entry per embedding row.
assert len(training.node_types) == len(embeddings)

In [51]:
# Look up the position of 'biolink:Drug' in the node-type mapping, then
# collect the positions of all nodes whose type equals it.
drug_type_id = training.node_types_reverse_mapping.index('biolink:Drug')
is_drug = training.node_types == drug_type_id
drug_idx = list(np.where(is_drug)[0])

In [52]:
# Names of the drug nodes, in the same order as `drug_idx`.
drug_names = [node_names[node_position] for node_position in drug_idx]

In [53]:
# Cosine similarity (1 - cosine distance) between the SARS-CoV-2 embedding
# and the embedding of every drug node, in the order of `drug_idx`.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

drug_cosine_sim = [
    1 - spatial.distance.cosine(embeddings[node_position], sars_cov_2_emb)
    for node_position in tqdm(drug_idx)
]

100%|██████████| 32109/32109 [00:02<00:00, 10811.50it/s]


In [54]:
# [name, similarity] pairs ordered from most to least similar; the sort is
# keyed on similarity only, so ties keep their original relative order.
drugs_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(drug_names, drug_cosine_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [55]:
# Five drug nodes most similar to SARS-CoV-2.
drugs_ranked[:5]

[['ttd.drug:D0N0ME', 0.9533385634422302],
 ['ttd.drug:D0Q3KF', 0.9502077102661133],
 ['ttd.drug:D01ERB', 0.9489859342575073],
 ['ttd.drug:D03RFA', 0.9487559795379639],
 ['ttd.drug:D05AHE', 0.9484182596206665]]

In [56]:
## Sort everything by closeness to SARS-CoV-2

# Sort all the things

In [57]:
# Cosine similarity (1 - cosine distance) between the SARS-CoV-2 embedding
# and the embedding at every node position, in the order of `node_names`.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

node_sim = [
    1 - spatial.distance.cosine(embeddings[node_position], sars_cov_2_emb)
    for node_position in tqdm(range(len(node_names)))
]

100%|██████████| 375365/375365 [00:22<00:00, 16354.19it/s]


In [58]:
# [name, similarity] pairs for all nodes, ordered from most to least similar;
# the sort is keyed on similarity only, so ties keep their original order.
everything_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(node_names, node_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [59]:
# The 100 nodes most similar to SARS-CoV-2 (the first entry is the node itself).
everything_ranked[:100]

[['CHEMBL.TARGET:CHEMBL4303835', 1.0],
 ['CORD:b95339c95424c54193d8b0d9e5a6cfbfb7063a79', 0.9867148995399475],
 ['CORD:PMC7230702', 0.9864558577537537],
 ['CORD:PMC7115643', 0.9864391088485718],
 ['CORD:d5beda3a26be237596283be8dd0f79bf29329ad7', 0.9863427877426147],
 ['CORD:PMC7122824', 0.9863361120223999],
 ['CORD:7250e002968b1de78d6166e393226dbd2e092a04', 0.9862306118011475],
 ['CORD:857137889eef45edb1a66c1950f0e0ca27ad63a4', 0.9861706495285034],
 ['CORD:4583a977a2dd4f0b642c7d663d8915df6f947d2c', 0.9860418438911438],
 ['CORD:30feb1888b0a58c03c2f634eb1d55b9de4d31972', 0.986039400100708],
 ['CORD:PMC7129154', 0.9859709739685059],
 ['CORD:PMC7291048', 0.9858973026275635],
 ['CORD:PMC7122656', 0.9858143925666809],
 ['CORD:b143341a359107235fa224d35a28e1740c74c4c4', 0.9858080148696899],
 ['CORD:0d91c60bbed77db2e0060297cc87d2067ce09fe1', 0.9857782125473022],
 ['CORD:PMC7164072', 0.9857408404350281],
 ['CORD:PMC7121112', 0.9857394099235535],
 ['CORD:PMC7164056', 0.9856913089752197],
 ['CORD: