## Loading the graphs
We load the graph as a weighted undirected graph.

To load the graph we use Embiggen's sister package, [Ensmallen](https://github.com/LucaCappelletti94/ensmallen_graph). Ensmallen is a Rust library with Python bindings that handles the processing of graph files and the preprocessing of data needed to train embedding models quickly.

In [1]:
from ensmallen_graph import EnsmallenGraph

# Build the knowledge graph from the merged-KG edge and node TSV files.
# NOTE(review): no weights column is passed here, although the text above
# describes the graph as weighted — confirm which is intended.
graph = EnsmallenGraph.from_csv(
    # Edge list: endpoints come from the `subject` / `object` columns and the
    # edge type from `edge_label`.
    edge_path="/home/jtr4v/merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    edge_types_column="edge_label",
    default_edge_type="biolink:association",
    ignore_duplicated_edges=True,
    # Node list: node names come from `id` and the node type from `category`.
    node_path="/home/jtr4v/merged-kg_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_nodes=True,
    # Load as an undirected graph.
    directed=False,
    force_conversion_to_undirected=True,
)

In [2]:
# Summary statistics of the full graph.
graph.report()

{'strongly_connected_components_number': '8976',
 'nodes_number': '375365',
 'unique_edge_types_number': '211',
 'unique_node_types_number': '36',
 'connected_components_number': '8976',
 'traps_rate': '0.021906677500566116',
 'degrees_mean': '82.22343319169342',
 'edges_number': '30863799',
 'degrees_min': '0',
 'singleton_nodes': '8223',
 'degrees_mode': '1',
 'selfloops_rate': '0.00001539019872440201',
 'degrees_max': '90378',
 'density': '0.00021904928054478553',
 'is_directed': 'false',
 'degrees_median': '6',
 'bidirectional_rate': '1'}

In [3]:
# Split the graph's edges into a training graph and a validation graph.
# NOTE(review): 42 is presumably the random seed and 0.8 the fraction of edges
# kept for training — confirm against the Ensmallen `connected_holdout` docs.
training, validation = graph.connected_holdout(42, 0.8)

In [4]:
# Summary statistics of the training split.
training.report()

{'degrees_max': '71815',
 'is_directed': 'false',
 'strongly_connected_components_number': '8976',
 'density': '0.0001752394301136591',
 'nodes_number': '375365',
 'singleton_nodes': '8231',
 'degrees_mean': '65.77874868461365',
 'unique_node_types_number': '36',
 'selfloops_rate': '0.000015066194052579398',
 'degrees_mode': '1',
 'traps_rate': '0.021927990089646077',
 'connected_components_number': '8976',
 'bidirectional_rate': '1',
 'unique_edge_types_number': '211',
 'degrees_median': '5',
 'edges_number': '24691040',
 'degrees_min': '0'}

In [5]:
# Summary statistics of the validation split.
validation.report()

{'selfloops_rate': '0.000016686217621650222',
 'bidirectional_rate': '1',
 'nodes_number': '375365',
 'degrees_median': '1',
 'unique_edge_types_number': '211',
 'connected_components_number': '162436',
 'unique_node_types_number': '36',
 'degrees_mean': '16.444684507079774',
 'singleton_nodes': '158026',
 'is_directed': 'false',
 'edges_number': '6172759',
 'density': '0.00004380985043112644',
 'degrees_mode': '0',
 'degrees_min': '0',
 'traps_rate': '0.42099290024376274',
 'strongly_connected_components_number': '162436',
 'degrees_max': '18563'}

The following checks are not strictly necessary, but are offered as sanity checks:

In [6]:
# NOTE(review): `>` presumably tests that `graph` contains the other graph,
# i.e. that both splits are subgraphs of the full graph — confirm in the
# Ensmallen docs.
assert graph > training
assert graph > validation

## Loading the embeddings

In [7]:
import numpy as np

# Load the pre-computed node embeddings from disk; a later assertion checks
# that there is one row per node of `training`.
embeddings = np.load("/home/jtr4v/SkipGram_embedding.npy")

In [8]:
# Node names as a plain list so that `.index(name)` can be used for lookups.
# NOTE(review): position i is presumably the name of node i, and hence of
# row i of `embeddings` — confirm.
node_names = list(np.array(training.nodes_reverse_mapping))

In [9]:
# Sanity check: exactly one embedding row per node name.
assert len(training.nodes_reverse_mapping) == len(embeddings)

In [10]:
## Rank ChEMBL antivirals by cosine sim to SARS-CoV-2

In [11]:
# Name of the node used as the SARS-CoV-2 query in the similarity rankings below.
sars_cov_2_name = 'CHEMBL.TARGET:CHEMBL4303835'

In [12]:
# Position of the SARS-CoV-2 node in `node_names`, later used to index into
# `embeddings`. Raises ValueError if the name is not present.
sars_cov_2_idx = node_names.index(sars_cov_2_name)

In [13]:
# Read the compound names to rank, one name per line of the text file.
with open('/home/jtr4v/sars_cov_2_compounds.txt') as f:
    chembl_antiviral_names = f.read().splitlines()

In [14]:
# Map every node name to its position in `node_names` once, so that each
# compound lookup is O(1). Calling `node_names.index(...)` per compound would
# rescan the whole node list for every one of them.
# Iterating in reverse makes the earliest position win for a repeated name,
# which matches what `list.index` returns.
node_position_by_name = {
    name: position
    for position, name in reversed(list(enumerate(node_names)))
}

# Keep raising ValueError for unknown names (as `list.index` does), but report
# all of them at once instead of stopping at the first.
missing_antiviral_names = [
    av for av in chembl_antiviral_names if av not in node_position_by_name
]
if missing_antiviral_names:
    raise ValueError(
        f"{len(missing_antiviral_names)} compound name(s) not found among the "
        f"graph nodes, e.g. {missing_antiviral_names[:5]}"
    )

# Position of each compound in `node_names`, in the order of `chembl_antiviral_names`.
chembl_antiviral_idx = [node_position_by_name[av] for av in chembl_antiviral_names]

In [15]:
# Embedding vector of the SARS-CoV-2 node.
sars_cov_2_emb = embeddings[sars_cov_2_idx]
sars_cov_2_emb.shape  # not displayed: only the cell's last expression is shown

# Embedding vector of the first compound in the list, used as a single worked example.
antiviral_emb = embeddings[chembl_antiviral_idx[0]]
antiviral_emb.shape

(100,)

In [16]:
from scipy import spatial
# scipy returns the cosine *distance*; subtracting it from 1 gives the cosine similarity.
1 - spatial.distance.cosine(antiviral_emb, sars_cov_2_emb)

0.9491932392120361

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Cosine similarity between the SARS-CoV-2 embedding and the embedding of every
# compound, in the same order as `chembl_antiviral_idx`.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

chembl_antiviral_cosine_sim = [
    1 - spatial.distance.cosine(embeddings[compound_idx], sars_cov_2_emb)
    for compound_idx in tqdm(chembl_antiviral_idx)
]

100%|██████████| 6900/6900 [00:00<00:00, 19721.91it/s]


In [18]:
# [name, similarity] pairs, ordered from most to least similar to SARS-CoV-2.
chembl_antiviral_names_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(chembl_antiviral_names, chembl_antiviral_cosine_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [31]:
# The ten compounds most similar to SARS-CoV-2.
chembl_antiviral_names_ranked[0:10]

[['CHEMBL.COMPOUND:CHEMBL58', 0.9827765822410583],
 ['CHEMBL.COMPOUND:CHEMBL289277', 0.9827749133110046],
 ['CHEMBL.COMPOUND:CHEMBL297453', 0.9827325344085693],
 ['CHEMBL.COMPOUND:CHEMBL1484', 0.9821076989173889],
 ['CHEMBL.COMPOUND:CHEMBL205596', 0.981408417224884],
 ['CHEMBL.COMPOUND:CHEMBL226335', 0.9810521602630615],
 ['CHEMBL.COMPOUND:CHEMBL224867', 0.980166494846344],
 ['CHEMBL.COMPOUND:CHEMBL145', 0.9799341559410095],
 ['CHEMBL.COMPOUND:CHEMBL1464', 0.9798558950424194],
 ['CHEMBL.COMPOUND:CHEMBL58510', 0.9797195196151733]]

In [20]:
## Rank all drugs by cosine sim to SARS-CoV-2

In [21]:
# Sanity check: one node-type entry per embedding row.
assert len(training.node_types) == len(embeddings)

In [22]:
# Position of 'biolink:Drug' in the node-type mapping; this is the value the
# per-node entries of `training.node_types` are compared against.
drug_type_id = training.node_types_reverse_mapping.index('biolink:Drug')

# Positions of all nodes whose type entry equals that id.
is_drug_node = training.node_types == drug_type_id
drug_idx = list(np.where(is_drug_node)[0])

In [23]:
# Names of the drug nodes, aligned element-for-element with `drug_idx`.
drug_names = [node_names[i] for i in drug_idx]

In [24]:
# Cosine similarity between the SARS-CoV-2 embedding and the embedding of every
# drug node, in the same order as `drug_idx`.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

drug_cosine_sim = [
    1 - spatial.distance.cosine(embeddings[drug_position], sars_cov_2_emb)
    for drug_position in tqdm(drug_idx)
]

100%|██████████| 32109/32109 [00:01<00:00, 19558.92it/s]


In [25]:
# [name, similarity] pairs for all drug nodes, most similar to SARS-CoV-2 first.
drugs_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(drug_names, drug_cosine_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [26]:
# The five drug nodes most similar to SARS-CoV-2.
drugs_ranked[0:5]

[['ttd.drug:D0A0UL', 0.968268096446991],
 ['ttd.drug:D0K0KR', 0.9675211906433105],
 ['ttd.drug:D0GP1I', 0.9673413038253784],
 ['ttd.drug:D0MM7G', 0.9671435356140137],
 ['ttd.drug:D03ERS', 0.9667762517929077]]

In [27]:
## Sort everything by closeness to SARS-CoV-2

# Sort all the things

In [28]:
# Cosine similarity between the SARS-CoV-2 embedding and the embedding of every
# node, in `node_names` order.
sars_cov_2_emb = embeddings[sars_cov_2_idx]

node_sim = [
    1 - spatial.distance.cosine(embeddings[node_position], sars_cov_2_emb)
    for node_position in tqdm(range(len(node_names)))
]

100%|██████████| 375365/375365 [00:18<00:00, 19760.77it/s]


In [29]:
# [name, similarity] pairs for every node, most similar to SARS-CoV-2 first.
everything_ranked = [
    list(name_and_sim)
    for name_and_sim in sorted(
        zip(node_names, node_sim),
        key=lambda name_and_sim: name_and_sim[1],
        reverse=True,
    )
]

In [30]:
# The ten nodes closest to SARS-CoV-2; the first entry is the query node
# itself (similarity 1.0).
everything_ranked[0:10]

[['CHEMBL.TARGET:CHEMBL4303835', 1.0],
 ['CORD:26762988147d906e873f16c6e03cbb2ad7c46ea1', 0.9859899878501892],
 ['CORD:19e18a7c27512439dac91fc1096937fd03496c13', 0.9857174754142761],
 ['MESH:D018883', 0.9851577877998352],
 ['CORD:e1d54d1ec033522012a4f994961fec6582c17ce6', 0.9846948385238647],
 ['MESH:D008570', 0.9845601320266724],
 ['CORD:PMC7088121', 0.984553337097168],
 ['CORD:b96d1a769045501e7ba79edf624608b6f2fe5b95', 0.9840669631958008],
 ['CORD:PMC7169570', 0.9838549494743347],
 ['CORD:PMC1592083', 0.9837960004806519]]