# Graph embedding using SkipGram

This notebook computes an embedding of the whole graph (all sources), using an 80/20 training/validation split.

kg-covid-19:
version 20201012

Name: ensmallen-graph
Version: 0.4.4

Name: embiggen
Version: 0.6.0

In [6]:
from pkg_resources import get_distribution

# Fail early if the installed libraries differ from the versions this notebook targets.
# ensmallen-graph 0.4.4 is identical to 0.4.3 except for addition of some methods like get_edge_id()
assert get_distribution("ensmallen-graph").version == "0.4.4"
assert get_distribution("embiggen").version == "0.6.0"

In [7]:
import os
from urllib.parse import urljoin

# Experiment identifier and remote locations.
exp_name = "80_20_kg_covid_19_20201012_training_test_epoch_500_delta_0.0001_updated_holdouts"
s3_path = "s3://kg-hub-public-data/embeddings/20201012/"  # keep trailing slash
base_url = "https://kg-hub.berkeleybop.io/embeddings/20201012/"  # keep trailing slash (urljoin would drop the last segment otherwise)

# Local directories for everything the notebook downloads or caches.
base_dl_dir = "downloaded_data"
graph_data_dir = os.path.join(base_dl_dir, "kg-covid-19-20201012")
embedding_data_dir = os.path.join(base_dl_dir, "embeddings-20201012")
pos_neg_data_dir = os.path.join(base_dl_dir, "pos_neg_data_dir")

# graph stuff
graph_out_file = os.path.join(graph_data_dir, "kg-covid-19.tar.gz")
nodes_file = os.path.join(graph_data_dir, "merged-kg_nodes.tsv")
edges_file = os.path.join(graph_data_dir, "merged-kg_edges.tsv")
sorted_edges_file = os.path.join(graph_data_dir, "merged-kg_edges_SORTED.tsv")
graph_tar_url = "https://kg-hub.berkeleybop.io/kg-covid-19/20201012/kg-covid-19.tar.gz"

# embeddings URLs
# URLs are joined with urljoin rather than os.path.join, which would insert
# the platform path separator (a backslash on Windows).
base_kghub_url = "http://kg-hub.berkeleybop.io/"
embeddings_url = urljoin(base_kghub_url, "embeddings/20201012/SkipGram_80_20_kg_covid_19_20201012_training_test_epoch_500_delta_0.0001_embedding.npy")
embedding_file = os.path.join(embedding_data_dir, "SkipGram_embedding.npy")

# pos/neg nodes for better pos/neg edge set
pos_node_url = urljoin(base_url, "positive_nodes.tsv")
pos_node_file = os.path.join(pos_neg_data_dir, "positive_nodes.tsv")
neg_node_url = urljoin(base_url, "negative_nodes.tsv")
neg_node_file = os.path.join(pos_neg_data_dir, "negative_nodes.tsv")


# params
seed = 42  # base random state; the holdout index is added to it for each holdout
train_percentage = 0.8  # fraction of edges kept for training in each holdout

In [8]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

### Load the positive and negative nodes

In [9]:
import urllib.request  # plain `import urllib` does not guarantee the `request` submodule is loaded
import os
import pandas as pd

os.makedirs(pos_neg_data_dir, exist_ok=True)

# Download each node list only when it is not already present locally.
for node_url, node_file in (
    (pos_node_url, pos_node_file),
    (neg_node_url, neg_node_file),
):
    if not os.path.exists(node_file):
        with urllib.request.urlopen(node_url) as response, \
                open(node_file, 'wb') as out_file:
            out_file.write(response.read())  # response.read() is a `bytes` object

# `sep` is passed by keyword: pandas no longer accepts it positionally.
positive_nodes = pd.read_csv(pos_node_file, sep="\t", header=0, comment='#')
negative_nodes = pd.read_csv(neg_node_file, sep="\t", header=0, comment='#')

## Loading the graphs
We load the kg-covid-19 graph from the repository as an undirected graph.

In [10]:
# get the graphs, if necessary

import os
import shutil
import tarfile
import urllib.request  # plain `import urllib` does not guarantee the `request` submodule is loaded

os.makedirs(graph_data_dir, exist_ok=True)

if not os.path.exists(nodes_file) or not os.path.exists(edges_file):
    # Stream the archive to disk instead of holding the whole tarball in memory.
    with urllib.request.urlopen(graph_tar_url) as response, \
            open(graph_out_file, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    # Extract with the standard library rather than shelling out to `tar`:
    # no string-built shell command, and a failed extraction raises instead
    # of being silently ignored.
    with tarfile.open(graph_out_file, "r:gz") as archive:
        archive.extractall(path=graph_data_dir)

#### This only needs to be done once: the cell below writes `edges_with_holdout_column.tsv`, and later runs load that file instead of rebuilding it

In [11]:
# Node identifier used for SARS-CoV-2 throughout the notebook.
sars_cov_2_curie = "NCBITaxon:2697049"
# Substring looked for in edge endpoints to recognise ChEMBL compound nodes.
chembl_prefix = "CHEMBL.COMPOUND"

In [12]:
import pandas as pd
import os

new_edge_file = os.path.join(graph_data_dir, 'edges_with_holdout_column.tsv')

# Build (once) a copy of the edge list with an extra column that marks the
# edges linking a ChEMBL compound to one of the positive nodes.
if not os.path.exists(new_edge_file):
    edges = pd.read_csv(
        edges_file,
        sep="\t",
        usecols=[1, 3],
        dtype={'subject': str, 'object': str}
    )

    pos_node_curies_set = set(positive_nodes.curie)

    # regex=False: the prefix is matched literally (as a regex, the "." in
    # "CHEMBL.COMPOUND" would match any character).
    # na=False: a missing endpoint counts as "no match" instead of
    # propagating NaN into the boolean mask.
    subject_is_chembl = edges.subject.str.contains(chembl_prefix, regex=False, na=False)
    object_is_chembl = edges.object.str.contains(chembl_prefix, regex=False, na=False)

    chembl_to_sars_cov_2_edges = (
        (subject_is_chembl & edges.object.isin(pos_node_curies_set)) |
        (object_is_chembl & edges.subject.isin(pos_node_curies_set))
    )

    edges['holdout_edge_label'] = chembl_to_sars_cov_2_edges.map(
        {True: 'chembl_to_sars_cov_2', False: 'normal'}
    )

    edges.to_csv(new_edge_file, sep="\t", index=False)

In [13]:
from ensmallen_graph import EnsmallenGraph

# Load the labelled edge list and the node list as an undirected graph.
# The holdout label column is used as the edge type so that the
# 'chembl_to_sars_cov_2' edges can be selected later.
graph = EnsmallenGraph.from_unsorted_csv(
    name="kg-covid-19",
    edge_path=new_edge_file,
    sources_column="subject",
    destinations_column="object",
    edge_types_column='holdout_edge_label',
    directed=False,
    node_path=nodes_file,  # same file as before, taken from the config cell instead of a hard-coded path
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing'
)

In [14]:
# Display the textual summary of the full graph loaded above.
graph

The undirected graph kg-covid-19 has 447766 nodes with 42 different node types:  the 5 most common are biolink:Publication (nodes number 129930), biolink:OntologyClass (nodes number 108266), biolink:Drug (nodes number 32120), biolink:ChemicalSubstance (nodes number 27157) and biolink:Disease (nodes number 24236), of which 8355 are singletons, and 15611957 unweighted edges with 2 different edge types: normal and chembl_to_sars_cov_2, of which 480 are self-loops. The graph is quite sparse as it has a density of 0.00016 and has 9107 connected components, where the component with most nodes has 435728 nodes and the component with the least nodes has 1 nodes. The graph median node degree is 4, the mean node degree is 69.73 and the node degree mode is 1. The top 5 most central nodes are MESH:D014780 (degree 90378), MESH:D006801 (degree 78249), WD:Q30 (degree 65223), MESH:D014777 (degree 54155) and MESH:D017934 (degree 45196).

In [15]:
# Keep only the components selected by the 'chembl_to_sars_cov_2' edge type,
# then drop singleton nodes from the result.
reduced_graph = (
    graph
    .remove_components(edge_types=['chembl_to_sars_cov_2'])
    .remove(singletons=True)
)

In [16]:
# Display the textual summary of the reduced graph built in the previous cell.
reduced_graph

The undirected graph kg-covid-19 has 435728 nodes with 40 different node types:  the 5 most common are biolink:Publication (nodes number 129492), biolink:OntologyClass (nodes number 104951), biolink:Drug (nodes number 32016), biolink:ChemicalSubstance (nodes number 27152) and biolink:Disease (nodes number 22281) and 15608444 unweighted edges with 2 different edge types: normal and chembl_to_sars_cov_2, of which 314 are self-loops. The graph is quite sparse as it has a density of 0.00016 and is connected, as it has a single component. The graph median node degree is 4, the mean node degree is 71.64 and the node degree mode is 1. The top 5 most central nodes are MESH:D014780 (degree 90378), MESH:D006801 (degree 78249), WD:Q30 (degree 65223), MESH:D014777 (degree 54155) and MESH:D017934 (degree 45196).

## Creating the SkipGram model
We are going to set up the model to use multiple GPUs, if available.

In [26]:
from cache_decorator import Cache
from tensorflow.distribute import MirroredStrategy
from tensorflow.keras.optimizers import Nadam
from embiggen import SkipGram
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from embiggen import Node2VecSequence


@Cache(
    cache_path="{cache_dir}/SkipGram/{_hash}_{holdout_idx}.csv.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=['train_graph']
)
def compute_skipgram_embedding(
    train_graph: EnsmallenGraph,
    holdout_idx: int,
    walk_length: int = 100,
    batch_size: int = 2**9,
    iterations: int = 20,
    return_weight: float = 1.0,
    explore_weight: float = 1.0,
    embedding_size: int = 100,
    change_edge_type_weight: float = 1.0,
    window_size: int = 4,
    negative_samples: int = 7,
    patience: int = 6,
    delta: float = 0.1,
    epochs: int = 500
):
    """Return dataframe with node embedding obtained with SkipGram for train_graph

    Given a graph, learn embeddings and return dataframe with node embeddings.
    The result is cached on disk; `train_graph` is excluded from the cache
    key (see `args_to_ignore`), so `holdout_idx` must identify the graph.

    Parameters
    -----
    train_graph: EnsmallenGraph
        graph to run the random walks on
    holdout_idx: int,
        an int to identify the holdout
    walk_length: int = 100,
        how many nodes for each walk
    batch_size: int = 2**9,
        how many walks for each batch
    iterations: int = 20,
        how many walks per node
    return_weight: float = 1.0,
        node2vec param, equal to 1/p
    explore_weight: float = 1.0,
        node2vec param, equal to 1/q
    embedding_size: int = 100,
        dimensions for embedding
    change_edge_type_weight: float = 1.0,
        forwarded unchanged to Node2VecSequence
        (presumably weights walks that change edge type - confirm in embiggen docs)
    window_size: int = 4,
        SkipGram window size
    negative_samples: int = 7,
        how many negative samples that NCE function needs to sample
    patience: int = 6,
        how many epochs to wait for loss fxn to improve by [delta]
    delta: float = 0.1
        change in loss fxn to be considered an improvement
    epochs: int = 500
        maximum number of training epochs (early stopping may end sooner)

    Return:
    -------
    pd.DataFrame containing an embedding for each node in train_graph
    """
    training_sequence = Node2VecSequence(
        train_graph,
        walk_length=walk_length,
        batch_size=batch_size,
        iterations=iterations,
        window_size=window_size,
        return_weight=return_weight,
        explore_weight=explore_weight,
        change_edge_type_weight=change_edge_type_weight,
        support_mirror_strategy=True
    )

    # Build the model inside the distribution strategy scope so that it can
    # use multiple GPUs when they are available.
    strategy = MirroredStrategy()
    with strategy.scope():
        model = SkipGram(
            vocabulary_size=train_graph.get_nodes_number(),
            embedding_size=embedding_size,
            window_size=window_size,
            negative_samples=negative_samples,
        )

    # Both callbacks watch the training loss: the learning rate is reduced
    # after half the patience, training stops after the full patience.
    callbacks = [
        EarlyStopping(
            monitor="loss",
            min_delta=delta,
            patience=patience,
            restore_best_weights=True
        ),
        ReduceLROnPlateau(
            monitor="loss",
            patience=patience//2,
            min_delta=delta
        )
    ]

    # The training history was never used, so it is no longer bound to a name.
    model.fit(
        training_sequence,
        steps_per_epoch=training_sequence.steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks
    )
    return model.get_embedding_dataframe(train_graph.get_nodes_reverse_mapping())

# Link prediction models

In [27]:
from typing import List, Dict, Tuple
import numpy as np

def create_ranking(edges: List[Tuple[str, str]], predictions: np.ndarray) -> Dict:
    """Pair every edge with the prediction at the same position.

    Parameters
    ---------------------
    edges: List[Tuple[str, str]],
        Edges that were scored.
    predictions: np.ndarray,
        Model scores, in the same order as `edges`.

    Returns
    ---------------------
    Dictionary mapping each edge to its score. Pairing stops at the
    shorter of the two inputs; a repeated edge keeps its last score.
    """
    return {edge: score for edge, score in zip(edges, predictions)}

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.metrics import AUC, Recall, Precision
from typing import Dict

@Cache(
    cache_path="{cache_dir}/model_predictions/mlp/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges", "batch_size"]
)
def mlp(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]],
    epochs: int = 500,
    batch_size: int = 256,
    patience: int = 10,
    min_delta: float = 0.000001,
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Return multi-layer perceptron predictions on the given values.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding method.
        Not used in the body (presumably only there to distinguish cache entries).
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.
    epochs: int = 500,
        Maximum number of training epochs.
    batch_size: int = 256,
        Batch size used for training.
    patience: int = 10,
        Epochs without loss improvement before early stopping.
    min_delta: float = 0.000001,
        Minimum loss change counted as an improvement.

    Returns
    ----------------------
    Tuple with training predictions, test predictions (both 1-dimensional)
    and the dictionary ranking of `edges`.
    """
    model = Sequential([
        Input(train_X.shape[1:]),
        Dense(64, activation="relu"),
        Dense(32, activation="relu",
              activity_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.3),
        Dense(8, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=[
            AUC(curve="PR", name="auprc"),
            AUC(curve="ROC", name="auroc"),
            Recall(name="Recall"),
            Precision(name="Precision"),
            "accuracy"
        ]
    )
    model.fit(
        train_X, train_y,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[
            EarlyStopping("loss", patience=patience, min_delta=min_delta)
        ],
        verbose=False
    )
    # Flatten every prediction array so that training predictions have the
    # same 1-dimensional shape as the test predictions and as the other models.
    train_pred = model.predict(train_X).flatten()
    test_pred = model.predict(test_X).flatten()
    return train_pred, test_pred, create_ranking(
        edges, model.predict(graph_test_X).flatten()
    )

In [29]:
"""Submodule offering method to execute cached random forest."""
from typing import Tuple, Dict, List
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import cpu_count
from cache_decorator import Cache

@Cache(
    cache_path="{cache_dir}/model_predictions/random_forest/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def random_forest(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Fit a random forest and score the training, test and ranking inputs.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Inputs the forest is fitted on.
    train_y: np.ndarray,
        Labels the forest is fitted on.
    test_X: np.ndarray,
        Inputs scored as the test set.
    graph_test_X: np.ndarray,
        Inputs scored to build the edge ranking.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with the training scores, the test scores (second column of
    `predict_proba`) and the dictionary ranking of `edges`.
    """
    forest = RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        n_jobs=cpu_count(),
        class_weight="balanced_subsample",
        max_samples=0.5
    )
    forest.fit(train_X, train_y)

    # Column 1 of predict_proba is the score of the second class.
    train_pred = forest.predict_proba(train_X)[:, 1]
    test_pred = forest.predict_proba(test_X)[:, 1]
    ranking_scores = forest.predict_proba(graph_test_X)[:, 1]
    return train_pred, test_pred, create_ranking(edges, ranking_scores)

In [30]:
from typing import Tuple, Dict, List
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from cache_decorator import Cache

@Cache(
    cache_path="{cache_dir}/model_predictions/decision_tree/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def decision_tree(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Fit a decision tree and score the training, test and ranking inputs.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Inputs the tree is fitted on.
    train_y: np.ndarray,
        Labels the tree is fitted on.
    test_X: np.ndarray,
        Inputs scored as the test set.
    graph_test_X: np.ndarray,
        Inputs scored to build the edge ranking.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with the training scores, the test scores (second column of
    `predict_proba`) and the dictionary ranking of `edges`.
    """
    tree = DecisionTreeClassifier(max_depth=10, class_weight="balanced")
    tree.fit(train_X, train_y)

    # Column 1 of predict_proba is the score of the second class.
    train_pred = tree.predict_proba(train_X)[:, 1]
    test_pred = tree.predict_proba(test_X)[:, 1]
    ranking_scores = tree.predict_proba(graph_test_X)[:, 1]
    return train_pred, test_pred, create_ranking(edges, ranking_scores)


In [31]:
from typing import Tuple, Dict, List
import numpy as np
from sklearn.linear_model import LogisticRegression
from cache_decorator import Cache

@Cache(
    cache_path="{cache_dir}/model_predictions/logistic_regression/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def logistic_regression(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Return logistic regression predictions on the given values.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with training predictions, test predictions and the dictionary
    ranking of `edges` (the fitted model itself is not returned).
    """
    model = LogisticRegression(class_weight="balanced", max_iter=1000)
    model.fit(train_X, train_y)
    # Column 1 of predict_proba is the score of the second class.
    train_pred = model.predict_proba(train_X)[:, 1]
    test_pred = model.predict_proba(test_X)[:, 1]
    return train_pred, test_pred, create_ranking(edges, model.predict_proba(graph_test_X)[:, 1])


In [32]:
# Node type names treated as drugs / chemical substances.
nodes_of_interest = [
    "biolink:Drug",
    "biolink:Drug|biolink:ChemicalSubstance",
    "biolink:ChemicalSubstance",
]
# Disabled: restricting the graph to these node types.
# graph_drugs_only = reduced_graph.remove(allow_node_types_set=set(nodes_of_interest))
# graph_drugs_only

In [33]:
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, recall_score, precision_score
from sanitize_ml_labels import sanitize_ml_labels
from typing import Dict
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

def specificity_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the specificity (true negative rate) of binary predictions.

    The counts are computed directly from the label arrays, so the function
    also works when only one class is present (the previous implementation
    unpacked a confusion matrix that is 1x1 in that case and raised).

    Parameters
    ----------------------
    y_true: np.ndarray,
        Ground-truth binary labels; 0 / False marks a negative sample.
    y_pred: np.ndarray,
        Predicted binary labels; 0 / False marks a negative prediction.

    Returns
    ----------------------
    tn / (tn + fp), or NaN when `y_true` contains no negative sample.
    """
    actual_negative = np.asarray(y_true).ravel() == 0
    predicted_negative = np.asarray(y_pred).ravel() == 0
    tn = int(np.count_nonzero(actual_negative & predicted_negative))
    fp = int(np.count_nonzero(actual_negative & ~predicted_negative))
    if tn + fp == 0:
        # Specificity is undefined without negative samples.
        return float("nan")
    return tn / (tn + fp)

def get_metrics_report(y_true: np.ndarray, y_pred: np.ndarray)->Dict[str, float]:
    """Compute every evaluation metric for one set of predictions.

    Metrics that need hard labels are fed the predictions rounded to
    integers; the remaining metrics are fed the raw scores.

    Parameters
    ----------------------
    y_true: np.ndarray,
        Ground-truth labels.
    y_pred: np.ndarray,
        Predicted scores.

    Returns
    ----------------------
    Dictionary mapping the sanitized metric name to its value.
    """
    hard_predictions = y_pred.round().astype(int)
    report = {}

    # Metrics computed on rounded (integer) predictions.
    for score_fn in (accuracy_score, recall_score, precision_score, specificity_score, f1_score):
        report[sanitize_ml_labels(score_fn.__name__)] = score_fn(y_true, hard_predictions)

    # Metrics computed on the raw prediction scores.
    for score_fn in (average_precision_score, roc_auc_score):
        report[sanitize_ml_labels(score_fn.__name__)] = score_fn(y_true, y_pred)

    return report


#### Get pos/neg node files (if necessary), and load from tsv

In [None]:
# make holdouts
from tqdm.auto import trange, tqdm
from embiggen import EdgeTransformer, GraphTransformer, LinkPredictionTransformer

# Link-prediction models evaluated for every holdout and edge-embedding method.
link_prediction_models = [
    mlp, 
    logistic_regression,
    random_forest,
    decision_tree
]

# rankings[edge_embedding_method][model name] -> list with one ranking dict per holdout
rankings = {}
# One metrics record per (run, model, edge embedding method, holdout)
results = []

for holdout in trange(5, desc="computing embeddings"): # TODO: increase holdout to 15-20
    
    # Split the 'chembl_to_sars_cov_2' edges into training / validation graphs;
    # the random state changes with the holdout index.
    pos_training, pos_validation = reduced_graph.connected_holdout(
        train_size=train_percentage, 
        edge_types=['chembl_to_sars_cov_2'],
        random_state=seed + holdout)        

    pos_training.enable_fast_walk()
    # Cached on disk by (hyper-parameters, holdout index).
    embedding = compute_skipgram_embedding(pos_training, holdout)    
    
    # NOTE(review): this unconditional `continue` ends every iteration here, so
    # only the embeddings are computed; everything below in the loop body is
    # unreachable and `rankings` / `results` stay empty after this cell.
    continue
    
    # NOTE(review): `embeddings` is not defined in the visible notebook (the
    # variable assigned above is `embedding`) - this would raise NameError if
    # reached. The `break` after it would also stop after the first holdout.
    sars_cov_2_embed = embeddings[sars_cov_2_curie]
    break
    
    for edge_embedding_method in tqdm(EdgeTransformer.methods,
                                      desc="edge embeddings",
                                      leave=False):
        rankings[edge_embedding_method] = rankings.get(edge_embedding_method, {})

        # Edge embeddings of the validation edges, used only for the ranking.
        graph_transformer = GraphTransformer(method=edge_embedding_method)
        graph_transformer.fit(embedding)
        graph_test_X = graph_transformer.transform(pos_validation)

        transformer = LinkPredictionTransformer(method=edge_embedding_method)
        transformer.fit(embedding)
        # NOTE(review): `neg_training` and `neg_validation` are not defined
        # anywhere in the visible notebook - they must be created before this
        # code can run.
        train_X, train_y = transformer.transform(
            pos_training.filter(edge_types=['chembl_to_sars_cov_2']), 
            neg_training
        )
        test_X, test_y = transformer.transform(pos_validation, neg_validation)
    
        for link_prediction_model in tqdm(link_prediction_models, desc="models", leave=False):
            rankings[edge_embedding_method][link_prediction_model.__name__] = rankings[edge_embedding_method].get(link_prediction_model.__name__, [])            
            train_pred, test_pred, link_prediction_ranking = link_prediction_model(
                holdout=holdout,
                edge_embedding_method=edge_embedding_method,
                embedding_model='SkipGram',
                train_X=train_X,
                train_y=train_y,
                test_X=test_X,
                graph_test_X=graph_test_X,
                edges=pos_validation.get_edge_names()
            )
            rankings[edge_embedding_method][link_prediction_model.__name__].append(link_prediction_ranking)
            
            # Record the metrics of both the training and the test predictions.
            for run, true, predictions in (
                ("train", train_y, train_pred),
                ("test", test_y, test_pred)
            ):
                results.append({
                    "run": run,
                    "model": link_prediction_model.__name__,
                    "edge_embedding_method": edge_embedding_method,
                    "holdout_number": holdout,
                    **get_metrics_report(true, predictions)
                })

HBox(children=(FloatProgress(value=0.0, description='computing embeddings', max=5.0, style=ProgressStyle(descr…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500

## Upload the weights and embeddings

In [22]:
# Display the raw per-holdout rankings (edge embedding method -> model -> list of ranking dicts).
rankings

{'hadamard': {'mlp_cached': [{('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL1200692'): 0.9998072,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL365739'): 0.99999404,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL113'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL231779'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL529'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL600'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL389621'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1643'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL398440'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL790'): 0.9999548,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL165'): 0.99999654,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL267345'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL324842'): 1.0,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL278041'): 0.9999999,
    ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL590799')

In [23]:
def parse_holdouts_ranking(holdouts_ranking: List[Dict]) -> Dict:
    """Average the score of every edge across the holdout rankings.

    Parameters
    ---------------------
    holdouts_ranking: List[Dict],
        One ranking per holdout, each mapping an edge to its score.
        An edge may be missing from some of the rankings.

    Returns
    ---------------------
    Dictionary mapping each edge to the mean of the scores it received,
    in order of first appearance.
    """
    scores_by_edge = {}
    for ranking in holdouts_ranking:
        for edge, value in ranking.items():
            # Append in place instead of rebuilding the list for every score.
            scores_by_edge.setdefault(edge, []).append(value)
    return {
        edge: np.mean(values)
        for edge, values in scores_by_edge.items()
    }

In [24]:
# For every edge embedding method and model, collapse the per-holdout
# rankings into a single ranking with the mean score of each edge.
cumul_ranking = dict(
    (
        method,
        dict(
            (model, parse_holdouts_ranking(per_holdout_rankings))
            for model, per_holdout_rankings in per_model_rankings.items()
        ),
    )
    for method, per_model_rankings in rankings.items()
)

In [25]:
# Display the rankings averaged across holdouts.
cumul_ranking

{'hadamard': {'mlp_cached': {('NCBITaxon:2697049',
    'CHEMBL.COMPOUND:CHEMBL1200692'): 0.99990356,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL365739'): 0.98569524,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL113'): 0.8659186,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL231779'): 0.7710225,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL529'): 0.64331174,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL600'): 0.7276993,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL389621'): 0.64331174,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1643'): 0.6528678,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL398440'): 0.99999976,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL790'): 0.72491956,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL165'): 0.656532,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL267345'): 0.6344971,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL324842'): 0.43634558,
   ('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL278041'): 0.65286773,
   ('NCBITaxo

In [26]:
# Edges of the hadamard / mlp_cached ranking, ordered from highest to lowest mean score.
sorted(cumul_ranking['hadamard']['mlp_cached'].items(), key = lambda x: x[1], reverse=True)

[(('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1433'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL64925'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1539'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL856'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL226335'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL254316'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL212301'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL141'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL22'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL858'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL428496'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL463210'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1581'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL1519'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL277474'), 1.0),
 (('NCBITaxon:2697049', 'CHEMBL.COMPOUND:CHEMBL397983'), 1.0),
 (('NCBITaxon:

In [27]:
# Edge names of the bipartite graph between the nodes whose type is one of
# `nodes_of_interest` and the single SARS-CoV-2 node.
drug_node_types = set(nodes_of_interest)
bp_graph = reduced_graph.get_bipartite_edge_names(
    first_node_types_set=drug_node_types,
    second_nodes_set={sars_cov_2_curie},
)
del drug_node_types  # temporary name only; keep the notebook namespace unchanged

In [28]:
# Display the reduced graph summary again, for reference next to the counts below.
reduced_graph

The undirected graph kg-covid-19 has 435728 nodes with 40 different node types:  the 5 most common are biolink:Publication (nodes number 129492), biolink:OntologyClass (nodes number 104951), biolink:Drug (nodes number 32016), biolink:ChemicalSubstance (nodes number 27152) and biolink:Disease (nodes number 22281) and 15608444 unweighted edges with 2 different edge types: normal and chembl_to_sars_cov_2, of which 314 are self-loops. The graph is quite sparse as it has a density of 0.00016 and is connected, as it has a single component. The graph median node degree is 4, the mean node degree is 71.64 and the node degree mode is 1. The top 5 most central nodes are MESH:D014780 (degree 90378), MESH:D006801 (degree 78249), WD:Q30 (degree 65223), MESH:D014777 (degree 54155) and MESH:D017934 (degree 45196).

In [29]:
# Number of nodes whose type name is nodes_of_interest[1], i.e. 'biolink:Drug|biolink:ChemicalSubstance'.
reduced_graph.get_node_count_by_node_type_name(nodes_of_interest[1])

130

In [30]:
# Same query as the previous cell, with the node type name written out explicitly.
reduced_graph.get_node_count_by_node_type_name('biolink:Drug|biolink:ChemicalSubstance')

130

In [31]:
# Number of entries returned by get_bipartite_edge_names above.
len(bp_graph)

55172

In [None]:
from barplots import barplots

# One row per metrics record collected in `results` by the holdout loop.
# NOTE(review): `results` stays empty while that loop ends each iteration with `continue`.
results_pd = pd.DataFrame(results)


def sort_bars(df:pd.DataFrame)->pd.DataFrame:
    """Order the bars by model name (A-Z), then by run in descending order.

    With the run names used in this notebook, descending order puts
    "train" before "test" within each model.
    """
    sort_columns = ["model", "run"]
    ascending_flags = [True, False]
    return df.sort_values(by=sort_columns, ascending=ascending_flags)

# Display labels paired with the cached model function names
# (passed to barplots as `custom_defaults`).
custom_defaults = {
    "Decision Tree": "decision_tree_cached",
    "Logistic Regression": "logistic_regression_cached",
    "Random Forest": "random_forest_cached",
}

# One subplot per metric; bars are grouped by edge embedding method, model
# and run. The holdout number is dropped before plotting.
# Optional layout tweaks left disabled: height=4, bar_width=0.15,
# space_width=0.1, legend_position="lower right".
figures = barplots(
    results_pd.drop(columns=["holdout_number"]),
    groupby=["edge_embedding_method", "model", "run"],
    orientation="horizontal",
    subplots=True,
    show_legend=False,
    sort_bars=sort_bars,
    custom_defaults=custom_defaults,
    use_multiprocessing=True,
)

In [None]:
import subprocess
from glob import glob

# Upload every cached embedding file to the public S3 location.
# The command is passed as an argument list (no shell, so paths containing
# spaces or shell metacharacters are safe), and check=True makes a failed
# upload raise instead of being silently ignored.
for path in glob(f"{embedding_data_dir}/**/*.csv.gz", recursive=True):
    subprocess.run(
        ["s3cmd", "put", "--acl-public", "--cf-invalidate", path, s3_path],
        check=True,
    )