# Graph embedding using SkipGram

This notebook embeds the whole graph (all sources), using an 80/20 training/validation split.

kg-covid-19:
version 20201012

Name: ensmallen-graph
Version: 0.4.4

Name: embiggen
Version: 0.6.0

In [1]:
from pkg_resources import get_distribution

# Fail fast unless the exact library releases this notebook expects are installed.
# ensmallen-graph 0.4.4 is identical to 0.4.3 except for addition of some methods like get_edge_id()
for _package, _pinned_version in (
    ("ensmallen-graph", '0.4.4'),
    ("embiggen", '0.6.0'),
):
    assert get_distribution(_package).version == _pinned_version

In [2]:
import os

# Experiment label and S3 prefix.
exp_name = "80_20_kg_covid_19_20201012_training_test_epoch_500_delta_0.0001_updated_holdouts"
s3_path = "s3://kg-hub-public-data/embeddings/20201012/"  # keep trailing slash

# Local download locations.
base_dl_dir = "downloaded_data"
graph_data_dir = os.path.join(base_dl_dir, "kg-covid-19-20201012")
embedding_data_dir = os.path.join(base_dl_dir, "embeddings-20201012")

# graph stuff
# Pass the file name as its own component so os.path.join supplies the
# separator instead of a hand-written "/" glued onto the directory.
graph_out_file = os.path.join(graph_data_dir, "kg-covid-19.tar.gz")
nodes_file = os.path.join(graph_data_dir, "merged-kg_nodes.tsv")
edges_file = os.path.join(graph_data_dir, "merged-kg_edges.tsv")
sorted_edges_file = os.path.join(graph_data_dir, "merged-kg_edges_SORTED.tsv")
graph_tar_url = "https://kg-hub.berkeleybop.io/kg-covid-19/20201012/kg-covid-19.tar.gz"

# embeddings URLs
# URLs always use "/", so build them by plain concatenation rather than
# os.path.join, which would insert "\" on Windows.
base_kghub_url = "http://kg-hub.berkeleybop.io/"  # keep trailing slash
embeddings_url = base_kghub_url + "embeddings/20201012/SkipGram_80_20_kg_covid_19_20201012_training_test_epoch_500_delta_0.0001_embedding.npy"
embedding_file = os.path.join(embedding_data_dir, "SkipGram_embedding.npy")

# params
seed = 42
train_percentage = 0.8

In [3]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

## Loading the graphs
We load the kg-covid-19 graph from the repository as an undirected graph.

In [4]:
# get the graphs, if necessary
# The tarball is downloaded and unpacked only when the node or edge TSV is missing.

import os
import shutil
import subprocess
import urllib.request  # a bare `import urllib` does not load the `request` submodule

os.makedirs(graph_data_dir, exist_ok=True)

if not os.path.exists(nodes_file) or not os.path.exists(edges_file):
    # Stream the response to disk in chunks rather than holding the whole
    # archive in memory as a single bytes object.
    with urllib.request.urlopen(graph_tar_url) as response, \
            open(graph_out_file, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    # Argument list instead of a shell string, so paths are not re-parsed by a
    # shell; check=True raises if tar fails instead of silently continuing.
    subprocess.run(
        ["tar", "-xvzf", graph_out_file, "-C", graph_data_dir],
        check=True,
    )

#### This only needs to be done once, because from then on we load the sorted TSV once it has been created below

In [5]:
import pandas as pd

# Load the merged edge table. `sep` is passed by keyword because recent pandas
# releases no longer accept it positionally. `edges_file` is the same path the
# download cell checks for existence.
edges = pd.read_csv(edges_file, sep="\t")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Identifiers used below to pick out the edges to hold out: the exact
# subject/object value for SARS-CoV-2, and the substring that marks a
# ChEMBL compound identifier.
sars_cov_2_curie = "NCBITaxon:2697049"
chembl_prefix = "CHEMBL.COMPOUND"

In [7]:
# Boolean mask over the edge table: True where one endpoint contains the ChEMBL
# prefix and the other endpoint is exactly the SARS-CoV-2 node, in either
# direction. regex=False makes the match literal; as a regex the "." in
# "CHEMBL.COMPOUND" would match any character.
chembl_to_sars_cov_2_edges = (
    (edges.subject.str.contains(chembl_prefix, regex=False) & (edges.object == sars_cov_2_curie)) |
    (edges.object.str.contains(chembl_prefix, regex=False) & (edges.subject == sars_cov_2_curie))
)

In [8]:
# Label every edge row: index 0 -> ordinary edge, index 1 -> edge flagged by the mask.
_label_by_flag = ('normal', 'chembl_to_sars_cov_2')
edges['holdout_edge_label'] = [
    _label_by_flag[bool(flag)] for flag in chembl_to_sars_cov_2_edges
]

In [9]:
import os

# Write the labelled edge table next to the original graph files (tab-separated, no index column).
new_edge_file = os.path.join(graph_data_dir, 'edges_with_holdout_column.tsv')
edges.to_csv(path_or_buf=new_edge_file, index=False, sep="\t")

In [10]:
from ensmallen_graph import EnsmallenGraph

# Options describing the edge list: endpoints come from subject/object and the
# edge type from the holdout label column added above.
_edge_list_options = dict(
    edge_path=new_edge_file,
    sources_column="subject",
    destinations_column="object",
    edge_types_column='holdout_edge_label',
)
# Options describing the node list: node type comes from `category`, with
# 'biolink:NamedThing' passed as the default node type.
_node_list_options = dict(
    node_path=graph_data_dir + "/merged-kg_nodes.tsv",
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing',
)
graph = EnsmallenGraph.from_unsorted_csv(
    name="kg-covid-19",
    directed=False,
    **_edge_list_options,
    **_node_list_options,
)

In [11]:
graph

The undirected graph kg-covid-19 has 447766 nodes with 42 different node types:  the 5 most common are biolink:Publication (nodes number 129930), biolink:OntologyClass (nodes number 108266), biolink:Drug (nodes number 32120), biolink:ChemicalSubstance (nodes number 27157) and biolink:Disease (nodes number 24236), of which 8355 are singletons, and 15611957 unweighted edges with 2 different edge types: normal and chembl_to_sars_cov_2, of which 480 are self-loops. The graph is quite sparse as it has a density of 0.00016 and has 9107 connected components, where the component with most nodes has 435728 nodes and the component with the least nodes has 1 nodes. The graph median node degree is 4, the mean node degree is 69.73 and the node degree mode is 1. The top 5 most central nodes are MESH:D014780 (degree 90378), MESH:D006801 (degree 78249), WD:Q30 (degree 65223), MESH:D014777 (degree 54155) and MESH:D017934 (degree 45196).

In [12]:
# Two-step pruning: filter components by the holdout edge type, then drop singleton nodes.
# NOTE(review): presumably remove_components keeps only components containing
# an edge of the listed type — confirm against the ensmallen_graph docs.
reduced_graph = (
    graph
    .remove_components(edge_types=['chembl_to_sars_cov_2'])
    .remove(singletons=True)
)

In [13]:
reduced_graph

The undirected graph kg-covid-19 has 435728 nodes with 40 different node types:  the 5 most common are biolink:Publication (nodes number 129492), biolink:OntologyClass (nodes number 104951), biolink:Drug (nodes number 32016), biolink:ChemicalSubstance (nodes number 27152) and biolink:Disease (nodes number 22281) and 15608444 unweighted edges with 2 different edge types: normal and chembl_to_sars_cov_2, of which 314 are self-loops. The graph is quite sparse as it has a density of 0.00016 and is connected, as it has a single component. The graph median node degree is 4, the mean node degree is 71.64 and the node degree mode is 1. The top 5 most central nodes are MESH:D014780 (degree 90378), MESH:D006801 (degree 78249), WD:Q30 (degree 65223), MESH:D014777 (degree 54155) and MESH:D017934 (degree 45196).

## Creating the SkipGram model
We are going to set up the model to use multiple GPUs, if available.

In [14]:
from cache_decorator import Cache
from tensorflow.distribute import MirroredStrategy
from tensorflow.keras.optimizers import Nadam
from embiggen import SkipGram
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from embiggen import Node2VecSequence


@Cache(
    cache_path="{cache_dir}/SkipGram/{_hash}_{holdout_idx}.csv.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=['train_graph']
)
def compute_skipgram_embedding(
    train_graph: EnsmallenGraph,
    holdout_idx: int,
    walk_length: int = 100,
    batch_size: int = 2**9,
    iterations: int = 20,
    return_weight: float = 1.0,
    explore_weight: float = 1.0,
    embedding_size: int = 100,
    window_size: int = 4,
    negative_samples: int = 7,
    patience: int = 6,
    delta: float = 0.1,
    epochs: int = 500
):
    """Train a SkipGram model on walks over train_graph and return the node embedding.

    The result is cached by the decorator. `train_graph` is listed in
    `args_to_ignore`, and `holdout_idx` is part of the cache file name.

    Parameters
    ----------
    train_graph: EnsmallenGraph
        graph to learn the embedding from
    holdout_idx: int,
        an int to identify the holdout
    walk_length: int = 100,
        how many nodes for each walk
    batch_size: int = 2**9,
        how many walks for each batch
    iterations: int = 20,
        how many walks per node
    return_weight: float = 1.0,
        node2vec param, equal to 1/p
    explore_weight: float = 1.0,
        node2vec param, equal to 1/q
    embedding_size: int = 100,
        dimensions for embedding
    window_size: int = 4,
        SkipGram window size
    negative_samples: int = 7,
        how many negative samples that NCE function needs to sample
    patience: int = 6,
        how many epochs to wait for loss fxn to improve by [delta];
        the learning-rate reduction callback uses half of this value
    delta: float = 0.1,
        change in loss fxn to be considered an improvement
    epochs: int = 500,
        maximum number of training epochs passed to model.fit

    Returns
    -------
    pd.DataFrame containing an embedding for each node in train_graph
    """
    walks = Node2VecSequence(
        train_graph,
        walk_length=walk_length,
        batch_size=batch_size,
        iterations=iterations,
        window_size=window_size,
        return_weight=return_weight,
        explore_weight=explore_weight,
        support_mirror_strategy=True
    )

    # The model is constructed inside the MirroredStrategy scope.
    mirrored = MirroredStrategy()
    with mirrored.scope():
        model = SkipGram(
            vocabulary_size=train_graph.get_nodes_number(),
            embedding_size=embedding_size,
            window_size=window_size,
            negative_samples=negative_samples,
        )

    # Both callbacks watch the training loss with the same min_delta.
    stop_when_flat = EarlyStopping(
        "loss",
        min_delta=delta,
        patience=patience,
        restore_best_weights=True
    )
    shrink_lr_when_flat = ReduceLROnPlateau(
        monitor="loss",
        patience=patience//2,
        min_delta=delta
    )

    model.fit(
        walks,
        steps_per_epoch=walks.steps_per_epoch,
        epochs=epochs,
        callbacks=[stop_when_flat, shrink_lr_when_flat]
    )

    node_names = train_graph.get_nodes_reverse_mapping()
    return model.get_embedding_dataframe(node_names)

# Link prediction models

In [16]:
from typing import List, Dict, Tuple
import numpy as np

def create_ranking(edges: List[Tuple[str, str]], predictions: np.ndarray) -> Dict:
    """Pair each edge with its prediction.

    Parameters
    ---------------------
    edges: List[Tuple[str, str]],
        Edges to be predicted.
    predictions: np.ndarray,
        Predictions of the model, in the same order as `edges`.

    Returns
    ---------------------
    Dictionary mapping every edge to the prediction at the same position.
    Pairing stops at the shorter of the two inputs, and a repeated edge keeps
    its last prediction.
    """
    return {edge: prediction for edge, prediction in zip(edges, predictions)}

In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.metrics import AUC, Recall, Precision
from typing import Dict

# The path template carries the {cache_dir} placeholder, matching decision_tree
# and logistic_regression, so that cache_dir=embedding_data_dir is applied.
@Cache(
    cache_path="{cache_dir}/model_predictions/mlp/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges", "batch_size"]
)
def mlp(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]],
    epochs: int = 500,
    batch_size: int = 256,
    patience: int = 10,
    min_delta: float = 0.000001,
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Return multi-layer perceptron predictions on the given values.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.
    epochs: int = 500,
        Maximum number of training epochs.
    batch_size: int = 256,
        Batch size passed to model.fit.
    patience: int = 10,
        Epochs without improvement of the training loss before stopping.
    min_delta: float = 0.000001,
        Minimum change of the training loss counted as an improvement.

    Returns
    ----------------------
    Tuple with training predictions, test predictions and the ranking
    dictionary built by create_ranking from `edges` and the predictions on
    `graph_test_X`.
    """
    model = Sequential([
        Input(train_X.shape[1:]),
        Dense(64, activation="relu"),
        Dense(32, activation="relu",
              activity_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.3),
        Dense(8, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=[
            AUC(curve="PR", name="auprc"),
            AUC(curve="ROC", name="auroc"),
            Recall(name="Recall"),
            Precision(name="Precision"),
            "accuracy"
        ]
    )
    model.fit(
        train_X, train_y,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[
            # Early stopping watches the training loss; no validation data is passed to fit.
            EarlyStopping("loss", patience=patience, min_delta=min_delta)
        ]
    )
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    return train_pred, test_pred, create_ranking(
        edges, model.predict(graph_test_X)
    )

In [18]:

"""Submodule offering method to execute cached random forest."""
from typing import Tuple, Dict, List
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import cpu_count
from cache_decorator import Cache

# The path template carries the {cache_dir} placeholder, matching decision_tree
# and logistic_regression, so that cache_dir=embedding_data_dir is applied.
@Cache(
    cache_path="{cache_dir}/model_predictions/random_forest/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def random_forest(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Return random forest predictions on the given values.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with training predictions, test predictions and the ranking
    dictionary built by create_ranking from `edges` and the predictions on
    `graph_test_X`. Each prediction is column 1 of predict_proba.
    """
    model = RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        n_jobs=cpu_count(),
        class_weight="balanced_subsample",
        max_samples=0.5
    )
    model.fit(train_X, train_y)
    train_pred = model.predict_proba(train_X)[:, 1]
    test_pred = model.predict_proba(test_X)[:, 1]
    return train_pred, test_pred, create_ranking(edges, model.predict_proba(graph_test_X)[:, 1])

In [19]:
from typing import Tuple, Dict, List
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from cache_decorator import Cache

@Cache(
    cache_path="{cache_dir}/model_predictions/decision_tree/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def decision_tree(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Fit a depth-limited decision tree and score the train, test and ranking inputs.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with training predictions, test predictions and the ranking
    dictionary built by create_ranking from `edges` and the predictions on
    `graph_test_X`.
    """
    tree = DecisionTreeClassifier(class_weight="balanced", max_depth=10)
    tree.fit(train_X, train_y)

    def second_class_probability(features):
        # Column 1 of predict_proba, i.e. the probability of the second class.
        return tree.predict_proba(features)[:, 1]

    train_pred = second_class_probability(train_X)
    test_pred = second_class_probability(test_X)
    ranking = create_ranking(edges, second_class_probability(graph_test_X))
    return train_pred, test_pred, ranking


In [42]:
from typing import Tuple, Dict, List
import numpy as np
from sklearn.linear_model import LogisticRegression
from cache_decorator import Cache

@Cache(
    cache_path="{cache_dir}/model_predictions/logistic_regression/{embedding_model}/{holdout}_{_hash}.pkl.gz",
    cache_dir=embedding_data_dir,
    args_to_ignore=["train_X", "train_y", "test_X", "graph_test_X", "edges"]
)
def logistic_regression(
    holdout: int,
    edge_embedding_method: str,
    embedding_model: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    test_X: np.ndarray,
    graph_test_X: np.ndarray,
    edges: List[Tuple[str, str]]
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """Return logistic regression predictions on the given values.

    Parameters
    ----------------------
    holdout: int,
        Number of the holdout.
    edge_embedding_method: str,
        Name of the edge embedding model.
    embedding_model: str,
        Name of the embedding model.
    train_X: np.ndarray,
        Data to use as input for training process.
    train_y: np.ndarray,
        Data to use as output for the training process.
    test_X: np.ndarray,
        Data to use as input for the testing process.
    graph_test_X: np.ndarray,
        Data to use for the ranking of the edges.
    edges: List[Tuple[str, str]],
        Edge names to be ranked.

    Returns
    ----------------------
    Tuple with training predictions, test predictions and the ranking
    dictionary built by create_ranking (not the fitted model). Each
    prediction is column 1 of predict_proba.
    """
    model = LogisticRegression(class_weight="balanced", max_iter=1000)
    model.fit(train_X, train_y)
    train_pred = model.predict_proba(train_X)[:, 1]
    test_pred = model.predict_proba(test_X)[:, 1]
    return train_pred, test_pred, create_ranking(edges, model.predict_proba(graph_test_X)[:, 1])


In [21]:
# Translate the node type names of interest into their positions in the
# graph's node-type reverse mapping.
nodes_of_interest = ['biolink:Drug', 'biolink:ChemicalSubstance']
_node_type_names = reduced_graph.get_node_types_reverse_mapping()
node_idx = list(map(_node_type_names.index, nodes_of_interest))

In [22]:
# Collect the names of all nodes whose node type is one of node_idx.
allow_set = set()
for _node_id in range(reduced_graph.get_nodes_number()):
    if reduced_graph.get_node_type(_node_id) in node_idx:
        allow_set.add(reduced_graph.get_node_name(_node_id))

In [23]:
# Derive a graph from reduced_graph using allow_set, the names of the nodes
# whose type is in node_idx.
# NOTE(review): presumably `allow_nodes_set` keeps only the listed nodes — confirm against the ensmallen_graph docs.
graph_drugs_only = reduced_graph.remove(allow_nodes_set=allow_set)

In [39]:
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, recall_score, precision_score
from sanitize_ml_labels import sanitize_ml_labels
from typing import Dict
from sklearn.metrics import confusion_matrix


def specificity_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the specificity (true-negative rate), TN / (TN + FP).

    Parameters
    ----------------------
    y_true: np.ndarray,
        Ground-truth binary labels (0 or 1).
    y_pred: np.ndarray,
        Predicted binary labels (0 or 1).

    Returns
    ----------------------
    The fraction of true negatives among all actual negatives, or NaN when
    `y_true` contains no negatives.
    """
    # Fixing the label set keeps the matrix 2x2 even when only one class is
    # present in the inputs; otherwise the four-way unpack raises ValueError.
    tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        # Specificity is undefined without negatives; avoid the 0/0 warning.
        return float("nan")
    return tn / negatives

def get_metrics_report(y_true: np.ndarray, y_pred: np.ndarray)->Dict[str, float]:
    """Compute thresholded and score-based metrics for one set of predictions.

    Parameters
    ----------------------
    y_true: np.ndarray,
        Ground-truth labels.
    y_pred: np.ndarray,
        Predicted scores; they are rounded to integers for the metrics that
        need hard labels.

    Returns
    ----------------------
    Dictionary from the sanitized metric name to the metric value, with the
    hard-label metrics first and the score-based metrics after them.
    """
    hard_predictions = y_pred.round().astype(int)
    report = {}

    # Metrics evaluated on the rounded predictions.
    for metric in (accuracy_score, recall_score, precision_score, specificity_score):
        report[sanitize_ml_labels(metric.__name__)] = metric(y_true, hard_predictions)

    # Metrics evaluated on the raw scores.
    for metric in (average_precision_score, roc_auc_score):
        report[sanitize_ml_labels(metric.__name__)] = metric(y_true, y_pred)

    return report


In [48]:
# make holdouts
# For each holdout: split positive and negative edges, learn a SkipGram node
# embedding on the positive training graph, then train and score every
# link-prediction model once per edge-embedding method.
from tqdm.auto import trange, tqdm
from embiggen import EdgeTransformer, GraphTransformer, LinkPredictionTransformer

# Models run for every (holdout, edge-embedding method) combination.
link_prediction_models = [
    mlp, 
    logistic_regression,
    random_forest,
    decision_tree
]

# rankings[edge_embedding_method][model] -> list with one ranking dict per holdout.
# NOTE: the inner key is the model function object itself, while `results`
# stores the model's __name__.
rankings = {}
# One dict per (holdout, edge-embedding method, model, train/test run) with its metrics.
results = []

# make negative graph
# Requests ten negatives per edge flagged in chembl_to_sars_cov_2_edges.
# NOTE(review): presumably seed_graph restricts sampling to the nodes of
# graph_drugs_only — confirm against the ensmallen_graph docs.
neg_graph = reduced_graph.sample_negatives(
            seed_graph=graph_drugs_only,
            random_state=seed,
            only_from_same_component=True,
            negatives_number=chembl_to_sars_cov_2_edges.sum() * 10
        )

for holdout in trange(5, desc="computing embeddings"): # TODO: increase holdout to 15-20

    # Positive split; random_state is offset by the holdout index so every
    # holdout uses a different seed.
    pos_training, pos_validation = reduced_graph.connected_holdout(
        train_size=train_percentage, 
        edge_types=['chembl_to_sars_cov_2'],
        random_state=seed + holdout)

    # Negative split with the same train fraction and per-holdout seed.
    neg_training, neg_validation = neg_graph.random_holdout(
        random_state=seed + holdout, train_size=train_percentage
    )

    pos_training.enable_fast_walk()
    # Node embedding learned from the positive training graph only.
    embedding = compute_skipgram_embedding(pos_training, holdout)

    for edge_embedding_method in tqdm(EdgeTransformer.methods,
                                      desc="edge embeddings",
                                      leave=False):
        # Create the per-method bucket on first use, keep it on later holdouts.
        rankings[edge_embedding_method] = rankings.get(edge_embedding_method, {})

        # Edge embeddings of the validation positives; passed to each model as
        # graph_test_X, i.e. the edges to be ranked.
        graph_transformer = GraphTransformer(method=edge_embedding_method)
        graph_transformer.fit(embedding)
        graph_test_X = graph_transformer.transform(pos_validation)

        # Labelled train/test inputs built from positive and negative edges.
        # Training positives are filtered to the 'chembl_to_sars_cov_2' edge type.
        transformer = LinkPredictionTransformer(method=edge_embedding_method)
        transformer.fit(embedding)
        train_X, train_y = transformer.transform(
            pos_training.filter(edge_types=['chembl_to_sars_cov_2']), 
            neg_training
        )
        test_X, test_y = transformer.transform(pos_validation, neg_validation)

        for link_prediction_model in tqdm(link_prediction_models, desc="models", leave=False):
            # Create the per-model list on first use, keep it on later holdouts.
            rankings[edge_embedding_method][link_prediction_model] = rankings[edge_embedding_method].get(link_prediction_model, [])
            train_pred, test_pred, link_prediction_ranking = link_prediction_model(
                holdout=holdout,
                edge_embedding_method=edge_embedding_method,
                embedding_model='SkipGram',
                train_X=train_X,
                train_y=train_y,
                test_X=test_X,
                graph_test_X=graph_test_X,
                # (source name, destination name) pairs of the validation positives.
                edges=list(zip(
                    pos_validation.get_source_names(),
                    pos_validation.get_destination_names(),
                ))
            )
            rankings[edge_embedding_method][link_prediction_model].append(link_prediction_ranking)

            # Record one metrics row for the training run and one for the test run.
            for run, true, predictions in (
                ("train", train_y, train_pred),
                ("test", test_y, test_pred)
            ):
                results.append({
                    "run": run,
                    "model": link_prediction_model.__name__,
                    "edge_embedding_method": edge_embedding_method,
                    "holdout_number": holdout,
                    **get_metrics_report(true, predictions)
                })

HBox(children=(FloatProgress(value=0.0, description='computing embeddings', max=5.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='edge embeddings', max=4.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500


HBox(children=(FloatProgress(value=0.0, description='edge embeddings', max=4.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500


HBox(children=(FloatProgress(value=0.0, description='edge embeddings', max=4.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500


Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500


Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500


HBox(children=(FloatProgress(value=0.0, description='edge embeddings', max=4.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500


HBox(children=(FloatProgress(value=0.0, description='edge embeddings', max=4.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500


Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500


HBox(children=(FloatProgress(value=0.0, description='models', max=4.0, style=ProgressStyle(description_width='…

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500



## Upload the weights and embeddings

In [52]:
results_pd

Unnamed: 0,run,model,edge_embedding_method,holdout_number,Accuracy,Recall,Precision,Specificity score,AUPRC,AUROC
0,train,mlp_cached,hadamard,0,0.999909,1.000000,0.999457,0.999891,0.999961,0.999993
1,test,mlp_cached,hadamard,0,0.987681,0.981159,0.946853,0.988986,0.975012,0.996951
2,train,logistic_regression_cached,hadamard,0,0.866818,0.493659,0.627736,0.941449,0.532881,0.577186
3,test,logistic_regression_cached,hadamard,0,0.865942,0.499275,0.621841,0.939275,0.556214,0.578182
4,train,random_forest_cached,hadamard,0,0.983303,1.000000,0.908941,0.979964,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...
155,test,logistic_regression_cached,weightedL2,4,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
156,train,random_forest_cached,weightedL2,4,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
157,test,random_forest_cached,weightedL2,4,0.999034,0.994203,1.000000,1.000000,0.999998,1.000000
158,train,decision_tree_cached,weightedL2,4,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [70]:
rankings

{'hadamard': {<function __main__.mlp(holdout: int, edge_embedding_method: str, embedding_model: str, train_X: numpy.ndarray, train_y: numpy.ndarray, test_X: numpy.ndarray, graph_test_X: numpy.ndarray, edges: List[Tuple[str, str]], epochs: int = 500, batch_size: int = 256, patience: int = 10, min_delta: float = 1e-06) -> Tuple[numpy.ndarray, numpy.ndarray, Dict]>: [{('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL1200692'): array([1.], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL365739'): array([0.99999464], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL113'): array([1.], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL231779'): array([0.9999176], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL529'): array([1.], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL600'): array([0.99999976], dtype=float32),
    ('NCBITaxon:2697049',
     'CHEMBL.COMPOUND:CHEMBL389621'): array

In [69]:
from barplots import barplots

# Turn the evaluation records accumulated in `results` into a DataFrame so they
# can be handed to the plotting call below.
# NOTE(review): `results` is built in an earlier cell; presumably one record per
# (run, model, edge_embedding_method, holdout_number) with the metric values —
# confirm against that cell.
results_pd = pd.DataFrame(results)


def sort_bars(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* ordered for plotting.

    Rows are sorted by the ``model`` column in ascending order; ties are
    broken by the ``run`` column in descending order (so ``"train"`` comes
    before ``"test"`` within each model). The input frame is not modified.
    """
    sort_keys = ["model", "run"]
    # One direction flag per key: model A->Z, run Z->A.
    directions = [True, False]
    ordered = df.sort_values(by=sort_keys, ascending=directions)
    return ordered

# Label overrides passed to `barplots` via its `custom_defaults` argument.
# The values are the raw identifiers found in the `model` column of the results;
# presumably each key is the display label that should replace the matching
# value in the rendered plots — confirm against the barplots documentation.
custom_defaults={
    "Decision Tree": "decision_tree_cached",
    "Logistic Regression": "logistic_regression_cached",
    "Random Forest": "random_forest_cached"
}

# Render the bar plots, grouping bars by edge-embedding method, model and
# train/test run, with bars ordered by `sort_bars`.
# `holdout_number` is dropped before plotting — presumably so it is not treated
# as a metric and results are aggregated across holdouts (TODO confirm).
figures = barplots(
    results_pd.drop(columns=["holdout_number"]),
    groupby=["edge_embedding_method",  "model", "run"],
    sort_bars=sort_bars,
    subplots=True,
    show_legend=False,
    orientation="horizontal",
    # Optional layout overrides, currently disabled (library defaults apply):
    # height=4,
    # bar_width=0.15,
    # space_width=0.1,
    # legend_position="lower right",
    custom_defaults=custom_defaults,
    use_multiprocessing=True,
)

HBox(children=(FloatProgress(value=0.0, description='Rendering barplots', layout=Layout(flex='2'), max=6.0, st…




In [None]:
from glob import glob
import subprocess

# Upload every gzipped CSV found (recursively) under `embedding_data_dir` to
# the public S3 location `s3_path`, one `s3cmd put` invocation per file.
# Raises FileNotFoundError if the `s3cmd` executable is not installed.
for path in glob(f"{embedding_data_dir}/**/*.csv.gz", recursive=True):
    # Pass the arguments as a list (no shell involved) so that paths containing
    # spaces or shell metacharacters reach s3cmd verbatim.
    upload = subprocess.run(
        ["s3cmd", "put", "--acl-public", "--cf-invalidate", path, s3_path]
    )
    # Report failed uploads instead of silently discarding the exit status,
    # but keep going so one bad file does not block the remaining uploads.
    if upload.returncode != 0:
        print(f"WARNING: s3cmd failed for {path} (exit status {upload.returncode})")