In [1]:
!pip install grape

Collecting grape
  Using cached grape-0.2.2.tar.gz (9.2 kB)
  Preparing metadata (setup.py) ... done
Collecting downloaders
  Downloading downloaders-1.0.20.tar.gz (10 kB)
  Preparing metadata (setup.py) ... done
Collecting bioregistry
  Using cached bioregistry-0.10.66-py3-none-any.whl (10.4 MB)
Collecting py-cpuinfo
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting ensmallen>=0.8.64
  Downloading ensmallen-0.8.86-cp37-abi3-macosx_10_12_x86_64.whl (29.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29.2/29.2 MB 18.3 MB/s eta 0:00:00
Collecting embiggen>=0.11.61
  Using cached embiggen-0.11.83.tar.gz (240 kB)
  Preparing metadata (setup.py) ... done
Collecting numpy
  Downloading numpy-1.26.1-cp311-cp311-macosx_10_9_x86_64.whl (20.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.6/20.6 MB 20.6 MB/s eta 0:00

In [1]:
from grape.datasets.kghub import KGIDG
import pandas as pd
from grape.edge_prediction import edge_prediction_evaluation
from grape.edge_prediction import PerceptronEdgePrediction
from grape.embedders import FirstOrderLINEEnsmallen, SecondOrderLINEEnsmallen
from grape.embedders import DeepWalkCBOWEnsmallen, DeepWalkSkipGramEnsmallen
from grape.embedders import WalkletsCBOWEnsmallen, WalkletsSkipGramEnsmallen

The library will use a slower but more compatible version (Intel Core2 2006).


In [2]:
# Node types used to identify drug -> protein edges, and the edge type
# name those edges are relabelled with.
minority_edge_type = 'minority_edge'
protein_types = ["biolink:Protein"]
drug_types = ["biolink:ChemicalSubstance", "biolink:ChemicalEntity", "biolink:Drug"]

# Retrieve the KG-IDG graph at the pinned release.
g = KGIDG(version='20230601')

# Restrict the graph to its top component, then strip dendritic trees.
# NOTE(review): presumably `top_k_components=1` keeps only the largest
# connected component -- confirm against the GRAPE documentation.
main_component = g.remove_components(top_k_components=1)
dense_main_component = main_component.remove_dendritic_trees()

# Rename, in place, the edge type of every edge selected by the given
# source / destination node types to `minority_edge_type`.
dense_main_component.replace_edge_type_name_from_edge_node_type_names_inplace(
    destination_node_type_names=protein_types,
    source_node_type_names=drug_types,
    edge_type_name=minority_edge_type,
)

# Display the resulting graph report as the cell output.
dense_main_component

In [None]:
# --- Experiment configuration ------------------------------------------------
# Set SMOKE_TEST to True for a quick test run; keep it False when you need to
# run the real thing.
# TODO(author): double-check that the reported results were produced with
# SMOKE_TEST = False (the original note says this was most likely the case).
SMOKE_TEST = False
NUMBER_OF_HOLDOUTS = 10
VALIDATION_UNBALANCE_RATES = (1.0, )
TRAIN_SIZES = (0.75,)

# Subgraph containing only the edges carrying the minority edge type; it is
# passed to the evaluation as the subgraph of interest.
subgraph = dense_main_component.filter_from_names(
    edge_type_names_to_keep=[minority_edge_type]
)

# --- Evaluation loop ---------------------------------------------------------
# One report is collected per (train size, validation sampling, embedding
# model, embedding parameter set) combination and concatenated at the end.
evaluation_reports = []
for train_size in TRAIN_SIZES:
    for validation_use_scale_free in (True, False):
        for ModelClass in [
                FirstOrderLINEEnsmallen, SecondOrderLINEEnsmallen,
                DeepWalkCBOWEnsmallen, DeepWalkSkipGramEnsmallen,
                WalkletsCBOWEnsmallen, WalkletsSkipGramEnsmallen,
            ]:
            # If the embedding method involves edge sampling, we train a run
            # using the scale free and one using the uniform.
            if "use_scale_free_distribution" in ModelClass().parameters():
                parameter_sets = [
                    dict(use_scale_free_distribution=True),
                    dict(use_scale_free_distribution=False),
                ]
            else:
                # The model exposes no such parameter: run it once with its
                # default parameters.
                parameter_sets = [dict()]
            print(f"train size: {train_size}; validation_use_scale_free: {validation_use_scale_free} ModelClass {ModelClass}")
            for parameter_set in parameter_sets:
                evaluation_reports.append(edge_prediction_evaluation(
                    smoke_test=SMOKE_TEST,
                    holdouts_kwargs=dict(
                        train_size=train_size,
                        edge_types=[minority_edge_type],
                    ),
                    evaluation_schema="Connected Monte Carlo",
                    node_features=ModelClass(**parameter_set),
                    graphs=dense_main_component,
                    # Two perceptrons are evaluated per run: one created with
                    # use_scale_free_distribution=True and one with False.
                    models=[
                        PerceptronEdgePrediction(
                            edge_features=None,
                            edge_embeddings="Hadamard",
                            number_of_edges_per_mini_batch=32,
                            use_scale_free_distribution=use_scale_free_distribution
                        )
                        for use_scale_free_distribution in (True, False)
                    ],
                    enable_cache=True,
                    number_of_holdouts=NUMBER_OF_HOLDOUTS,
                    use_scale_free_distribution=validation_use_scale_free,
                    validation_unbalance_rates=VALIDATION_UNBALANCE_RATES,
                    subgraph_of_interest=subgraph,
                    use_subgraph_as_support=False
                ))

# Merge every report into a single table and persist it as a TSV file.
results = pd.concat(evaluation_reports)
results.to_csv("kg_idg_negative_select.tsv", sep="\t")

train size: 0.75; validation_use_scale_free: True ModelClass <class 'embiggen.embedders.ensmallen_embedders.first_order_line.FirstOrderLINEEnsmallen'>


Evaluating on KGIDG:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]