In [1]:
#  this Notebook test how to perform an ablation study with pykeen to find the perfect model, loss, optimizer, and hyperparameters
# To do so, we must use the metrics from pykeen and split out graph data in train/test/validation set.
# if the best parameters where filtered out, test those in the start_all.sh setting to see how it actual performs
# They may be is a difference in the evaluation progress, because pykeen focuses mostly on link-prediction task..

import pykeen
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
import torch
import torch.nn
import argparse
from pykeen.regularizers import Regularizer, regularizer_resolver
from pykeen.utils import resolve_device
from pykeen.ablation import ablation_pipeline

#import all models and parameter
from pykeen.models  import ConvE, TransE, ComplEx, MuRE, RotatE, TuckER, DistMult, RESCAL, NodePiece
from pykeen.training import SLCWATrainingLoop
from pykeen.losses import BCEWithLogitsLoss, SoftplusLoss, NSSALoss, SoftMarginRankingLoss, PairwiseLogisticLoss
from pykeen.datasets import Nations


In [2]:
# Load the connected Node list and split the dataset, save it in .txt form in a specific file

triple_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/triples/triple_list_graph_full_v3.feather'
tripleAr = pd.read_feather(triple_path).to_numpy()
print(tripleAr)
# shuffle array and split array, save as txt file
np.random.shuffle(tripleAr)
print(tripleAr[0], type(tripleAr))
np.random.shuffle(tripleAr)
print(tripleAr[0], len(tripleAr))


[['OMOP_21600001' '0' 'OMOP_21600509']
 ['OMOP_21600001' '0' 'OMOP_21600531']
 ['OMOP_21600001' '0' 'OMOP_21600697']
 ...
 ['OT_R-HSA-9694676' '205' 'OT_ENSG00000164695']
 ['OT_R-HSA-9694676' '205' 'OT_ENSG00000126581']
 ['OT_R-HSA-9694676' '205' 'OT_ENSG00000147457']]
['OT_ENSG00000135002' '200' 'OT_ENSG00000159399'] <class 'numpy.ndarray'>
['OMOP_36703920' '0' 'OMOP_36691270'] 24225252


In [3]:
# calculate the fractions
"""t1 = round(len(tripleAr) *0.8)
t2 = round(len(tripleAr) *0.1)
t3 = round(len(tripleAr) *0.1)
print(t1, t2, t3, t1+t2+t3)
train, test, validate = tripleAr[:t1], tripleAr[t1:t1+t2], tripleAr[t1+t2:]
print(len(train)+len(test)+len(validate), len(train), len(test), len(validate))

# save fractions as .txt file 
print(validate, type(validate))
np.savetxt("/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/train.txt", train, delimiter=';', fmt='%s')
np.savetxt("/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/test.txt", test, delimiter=';' , fmt='%s')
np.savetxt("/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/validate.txt", validate, delimiter=';' , fmt='%s')
"""

19380202 2422525 2422525 24225252
24225252 19380202 2422525 2422525
[['OMOP_199067' '128' 'phecode_614']
 ['OMOP_46063965' '1' 'OMOP_37209200']
 ['OT_ENSG00000206503' '50' 'OMOP_4047650']
 ...
 ['OMOP_4283942' '50' 'OT_ENSG00000140505']
 ['OT_ENSG00000175336' '200' 'OT_ENSG00000147262']
 ['OT_ENSG00000121858' '50' 'OMOP_4181483']] <class 'numpy.ndarray'>


In [2]:
# SAVE SPLITED PATH WITH PYKEENN !!!

#triple_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/triples/triple_list_graph_full_v3.feather'
random_triple_graph_path= "/home/tilingl/Pykeen/Triple_Lists/random_ego_graph2.feather" # entities: 186355 relations: 59 ; 26.6% von 700300
py_tripleAr = pd.read_feather(random_triple_graph_path).to_numpy()
tf = TriplesFactory.from_labeled_triples(py_tripleAr, create_inverse_triples=False)
print(tf.num_entities, tf.num_relations)

training_tf, testing_tf, validation_tf = tf.split([0.8,0.1,0.1], random_state=torch.Generator(device='cpu'))

testing_tf.to_path_binary('/home/tilingl/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/test')
training_tf.to_path_binary('/home/tilingl/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/train')
validation_tf.to_path_binary('/home/tilingl/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/val')


57367 112


PosixPath('/home/tilingl/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/val')

In [10]:
val = Nations.validation
test = Nations.testing

In [5]:
# STARTING THE ABLATION STUDY #

directory = "/home/tilingl/Pykeen/New_Embedding_Stuff/ablation/TransE/"
# define HPO ranges
model_to_model_kwargs_ranges = {
    "TransE": {
        "embedding_dim": {
            "type": "int",
            "low": 8,
            "high": 11,
            "scale": "power_two"
        }
    }    
}

model_to_training_loop_to_training_kwargs = {
    "TransE": {
        "slcwa": {
            "num_epochs": 50
        }
    }
}

model_to_training_loop_to_training_kwargs_ranges= {
   "TransE": {
       "slcwa": {
           "label_smoothing": {
               "type": "float",
               "low": 0.001,
              "high": 1.0,
               "scale": "log"
           },
           "batch_size": {
               "type": "int",
               "low": 7,
               "high": 15,
               "scale": "power_two"
           }
       }
   }
}

model_to_optimizer_to_optimizer_kwargs_ranges= {
   "TransE": {
       "adam": {
           "lr": {
               "type": "float",
               "low": 0.001,
               "high": 0.1,
               "scale": "log"
           }
       }
   }
}


ablation_pipeline(
    directory=directory,
    metadata = dict(title="Ablation Study Over Graph Data for TransE."),
    models=["TransE"],
    datasets=["Nations"], # filled with GraphData binarys
    losses=["BCEWithLogitsLoss", "MarginRankingLoss", "SoftplusLoss", "NSSALoss", "SoftMarginRankingLoss", "PairwiseLogisticLoss"],
    training_loops=["SLCWA"],
    optimizers=["Adam"],
    model_to_model_kwargs_ranges=model_to_model_kwargs_ranges,
    model_to_training_loop_to_training_kwargs=model_to_training_loop_to_training_kwargs,
    model_to_optimizer_to_optimizer_kwargs_ranges=model_to_optimizer_to_optimizer_kwargs_ranges,
    create_inverse_triples=[True, False],
    stopper="early",
    stopper_kwargs={
        
        "frequency": 2,
        "patience": 10,
        "relative_delta": 0.002,
        "metric": "hits@10",
    },
    evaluator="RankBasedEvaluator",
    evaluator_kwargs={
        "slice_size":128,
        "filtered":True,# due to KGE comparison paper
    },
    # Optuna-related arguments
    n_trials=5,
    timeout=300,
    metric="hits@10",
    direction="maximize",
    sampler="random",
    pruner= "nop",
    best_replicates=3,

)


[32m[I 2022-05-17 15:51:11,130][0m A new study created in RDB with name: no-name-9c1b8508-c1ec-422d-ad8e-e6a9c5318b8f[0m
No random seed is specified. Setting to 3800767416.
No cuda devices were available. The model runs on CPU
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on cpu:   0%|          | 0/400 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training batches on cpu:   0%|          | 0/14382 [00:00<?, ?batch/s]

KeyboardInterrupt: 