In [2]:
# script to train standard models, extract and save embeddings via Hyperparameter argparser input
# should enable a better automatic workflow for starting multiple runs with only one script
# using Pykeen==1.8.0 (slightly different functions to pykeen==1.7.0)
# needs triple_list_graph_full_v3.feather for faster loading of the "graph"
# --> /sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/ triples
# corresponding python script: auto_generate_embs.py

# TODO:

import pykeen
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
import torch
import argparse
from pykeen.regularizers import Regularizer, regularizer_resolver
from pykeen.utils import resolve_device

#import all models and parameter
from pykeen.models  import ConvE, TransE, ComplEx, MuRE, RotatE, TuckER
from pykeen.training import LCWATrainingLoop, SLCWATrainingLoop
from pykeen.losses import BCEWithLogitsLoss

# testing:
from pykeen.datasets import Nations


In [8]:
# helper function to return the classes and not argparsed strings
def help_model(m_str):
    if m_str == "ConvE":
        return ConvE
    if m_str == "TransE":
        return TransE
    if m_str == "ComplEx":
        return ComplEx
    if m_str == "MuRE":
        return MuRE
    if m_str == "RotatE":
        return RotatE
    if m_str == "TuckER":
        return TuckER
    else:
        return "unknown, add Model to help_model func!"

def help_loss(l_str):
    if l_str is None:
        return None

def help_loop(loop_str):
    if loop_str == "SLCWATrainingLoop":
        return SLCWATrainingLoop
    else:
        return LCWATrainingLoop
    


In [3]:
# function to start the embedding training with all variables and parameters
def start_emb_train(triple_path, inverse, Model,
                    emb_dim, Training_loop,
                    check_name: str, emb_dict_name: str,
                    batch_s=None, sub_batch=None, 
                    slice_s=None, Loss=None,):
    
    # make sure, that gpu is avaiable and chosen 
    device = 'cpu' ## 'gpu'
    _device: torch.device = resolve_device(device)
    print(f"Using device: {device}", type(_device))
    
    # generate Triples Factory
    ##tripleArray = pd.read_feather(triple_path).to_numpy()
    ##print('length of the triple array: ', len(tripleArray), type(tripleArray))
    ##tf = TriplesFactory.from_labeled_triples(tripleArray, create_inverse_triples=inverse)
    dataset = Nations()
    tf = dataset.training
    
    print('loading TriplesFactory done ... ', type(tf))
    print(tf.num_entities, tf.num_relations)    # old=511291 338 oldest= 511291 326
    
    #pick a Model that was imported
    #choose a loss Class that was imported, default =None
    kwargs={'triples_factory': tf, 'loss': Loss, 'predict_with_sigmoid':False}
    model = Model(**kwargs, embedding_dim=emb_dim, random_seed=420)
    model= model.to(_device) # important otherwise fall back to cpu
    
    # Pick an optimizer from Torch
    from torch.optim import Adam
    optimizer = Adam(params=model.get_grad_params())
    
    # Pick a training approach that was imported !! contains the losses, choose between SLCWATrainingLoop and LCWATrainingLoop
    training_loop = Training_loop(

        model=model,
        triples_factory=tf,
        optimizer=optimizer,
        automatic_memory_optimization=True, #default =True
    )
    
    
    # just run for one epoch, evaluate losses and restart training where it was left
    for i in range(1,100):
        # Train like Cristiano Ronaldo
        losses = training_loop.train(
            triples_factory=tf,
            num_epochs=i,
            batch_size=batch_s, #256, # if None -> automatic search for the best and greatest
            checkpoint_name= check_name, # for example TransE_t2.pt
            checkpoint_frequency=0, 
            
            sub_batch_size=sub_batch, # not for SLCWA and not supported bc of batch normalization!!
            slice_size=slice_s, # not for SLCWA

        )
        if i>1 and (losses[-2] - losses[-1]) < 1e-7: # changed 1e-6 to 1e-7
            
            #TODO: function to obtain embeddings
            #switch model back to cpu device:
            _device = torch.device('cpu') # 
            model.to(_device)
            #do not need anymore: entity_RepModel = model.entity_representations[0] # check for more representations
            try:
                print(model.entity_representations[1])
            except IndexError:
                print('\n','Index Error, no more entity_reps ', '\n')
            # acces entity_values, mapp them to list and pass the list to the entity_repModel to recieve Embeddings. Next, create embedding_dict and transform to DataFrame
            ##nodes = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/connected_nodes_list.feather')
            print(tf.entity_to_id, type(tf.entity_to_id),'e_ids', tf.entity_ids, tf.entity_to_id.keys())
            entity_id_dict = tf.entity_to_id
            entity_ids_as_tensors = torch.as_tensor([int(v) for v in entity_id_dict.values()], dtype=torch.long, device=_device)
            
            # casting node Names from list into equivalent ids(indices) as a torch.LongTensor on CPU --> bc model is also cpu
            ##entity_ids = torch.as_tensor(tf.entities_to_ids(nodes['nodes']), dtype=torch.long, device=_device) # .view()?
            
            #all embeddings as a numpy.ndarray, indices as torch.LongTensor
            entity_embeddings = model.entity_representations[0](indices=entity_ids_as_tensors).detach().numpy() # detach tensor 
            
            # do not need anymore : embeddings = entity_RepModel(entity_ids) 
            df_dict = pd.DataFrame(dict(nodes= entity_id_dict.keys(), embeddings=list(entity_embeddings)))
            print(df_dict.head())
            # save embedding dict
            emb_path = '/home/tilingl/Pykeen/New_Embedding_Stuff/Embeddings/Embedding_dict_' + emb_dict_name + '.feather'
            print('saved in: ',emb_path)
            df_dict.to_feather(emb_path)
          
            #end the loop
            break
            

In [4]:

print("Script for creating TEST embeddings")

from pykeen.training import LCWATrainingLoop, SLCWATrainingLoop
from pykeen.losses import BCEWithLogitsLoss

triple_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/triples/triple_list_graph_full_v3.feather'
    # sub_batching and slicing only for SLCWA
    # triple_path, inverse, Model, emb_dim, Training_loop, check_name: str, embedding_dict Name: str, batch_size, sub_batch_size, slice_size,  Loss=None
#start_emb_train(triple_path, True, TransE, 1024, SLCWATrainingLoop, 'TEST1.pt', 'TEST_full_v3', batch_s=None, sub_batch=None, slice_s=None, Loss=None)


Script for creating TEST embeddings


In [7]:
# argparser

# argparser for all hyperparameter Options
parser = argparse.ArgumentParser()
#parser.add_argument('triple_path', help="declare the path to the triple.feather file", type=str)
parser.add_argument('inverse', help= "add True/False wether inverted triple should created", type=bool)
parser.add_argument('Model', help="choose the model to train with. NOT NodePiece!", type=str)
parser.add_argument('emb_dimension', help="specify embedding dimension" , type=int)
parser.add_argument('SLCWA_LCWA', help="SLCWATrainingLoop or LCWA training Loop", type=str)
parser.add_argument('check_Name.pt', help=" choose name for the checkpoint.pt", type=str)
parser.add_argument('emb_dict_name', help="name for the Embedding_dictionary, newest= xxx_full_v3", type=str)
parser.add_argument('Loss', help="specify Loss class , default None", type=str)
# add optional arguments
parser.add_argument('-b', '--batch_size', help="select batch size, if none: automatic batchsize search", type=int)
parser.add_argument('-sb', '--sub_batch_size', help="only for LCWA, sub batch_size for effizient memory usage", type=int)
parser.add_argument('-s', '--slize_size', help="only for LCWA, divisor for slicing batches for single calculations", type=int)

args = parser.parse_args() # returns data from the options specified
print(args.inverse)

# transform strings in classes:
Model = help_model(args.Model)
loop = help_loop(args.SLCWA_LCWA)
Loss = help_loss(args.Loss)

#start run with passed arguments
start_emb_train(triple_path, args.inverse, Model, args.emb_dimension, loop, args.check_name,args.emb_dict_name, batch_s=None, sub_batch=None, slice_s=None, Loss)


SyntaxError: positional argument follows keyword argument (3711663861.py, line 22)