In [1]:

import pykeen
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.constants import PYKEEN_CHECKPOINTS
import torch
import argparse
from pykeen.regularizers import Regularizer, regularizer_resolver
from pykeen.utils import resolve_device

#import all models and parameter
from pykeen.models  import ConvE, TransE, ComplEx, MuRE, RotatE, TuckER, DistMult, RESCAL
from pykeen.training import LCWATrainingLoop, SLCWATrainingLoop
from pykeen.losses import BCEWithLogitsLoss, SoftplusLoss, NSSALoss, SoftMarginRankingLoss, PairwiseLogisticLoss

# testing:
from pykeen.datasets import Nations


In [18]:
# find base losses 
M= DistMult
print(M.loss_default)
del M

<class 'pykeen.losses.MarginRankingLoss'>


In [2]:
 # make sure, that gpu is avaiable and chosen 

triple_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/triples/triple_list_graph_full_v3.feather'
inverse=True
emb_dim=3
Loss=None
Training_loop=SLCWATrainingLoop
Model=TransE

device = 'gpu' 
_device: torch.device = resolve_device(device)
print(f"Using device: {device}", type(_device))
   
# generate Triples Factory
tripleAr = pd.read_feather(triple_path).to_numpy()
print(tripleAr[0])
tripleArray = [tripleAr[i] for i in range(4000)]
tripleArray = np.asarray(tripleArray)
print(tripleArray[0])
print('length of the triple array: ', len(tripleArray), type(tripleArray))
tf = TriplesFactory.from_labeled_triples(tripleArray, create_inverse_triples=inverse)
    
print('loading TriplesFactory done ... ', type(tf))
print(tf.num_entities, tf.num_relations)    #700380 404 old=511291 338 oldest= 511291 326
    
#pick a Model that was imported
#choose a loss Class that was imported, default =None
kwargs={'triples_factory': tf, 'loss': Loss, 'predict_with_sigmoid':False}
model = Model(**kwargs, embedding_dim=emb_dim, random_seed=420)
model= model.to(_device) # important otherwise fall back to cpu
    
# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=model.get_grad_params())
# Pick a training approach that was imported !! contains the losses, choose between SLCWATrainingLoop and LCWATrainingLoop
training_loop = Training_loop(
    model=model,
    triples_factory=tf,
    optimizer=optimizer,
    automatic_memory_optimization=True, 
)

No cuda devices were available. The model runs on CPU


Using device: gpu <class 'torch.device'>
['OMOP_21600001' '0' 'OMOP_21600509']
['OMOP_21600001' '0' 'OMOP_21600509']
length of the triple array:  4000 <class 'numpy.ndarray'>
loading TriplesFactory done ...  <class 'pykeen.triples.triples_factory.TriplesFactory'>
1151 12




In [53]:
#check out the checkpoint:
#checkpoint = torch.load(PYKEEN_CHECKPOINTS.joinpath('/sc-scratch/sc-scratch-ukb-cvd/checkpoints_pykeen_leonard/TransE_Test.pt'), map_location=torch.device('cpu'))
#ent_dict=checkpoint['entity_to_id_dict']
keys=ent_dict.keys()
print(type(keys))
#for i in range(10): print(keys[i])
print(ent_dict['OMOP_765111'])
print(triples)
#print(tripleArray)

<class 'dict_keys'>
656283
[['OMOP_1000560' '17' 'OMOP_1000599']
 ['OMOP_1000560' '17' 'OMOP_1000600']
 ['OMOP_1000560' '17' 'OMOP_1000612']
 ...
 ['phecode_997' '197' 'OMOP_764563']
 ['phecode_997' '197' 'OMOP_765111']
 ['phecode_997' '199' 'phecode_997']]


In [5]:
# entity_id_mapping check
print(tf.entities_to_ids(['OMOP_1000560', 'OMOP_1000577', 'OMOP_1000579','OMOP_1000599','OMOP_1000600'])) #OMOP_1000577, OMOP_1000579, OMOP_1000599, OMOP_1000600, phecode_997
#x=tf.tensor_t_df( 
triples=tf.triples

KeyError: 'OMOP_1000560'

In [4]:
# just run for one epoch, evaluate losses and restart training where it was left
for i in range(1,20):
    if i >1:
        #make shure the loaded checkpoint is has the right mapping:
        checkpoint = torch.load(PYKEEN_CHECKPOINTS.joinpath('/sc-scratch/sc-scratch-ukb-cvd/checkpoints_pykeen_leonard/TransE_Test55.pt'))
        tf = TriplesFactory.from_labeled_triples(
            triples =tripleArray,
            create_inverse_triples=inverse,
            entity_to_id=checkpoint['entity_to_id_dict'],
            relation_to_id=checkpoint['relation_to_id_dict'],
            )
        kwargs={'triples_factory': tf, 'loss': Loss, 'predict_with_sigmoid':False}
        model = Model(**kwargs, embedding_dim=emb_dim, random_seed=None)
        model= model.to(_device) # important otherwise fall back to cpu
    
        # Pick an optimizer from Torch
        from torch.optim import Adam
        optimizer = Adam(params=model.get_grad_params())
        # Pick a training approach that was imported !! contains the losses, choose between SLCWATrainingLoop and LCWATrainingLoop
        training_loop = Training_loop(
            model=model,
            triples_factory=tf,
            optimizer=optimizer,
            automatic_memory_optimization=True, 
            )
        print("check done!") 
        # Train like Cristiano Ronaldo
    losses = training_loop.train(
        triples_factory=tf,
        num_epochs=i,
        batch_size=None, #256, # if None -> automatic search for the best and greatest
        checkpoint_name= 'TransE_Test55.pt', # for example TransE_t2.pt
        checkpoint_frequency=0,
        checkpoint_directory='/sc-scratch/sc-scratch-ukb-cvd/checkpoints_pykeen_leonard', # new checkpoint dir bc of massive storage needs
            
        sub_batch_size=None, # not for SLCWA and not supported bc of batch normalization!!
        slice_size=None, # not for SLCWA
        )
    
    if i>1 and (losses[-2] - losses[-1]) < 1e-7: # changed 1e-6 to 1e-7  
        #TODO: function to obtain embeddings
        #switch model back to cpu device:
        _device = torch.device('cpu') # 
        model.to(_device)
        #do not need anymore: entity_RepModel = model.entity_representations[0] # check for more representations
        try:
            print(model.entity_representations[1])
        except IndexError:
            
            print('\n','Index Error, no more entity_reps ', '\n')
        # acces entity_values, mapp them to list and pass the list to the entity_repModel to recieve Embeddings. Next, create embedding_dict and transform to DataFrame
        ##nodes = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/connected_nodes_list.feather')
            
        #print(tf.entity_to_id, type(tf.entity_to_id),'e_ids', tf.entity_ids, tf.entity_to_id.keys())
        #
        entity_id_dict = tf.entity_to_id
        entity_ids_as_tensors = torch.as_tensor([int(v) for v in entity_id_dict.values()], dtype=torch.long, device=_device)
            
            # casting node Names from list into equivalent ids(indices) as a torch.LongTensor on CPU --> bc model is also cpu
            ##entity_ids = torch.as_tensor(tf.entities_to_ids(nodes['nodes']), dtype=torch.long, device=_device) # .view()?
            
            #all embeddings as a numpy.ndarray, indices as torch.LongTensor
        entity_embeddings = model.entity_representations[0](indices=entity_ids_as_tensors).detach().numpy() # detach tensor 
            
        # do not need anymore : embeddings = entity_RepModel(entity_ids) 
        df_dict = pd.DataFrame(dict(nodes= entity_id_dict.keys(), embeddings=list(entity_embeddings)))
        print(df_dict.head())
        # save embedding dict
        #emb_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/embeddings_leonard/Embedding_dict_' + emb_dict_name + '.feather'
        #print('saved in: ',emb_path)
        #df_dict.to_feather(emb_path)
        break
          

Training epochs on cpu: 4epoch [00:00, ?epoch/s]

No random seed is specified. This may lead to non-reproducible results.


check done!




Training epochs on cpu: 4epoch [00:00, ?epoch/s]


 Index Error, no more entity_reps  

          nodes                              embeddings
0  OMOP_1000632   [0.46381128, -0.7611315, -0.45338494]
1  OMOP_1036059   [-0.17091985, 0.7019024, -0.69146186]
2  OMOP_1036094  [-0.8853249, -0.025909247, 0.46425048]
3  OMOP_1036228    [0.2761429, -0.48907068, -0.8273784]
4  OMOP_1036525   [-0.13714032, -0.814403, -0.56386197]
