In [16]:
# create one epoche, safe and evaluate manuel . after that, start again

import pykeen
from pykeen.pipeline import pipeline
import networkx as nx
import pathlib
from random import sample
import pandas as pd
from pykeen.triples import TriplesFactory
import torch
import numpy as np
from pykeen.models import TransE


In [None]:
# it is possible to continue training automaticly by simply reload the code and the pipeline will catch the latest saved CHECPOINT and will resume.
# that includes increasing the num_epochs variable manually -> the latest checkpoint from the code will be load automatically
# instead, loading models manually from checkpoints (for investigation) 

# avoid loading triples by array, think about way to save them in a path variable to avoid long preprocess and graph loading time
# check dimension
# check split ratio 

# NOTE: checpoints when bringing your own data: "When continuing the training or in general using the model after resuming training, it is critical that the entity label to identifier (entity_to_id) and relation label to identifier (relation_to_id) mappings are the same as the ones that were used when saving the checkpoint. If they are not, then any downstream usage will be nonsense."
# checkpoints also possible if trqaining is not in pipeline

In [17]:
# load Data 
from pykeen.datasets import CoDExMedium
dataset = CoDExMedium
'''evaluator = 'rankbased',
    stopper = 'early',
    stopper_kwargs=dict(frequency=5, patience=2, relative_delta=0.002)'''

In [2]:
# 2 load the graph data and preprocess
# from https://github.com/nebw/ehrgraphs/blob/master/ehrgraphs/data/data.py#L82-L117
def preprocess_graph_heterogeneous(graph: nx.Graph):
    edge_types = []
    for u, v, data in graph.edges.data():
        edge_types.append(data["edge_type"])

    edge_codes, edge_types = pd.factorize(edge_types)

    node_types = []
    for n, data in graph.nodes.data():
        node_types.append(data["node_type"])

    node_codes, node_types = pd.factorize(node_types)

    preprocessed_graph = nx.DiGraph()
    preprocessed_graph.add_nodes_from(graph.nodes())

    preprocessed_graph.node_codes = node_codes
    preprocessed_graph.node_types = node_types
        
    # drop shortcut edges
    exclude_codes = []
    exclude_codes.append(edge_codes[list(edge_types).index("Subsumes")])
    exclude_codes.append(edge_codes[list(edge_types).index("Is a")])

    for (u, v, w), c in zip(graph.edges.data("edge_weight"), edge_codes):
        assert w is not None

        # drop shortcut edges
        if c in exclude_codes and w < 1.0:
            continue

        preprocessed_graph.add_edge(u, v, edge_weight=w, edge_code=c)

    preprocessed_graph.edge_types = edge_types
    
    

    return preprocessed_graph



In [3]:
#2 load graph data 
#loading the full graph
base_path = pathlib.Path(
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/2_datasets_pre/211110_anewbeginning")
G = nx.readwrite.gpickle.read_gpickle('/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/2_datasets_pre/211110_anewbeginning/graph_full_211122.p')


In [4]:
#2 preprocess now 
PG = preprocess_graph_heterogeneous(G)

In [5]:
#2 create Triple Array
tripleList=[]
nodes=[]
for u,v,data in PG.edges.data():
    l=[]
    l.append(u)
    nodes.append(u)
    l.append(data['edge_code'])
    l.append(v)
    nodes.append(v)
    tripleList.append(l)

#needs triples as ndarray - shape (n,3), dtype:str 
tripleArray=np.array(tripleList, dtype=str)
print(len(tripleArray))

28842781


In [None]:
#####################
# Easy Pipeline Way #
####################


# --

# Pick a model
from pykeen.models import TransE

for i in range(1,5):
    results = pipeline(
        dataset = dataset,
        loss='marginranking',
        loss_kwargs=dict(margin=1),
        model = TransE,
        model_kwargs=dict(embedding_dim=50),
        training_kwargs=dict(
            num_epochs=i,
            checkpoint_name='my_3nd_attempt',
            checkpoint_frequency=0, ),
        training_loop = 'sLCWA',
        negative_sampler = 'basic'
    )
    losses = results.losses
    print(losses)
    if i > 1 and (losses[-2] - losses[-1]) < 0.001:
        break

#results.plot_losses()
#results.save_to_directory('single Epoch')

In [45]:
# try to reach the losses

losses = results.losses
print(type(losses), losses)

print(results.training_loop.losses_per_epochs)
print(results.training_loop.)

<class 'list'> [0.0033517409174162126, 0.0022412704176836586, 0.0016890166910126432]
[0.0033517409174162126, 0.0022412704176836586, 0.0016890166910126432]
MarginRankingLoss(
  (margin_activation): ReLU()
)


In [56]:
# lets try out a defragmented pipeline with their dataset
dataset2 = CoDExMedium()
training_triples_factory = dataset2.training

# Pick a model
model = TransE(triples_factory=training_triples_factory, embedding_dim=64)

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=model.get_grad_params())

# Pick a training approach !! contains the losses
from pykeen.training import SLCWATrainingLoop 
training_loop = SLCWATrainingLoop(

    model=model,

    triples_factory=training_triples_factory,

    optimizer=optimizer,

)

# just run for one epoch, evaluate losses and restart training where it was left
for i in range(1,5):
    # Train like Cristiano Ronaldo
    losses = training_loop.train(

        triples_factory=training_triples_factory,
        num_epochs=i,
        batch_size=256,
        checkpoint_name= 'own_pipeline2.pt',
        checkpoint_frequency=0

    )
    if i>1 and (losses[-2] - losses[-1]) < 0.002:
        break



INFO:pykeen.training.training_loop:=> no checkpoint found at '/home/tilingl/.data/pykeen/checkpoints/own_pipeline2.pt'. Creating a new file.


Training epochs on cpu:   0%|          | 0/1 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/725 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 1.
INFO:pykeen.training.training_loop:=> loading checkpoint '/home/tilingl/.data/pykeen/checkpoints/own_pipeline2.pt'
INFO:pykeen.training.training_loop:=> loaded checkpoint '/home/tilingl/.data/pykeen/checkpoints/own_pipeline2.pt' stopped after having finished epoch 1


Training epochs on cpu:  50%|#####     | 1/2 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/725 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 2.


In [6]:
#2 Try out the defragmented version - first create triple factory
'''When continuing the training or in general using the model after resuming training, it is critical that the entity label to
identifier (entity_to_id) and relation label to identifier (relation_to_id) mappings are the same as the ones that were used 
when saving the checkpoint. If they are not, then any downstream usage will be nonsense. '''
tf = TriplesFactory.from_labeled_triples(tripleArray, create_inverse_triples=True)

training_factory, testing_factory = tf.split([1.0, 0.0])
#entity_mapping = tf.entity_to_id
#relation_mapping = tf.relation_to_id
print(training_factory.num_entities, training_factory.num_relations)

#training = TriplesFactory.from_labeled_triples(training_factory, create_inverse_triples= True, )
#
#testing = TriplesFactory.from_labeled_triples(testing_factory, create_inverse_triples=True,
#                                              entity_to_id=train.entity_to_id,
 #                                             relation_to_id=train.relation_to_id,)
                                              

# Pick a model
model2 = TransE(triples_factory=training_factory, embedding_dim=2)

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer2 = Adam(params=model2.get_grad_params())

# Pick a training approach !! contains the losses
from pykeen.training import SLCWATrainingLoop 
training_loop2 = SLCWATrainingLoop(

    model=model2,

    triples_factory=training_factory,

    optimizer=optimizer2,

)

# just run for one epoch, evaluate losses and restart training where it was left
'''for i in range(1,2):
    # Train like Cristiano Ronaldo
    losses2 = training_loop2.train(

        triples_factory=training_factory,
        num_epochs=i,
        batch_size=256,
        checkpoint_name= 'fooling.pt',
        checkpoint_frequency=0

    )
    if i>1 and (losses2[-2] - losses2[-1]) < 0.002:
        break
'''


using automatically assigned random_state=1836517715
  actual_ratio = actual_size / exp_size * exp_ratio
No cuda devices were available. The model runs on CPU
No random seed is specified. This may lead to non-reproducible results.


511291 326


"for i in range(1,2):\n    # Train like Cristiano Ronaldo\n    losses2 = training_loop2.train(\n\n        triples_factory=training_factory,\n        num_epochs=i,\n        batch_size=256,\n        checkpoint_name= 'fooling.pt',\n        checkpoint_frequency=0\n\n    )\n    if i>1 and (losses2[-2] - losses2[-1]) < 0.002:\n        break\n"

In [None]:
print(losses2)
print(losses2)
print(model2.num_entities, model2.entity_representations[0])

In [60]:
################################################
# Acsessing the model from the saved checkpoint#
# 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3#
from pykeen.constants import PYKEEN_CHECKPOINTS

checkpoint = torch.load(PYKEEN_CHECKPOINTS.joinpath('/home/tilingl/.data/pykeen/checkpoints/own_pipeline2.pt')) # CUDA problems
print(type(checkpoint)) # checkpoint['entity_to_id_dict']) 

# checkpoint contains model and entity_to_id, realtion_to_id mapping that was used
# load these into Pykeen:
'''train_tf = TriplesFactory.from_labeled_triples(
    triples = tripleArray, 
    entity_to_id=checkpoint['entity_to_id_dict'],
    relation_to_id=checkpoint['relation_to_id_dict'],
)'''


train_tf = TriplesFactory.from_path(
    path= '/home/tilingl/.data/pykeen/datasets/codexmedium/train.txt',
    entity_to_id=checkpoint['entity_to_id_dict'],
    relation_to_id=checkpoint['relation_to_id_dict'],
)
    
# load the model and pass the train triple factory to the model
used_model = TransE(triples_factory=train_tf)
#used_model.load_state_dict(checkpoint['model_state_dict'])

# get the embeddings
# class RepresentationModule
entity_RepModel = used_model.entity_representations[0] # check for more representations
print(entity_RepModel)

print(train_tf.num_entities)
# accses over checkpoint: 
checkp_entities_val = list(checkpoint['entity_to_id_dict'].values())
# Q100', 'Q1000', 'Q100005', 'Q1000051'

embeddings = entity_RepModel(torch.tensor(checkp_entities_val, dtype=torch.int)) # cast list elements into tensors
embedding_dict = dict(zip(checkpoint['entity_to_id_dict'].keys(), embeddings.detach().numpy() )) #  Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.
print(embedding_dict['Q100005'])



<class 'dict'>


No cuda devices were available. The model runs on CPU
No random seed is specified. This may lead to non-reproducible results.


Embedding(
  (_embeddings): Embedding(17050, 50)
)
17050
[-0.13400671 -0.12385295  0.19806443  0.09177307  0.15803958 -0.16651738
 -0.06492405  0.21340628  0.18431835  0.19797724  0.21336973  0.1774984
  0.09119363 -0.04669401  0.13973255 -0.1246407   0.19583163  0.07195732
  0.13446353  0.2295184   0.04414574  0.0688907  -0.07721438  0.12669078
  0.08116424  0.0330678  -0.05262972  0.12321129  0.138985    0.06542039
  0.07272934 -0.18556963 -0.1421097   0.05704216  0.04917973 -0.04737845
  0.2461231   0.11587673  0.18489233 -0.08991764 -0.09038517 -0.07949767
 -0.17118451  0.19986464  0.05352865 -0.1716157   0.06938639 -0.17834206
  0.22943144 -0.19793965]


NameError: name 'TransE' is not defined

In [20]:
################################################
# Creating Big - Model from checkpoint         #
# 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3 # 3#
from pykeen.constants import PYKEEN_CHECKPOINTS

checkpoint2 = torch.load(PYKEEN_CHECKPOINTS.joinpath('/home/tilingl/.data/pykeen/checkpoints/gpu_run_own_big.pt'), map_location={'cuda:0':'cpu'}) 
print(type(checkpoint2)) # checkpoint['entity_to_id_dict']) 

# checkpoint contains model and entity_to_id, realtion_to_id mapping that was used
# load these into Pykeen:
train_tf2 = TriplesFactory.from_labeled_triples(
    triples = tripleArray, 
    entity_to_id=checkpoint2['entity_to_id_dict'],
    relation_to_id=checkpoint2['relation_to_id_dict'],
)

    
# load the model and pass the train triple factory to the model
used_model2 = TransE(
    triples_factory=train_tf2,
    embedding_dim=256)
#used_model2.load_state_dict(checkpoint2['model_state_dict'])

# get the embeddings
# class RepresentationModule
entity_RepModel2 = used_model2.entity_representations[0] # check for more representations
print(entity_RepModel2)

print(train_tf2.num_entities)
# accses over checkpoint: 
checkp_entities_val2 = list(checkpoint2['entity_to_id_dict'].values())
# Q100', 'Q1000', 'Q100005', 'Q1000051'
print(list(checkpoint2['entity_to_id_dict'].keys())[0])
embeddings2 = entity_RepModel2(torch.tensor(checkp_entities_val2, dtype=torch.int)) # cast list elements into tensors
print(type(embeddings2))
embedding_dict2 = dict(zip(checkpoint2['entity_to_id_dict'].keys(), embeddings2.detach().numpy() )) #  Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.
print(embedding_dict2['OMOP_1000560'])


#################################################
# check graph nodes against Triples Array #
#filter duplicates 
amount = len(list(dict.fromkeys(checkpoint2['entity_to_id_dict'].keys())))
print(len(PG.nodes()), ' == ', amount)


<class 'dict'>


No cuda devices were available. The model runs on CPU
No random seed is specified. This may lead to non-reproducible results.


Embedding(
  (_embeddings): Embedding(511291, 256)
)
511291
OMOP_1000560
<class 'torch.Tensor'>
[-0.01253013 -0.09684575  0.08089289  0.04960537  0.03288252  0.04709427
  0.01253581  0.02063313  0.00482531 -0.03269486  0.08955711 -0.09162737
 -0.01532889 -0.0669371  -0.04718629 -0.04141479  0.02702554 -0.06954572
 -0.03913069  0.09565155 -0.09373201 -0.10842036 -0.00117118  0.01456364
 -0.0466005   0.05035962  0.05272698 -0.05698881  0.01946181  0.06248292
 -0.0153423   0.02050512  0.03652842 -0.06304903  0.1050147   0.02716055
  0.0241642  -0.00425509 -0.05885952 -0.06697097  0.02672952  0.07291657
 -0.05068693  0.0986026   0.07295278 -0.03017739 -0.05532933 -0.01534762
 -0.05434519 -0.06979137  0.08505629  0.05689349  0.07588166  0.08507898
 -0.03491642  0.02018699  0.09482616 -0.09885208 -0.01385707  0.01877359
 -0.01501659  0.04815469  0.04180819 -0.10517272  0.07282256  0.03604697
 -0.09307583 -0.00435607  0.02074452  0.05039392  0.03389975 -0.10712785
  0.03156006  0.0241015  -0.

In [42]:
# validating the nodes are existent in the embedding dict
missing=[]
for node in PG.nodes():
    try:
        x = embedding_dict2[str(node)]
    except KeyError: 
        'Node is not in embedding dict'
        missing.append(node)
    else:
        'if no error occures'
        continue
    finally:
        'allways executed'
        continue

print(len(missing) , '+', train_tf2.num_entities ,'=' ,511291+len(missing), '==', len(list(PG.nodes())) )

# check with networkx the missing nodes 

print(len(PG.edges(missing)))  # only edges incident to these nodes

## add 0.00 to every missing nodes dimension
#for n in missing:
#    embedding_dict2[n] = 


#print(embedding_dict2['OMOP_1000560'])
t = np.zeros((256,), dtype=float)
print(type(t))
print(len(list(nx.isolates(PG))))

# export to csv 
df = pd.DataFrame(missing)
###df.to_csv('isolated_nodes.csv')

38581 + 511291 = 549872 == 549872
0
<class 'numpy.ndarray'>
38581


In [None]:
print(len(tripleArray))

In [None]:
# out of : https://pykeen.readthedocs.io/en/stable/_modules/pykeen/training/training_loop.html#TrainingLoop

# the training_loop class has following functinos and source code attributes
result = self._train(
                num_epochs=num_epochs,
                batch_size=batch_size,
                slice_size=slice_size,
                label_smoothing=label_smoothing,
                sampler=sampler,
                continue_training=continue_training,
                only_size_probing=only_size_probing,
                use_tqdm=use_tqdm,
                use_tqdm_batch=use_tqdm_batch,
                tqdm_kwargs=tqdm_kwargs,
                stopper=stopper,
                result_tracker=result_tracker,
                sub_batch_size=sub_batch_size,
                num_workers=num_workers,
                save_checkpoints=save_checkpoints,
                checkpoint_path=checkpoint_path,
                checkpoint_frequency=checkpoint_frequency,
                checkpoint_on_failure_file_path=checkpoint_on_failure_file_path,
                best_epoch_model_file_path=best_epoch_model_file_path,
                last_best_epoch=last_best_epoch,
                drop_last=drop_last,
                callbacks=callbacks,
                gradient_clipping_max_norm=gradient_clipping_max_norm,
                gradient_clipping_norm_type=gradient_clipping_norm_type,
                gradient_clipping_max_abs_value=gradient_clipping_max_abs_value,
                triples_factory=triples_factory,
                training_instances=training_instances,
            )