In [1]:
# SCRIPT TO LOAD OUR GRAPH AS SUBSET INTO PYKEN 
# GOAL: get for every node an embedding

import pykeen
from pykeen.pipeline import pipeline
import networkx as nx
import pathlib
from random import sample
import pandas as pd
from pykeen.triples import TriplesFactory
import torch
import numpy as np
import wandb


In [94]:
# ==== NOTIZEN === #
# - Pykeen embedding does not take edge_weights into account #
# - evaluation can be excluded ! check #
# - evaluation takes up a lot of time #
# - !! skip evaluation is possible. making predicitons on the single_model is possible #
# ??- why to accses the embeddings for each node??
#    --> class RepresentationModule: "A representation module maps integer IDs to representations, which are tensors of floats."  , "We can look at all representations as a tensor of shape (max_id, *shape), and this is exactly the result of passing indices=None to the forward method."
#    - https://pykeen.readthedocs.io/en/stable/reference/nn/representation.html#pykeen.nn.emb.RepresentationModule # 
# - also "RepresentationModule" and "Embedding" class, "This class [Embedding class] provides the same interface as torch.nn.Embedding and can be used throughout PyKEEN as a more fully featured drop-in replacement" 
# - embedding class is the same as torch.nn.Embedding  - #


# TODO:
# - find a way to encounter all IDs
# - transform all IDs to their embeddings
# - transform ID to label and safe them according to their embedding in a dataframe

In [2]:
# from https://github.com/nebw/ehrgraphs/blob/master/ehrgraphs/data/data.py#L82-L117
def preprocess_graph_heterogeneous(graph: nx.Graph):
    edge_types = []
    for u, v, data in graph.edges.data():
        edge_types.append(data["edge_type"])

    edge_codes, edge_types = pd.factorize(edge_types)

    node_types = []
    for n, data in graph.nodes.data():
        node_types.append(data["node_type"])

    node_codes, node_types = pd.factorize(node_types)

    preprocessed_graph = nx.DiGraph()
    preprocessed_graph.add_nodes_from(graph.nodes())

    preprocessed_graph.node_codes = node_codes
    preprocessed_graph.node_types = node_types
        
    # drop shortcut edges
    exclude_codes = []
    exclude_codes.append(edge_codes[list(edge_types).index("Subsumes")])
    exclude_codes.append(edge_codes[list(edge_types).index("Is a")])

    for (u, v, w), c in zip(graph.edges.data("edge_weight"), edge_codes):
        assert w is not None

        # drop shortcut edges
        if c in exclude_codes and w < 1.0:
            continue

        preprocessed_graph.add_edge(u, v, edge_weight=w, edge_code=c)

    preprocessed_graph.edge_types = edge_types
    
    # --- #
    SG = nx.DiGraph()
    nodes = sample(list(preprocessed_graph.nodes()), 10000)
    SG.add_nodes_from(nodes)
    SG.add_edges_from((n, nbr, d)
        for n, nbrs in preprocessed_graph.adj.items() if n in nodes
            for nbr, d in nbrs.items() if nbr in nodes )
    
    ## --- ##

    return SG


In [4]:
#loading the full graph
base_path = pathlib.Path(
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/2_datasets_pre/211110_anewbeginning")
G = nx.readwrite.gpickle.read_gpickle('/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/2_datasets_pre/211110_anewbeginning/graph_full_211122.p')

# building preprocessed ego graph



In [5]:
# -------- directly create TRIPLES ------- #
#print(G['OMOP_21600001'])

SG2 = preprocess_graph_heterogeneous(G)

tripleList=[]
for u,v,data in SG2.edges.data():
    l=[]
    l.append(u)
    l.append(data['edge_code'])
    l.append(v)
    tripleList.append(l)

#needs triples as ndarray - shape (n,3), dtype:str 
tripleArray=np.array(tripleList, dtype=str)
print(type(tripleArray), tripleArray.shape, tripleArray.dtype)


<class 'numpy.ndarray'> (9682, 3) <U21


In [6]:
# ----------- directly loading Triples into PyKEEN ------ #

tf2 = TriplesFactory.from_labeled_triples(tripleArray, create_inverse_triples=True)
print(tf2)
print(tf2.get_most_frequent_relations(3))
# print(tf2.get_mask_for_relations(164)) # ERROR ??

# ----------- Training without evaluation ------------ #
'''
# Start tracking
_result_tracker.start_run(run_name=title)

#  dataset was defined by triples factories
_result_tracker.log_params(
            dict(
                dataset=USER_DEFINED_CODE,
                training=training if isinstance(training, str) else USER_DEFINED_CODE,
                testing=testing if isinstance(training, str) else USER_DEFINED_CODE,
                validation=validation if isinstance(training, str) else USER_DEFINED_CODE,
            )
        )


'''
# Pick a model
from pykeen.models import TransE

kwargs={'triples_factory': tf2, 'loss': None, 'predict_with_sigmoid':False, 'preferred_device':None, 'random_seed':None}

tf2_model = TransE(**kwargs, embedding_dim=10) # >64, 256

'''
 # Log model parameters
_result_tracker.log_params(
    params=dict(
        model=model_instance.__class__.__name__,
        model_kwargs=model_kwargs,
    ),
)
'''
# Pick an optimizer from Torch
from torch.optim import Adam

optimizer = Adam(params=tf2_model.get_grad_params())

'''
_result_tracker.log_params(
    params=dict(
        optimizer=optimizer_instance.__class__.__name__,
        optimizer_kwargs=optimizer_kwargs,
    ),
)
'''
# Pick a training approach (sLCWA or LCWA)
from pykeen.training import SLCWATrainingLoop

training_loop = SLCWATrainingLoop(

    model=tf2_model,

    triples_factory=tf2,

    optimizer=optimizer,

)
'''
_result_tracker.log_params(
        params=dict(
            negative_sampler=negative_sampler_cls.__name__,
            negative_sampler_kwargs=negative_sampler_kwargs,
        ),
)
'''

'''
_result_tracker.log_params(
        params=dict(
            training_loop=training_loop_instance.__class__.__name__,
            training_loop_kwargs=training_loop_kwargs,
        ),
)
'''
# Train like Cristiano Ronaldo

_ = training_loop.train(

    triples_factory=tf2,

    num_epochs=10, # ! ! !

    batch_size=256,
    # result_tracker='wandb'
    # result_tracker_kwargs=

)
## INTEGRATION OF WANDB AS TRACKER
from ..trackers import ResultTracker, resolve_reult_trackers
from pykeen.trackers.base.ResultTracker import WANDBResultTracker

# first perform wandb login (per github?)
_result_tracker = resolve_result_trackers(WANDBResultTracker, [dict(project='pykeen_project',
                                                                    experiment='new run',
                                                                    offline=False)]) # additional wandb.init keywords
wandb = WANDBResultTracker(project,offline=False, **kwargs)


No cuda devices were available. The model runs on CPU
No random seed is specified. This may lead to non-reproducible results.


TriplesFactory(num_entities=2348, num_relations=100, num_triples=9682, inverse_triples=True)
{32, 1, 11}


Training epochs on cpu:   0%|          | 0/50 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/78 [00:00<?, ?batch/s]

In [55]:
# ---------- directly Accsessing ebeddings from entitys and relations ----- #

print('TRIPLES', tripleArray[0])

# Obtaining entity/relation IDs from their names
## If we start with strings, we first need to convert them to IDs. For this, we can use TriplesFactory.entity_to_ids or TriplesFactory.relation_to_ids, e.g. .
## It is important to use the TriplesFactory with the same mapping that was used to train the model

#id_t = torch.as_tensor()
id_t = torch.as_tensor(tf2.entities_to_ids(["OT_ENSG00000150676", "OT_ENSG00000139547"]))
#print('TENSOR : ', id_t)

# otherwise

# mapping = tf2.entity_id_to_label({17:'strsda'})


# ------ making predictions ------ #
from pykeen.models.predict import get_relation_prediction_df
# Score top K triples

relation_pred_df = pykeen.models.predict.get_relation_prediction_df(tf2_model, 'OT_ENSG00000150676', 'OT_ENSG00000139547', triples_factory=tf2)
print('\n', 'prediction: ', '\n', relation_pred_df)

head_pred_df = pykeen.models.predict.get_head_prediction_df(tf2_model, '172', 'OT_ENSG00000139547', triples_factory=tf2)
print('\n', 'prediction: ', '\n', head_pred_df)

tail_pred_df = pykeen.models.predict.get_tail_prediction_df(tf2_model, 'OT_ENSG00000150676', '172', triples_factory=tf2)
print('\n', 'prediction: ', '\n', tail_pred_df)

Calculations will fall back to using the score_hrt method, since this model does not have a specific score_r function. This might cause the calculations to take longer than necessary.


TRIPLES ['OT_ENSG00000150676' '172' 'OT_ENSG00000139547']

 prediction:  
     relation_id relation_label     score  in_training
25           25             40 -1.524471        False
28           28             45 -1.583373        False
27           27             43 -1.793864        False
8             8             13 -2.156383        False
10           10            160 -2.219754        False
23           23             37 -2.304212        False
32           32             58 -2.319664        False
22           22             36 -2.322985        False
19           19             30 -2.443634        False
46           46             89 -2.484016        False
17           17             23 -2.497706        False
16           16             22 -2.530442        False
9             9            139 -2.538998        False
49           49             97 -2.632671        False
30           30             56 -2.680084        False
7             7            126 -2.691210        False
31     

In [40]:
# --------------- directly Exploring the EMBEDDINGS ------ #

# class RepresentationModule
entity_RepModel = tf2_model.entity_representations[0] # check for more representations
relation_RepModel =  tf2_model.relation_representations[0]
print(entity_RepModel,relation_RepModel, type(entity_RepModel))
# to get embedding(representation) from entity_RepModel use the model entity IDs

#first transform label into ID
#then pass ID as tensor into RepresentationModule to get tensor ("Get representations for indices.")
for i in range(0,10):
    embedding_x = entity_RepModel(torch.as_tensor([i]))
    print(embedding_x)
    
canonical = ((entity_RepModel.get_in_canonical_shape()).detach()).numpy() # prints out all embeddings in special shape
print('canonical: ',type(canonical))
print(canonical[0][0]) # == entity_embedding(torch.as_tensor([0])) !!!


## Using scoring function on trained model
import torch 
batch = torch.as_tensor(data=[[0,1,0]])
print(batch)
scores = tf2_model.score_hrt(batch) # score_headRelation_tail
print(scores)

print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

# get representation for indices by forward function
# 1. transform label in ID       "OT_ENSG00000139547(2101; HEAD), OMOP_4171194 (1307), OMOP_19107669(167) " 
# 2. transform ID in tensor
# 3. feed tensor into embedding to obtain values
print(canonical[0][2348-1]) # == OT_ENSG00000139547 ?
id_t = torch.as_tensor([[tf2.entities_to_ids(["OT_ENSG00000139547"]),0,0]])
# score to controlle embedding
print(id_t, type(id_t))
score = tf2_model.score_h([[id_t,0,0]])
print('score ', score)

Embedding(
  (_embeddings): Embedding(2348, 10)
) Embedding(
  (_embeddings): Embedding(100, 10)
) <class 'pykeen.nn.emb.Embedding'>
tensor([[ 0.1267,  0.2430, -0.0076, -0.0018,  0.5893,  0.5045,  0.5338,  0.0797,
          0.1757,  0.0291]], grad_fn=<ViewBackward0>)
tensor([[-0.0556,  0.3385,  0.4752, -0.0972,  0.2374,  0.1801,  0.1463,  0.0589,
         -0.3759,  0.6262]], grad_fn=<ViewBackward0>)
tensor([[-0.2338,  0.4153, -0.4783, -0.0272,  0.2879,  0.1335, -0.2701, -0.0748,
         -0.2836,  0.5327]], grad_fn=<ViewBackward0>)
tensor([[ 0.3411, -0.1479,  0.1308,  0.1032, -0.4747, -0.3076, -0.1324,  0.4253,
         -0.4835, -0.2860]], grad_fn=<ViewBackward0>)
tensor([[ 0.2588,  0.1028, -0.0235,  0.0263, -0.6016, -0.4542,  0.1166, -0.1342,
          0.5168,  0.2330]], grad_fn=<ViewBackward0>)
tensor([[ 0.2756,  0.1960,  0.4708,  0.3373,  0.1072, -0.2160, -0.4024,  0.4606,
         -0.3087,  0.1506]], grad_fn=<ViewBackward0>)
tensor([[-0.5154,  0.1769, -0.5056, -0.1610,  0.5338,  0.

TypeError: not a sequence

In [123]:
# creating a function to pass into all embeddings node by node
#  1.  of nodes    2. appropiate dataformat for storage   3. loop over nodes into labels into embeddings into storage 

# collect all nodes out of SG2 that contain an edge
nodes = [u for u,v,data in SG2.edges.data()]
for u,v,data in SG2.edges.data():
    nodes.append(v)
print(len(nodes))
# filter out duplicate nodes
nodes = list(dict.fromkeys(nodes))
print(len(nodes))

# make all entities to ids
all_entities = tf2.entities_to_ids(nodes)
print(len(all_entities), nodes[0], all_entities[0],'\n')

# use 
embedding_dict={}
c=0
#for n in nodes:
 #   embedding_dict[n] = entity_RepModel(torch.as_tensor([c]))
 #   c+=1

print( nodes[0] , '= ', tf2.entities_to_ids(['OT_ENSG00000150676']), ' = ',  entity_RepModel(torch.as_tensor([2126])) )


print('######################')
print(type(all_entities))
embeddings2 = entity_RepModel(torch.tensor(all_entities, dtype=torch.int)) # cast list elements into tensors
print(embeddings2)
embedding_dict = dict(zip(nodes, embeddings2.detach().numpy() )) #  Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.
print(embedding_dict) # differences in the float numbers

19364
2348
2348 OT_ENSG00000150676 2126 

OT_ENSG00000150676 =  [2126]  =  tensor([[ 0.0752,  0.4104,  0.4334, -0.3172, -0.1718,  0.2027,  0.1089, -0.1213,
          0.1719, -0.6409]], grad_fn=<ViewBackward0>)
######################
<class 'list'>
tensor([[ 0.0752,  0.4104,  0.4334,  ..., -0.1213,  0.1719, -0.6409],
        [-0.2803, -0.1698,  0.4099,  ..., -0.1817, -0.1585,  0.6360],
        [ 0.2782,  0.0657,  0.0340,  ...,  0.5182, -0.2497, -0.0624],
        ...,
        [ 0.0726, -0.1382, -0.4844,  ..., -0.3737, -0.3753,  0.3634],
        [ 0.0169,  0.4970,  0.2144,  ..., -0.3853,  0.0291, -0.3053],
        [ 0.0177,  0.3265,  0.3937,  ...,  0.0594, -0.0596, -0.3421]],
       grad_fn=<ViewBackward0>)
{'OT_ENSG00000150676': array([ 0.07524117,  0.4103517 ,  0.43340483, -0.3172234 , -0.17181502,
        0.20272473,  0.10888627, -0.12125207,  0.1719369 , -0.6408911 ],
      dtype=float32), 'OMOP_4003506': array([-0.28029212, -0.16976763,  0.40986097, -0.11757684,  0.2925216 ,
        

In [77]:
# check for entities_to_ids bis anzahl_nodes - 1 gehen
print(tf2.num_entities, tf2.num_triples, tf2.real_num_relations)

# tensor to dataframe ?
ten = entity_RepModel(torch.as_tensor([1]))
print(ten, type(ten))
df = tf2.tensor_to_df()

2348 9682 50
tensor([[-0.0556,  0.3385,  0.4752, -0.0972,  0.2374,  0.1801,  0.1463,  0.0589,
         -0.3759,  0.6262]], grad_fn=<ViewBackward0>) <class 'torch.Tensor'>


TypeError: tensor_to_df() missing 1 required positional argument: 'tensor'