In [1]:

import pykeen
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.constants import PYKEEN_CHECKPOINTS
import torch
import torch.nn
import argparse
from pykeen.regularizers import Regularizer, regularizer_resolver
from pykeen.utils import resolve_device

#import all models and parameter
from pykeen.models  import ConvE, TransE, ComplEx, MuRE, RotatE, TuckER, DistMult, RESCAL, NodePiece
from pykeen.training import LCWATrainingLoop, SLCWATrainingLoop
from pykeen.losses import BCEWithLogitsLoss, SoftplusLoss, NSSALoss, SoftMarginRankingLoss, PairwiseLogisticLoss

# testing:
from pykeen.datasets import Nations
from pykeen.models import NodePiece
from pykeen.datasets import FB15k237, Nations


In [2]:
# Testing NodePiece in for further experimenting
triple_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/triples/triple_list_graph_full_v3.feather'

# inverses are necessary for the current version of NodePiece!
dataset = FB15k237(create_inverse_triples=True)
Nations_dataset = Nations(create_inverse_triples=True)
owndata = pd.read_feather(triple_path).to_numpy()
print(owndata[0])
owndataset = [owndata[i] for i in range(1000)]
owndataset = np.asarray(owndataset)
own_tf = TriplesFactory.from_labeled_triples(owndataset, create_inverse_triples=True)


# -> https://pykeen.readthedocs.io/en/stable/reference/nn/node_piece.html
#Next, we’ll use a combination of tokenizers (pykeen.nn.node_piece.AnchorTokenizer and pykeen.nn.node_piece.RelationTokenizer) 
# to replicate the full NodePiece tokenization with anchors and relational context. It’s as easy as sending a list of tokenizers to tokenizers and sending a list of arguments to num_tokens

# SIMPLE
simple_model = NodePiece(
    triples_factory=dataset.training,
    tokenizers=["AnchorTokenizer", "RelationTokenizer"],
    num_tokens=[8, 10], # default selection=32
    embedding_dim=64,
)
# now we instantiateted AnchorTokenizer with 20 anchors per node and Relation Tokenizer with 12 relations per node.
#NOTE: we found there is a saturation point around 20 anchors per node even in million-node graphs. "https://towardsdatascience.com/nodepiece-tokenizing-knowledge-graphs-6dd2b91847aa"
# 1. AnchorTokenizer has two fiels: "selection":controls how we sample anchors from the graph (32 anchors by default); "searcher":controls how we tokenize nodes using selected anchors (CSGraphAnchorSearcher by default)
# 2. -> uses scipy.sparse to compute shortest paths form all nodes in the graph to all anchors -> expensive!! for tokenaziation
# 3. ! Larger Graphs:use BFS from ScipySparseAnchorSearcher, it applies BFS by iteratively expanding node neighborhood until it finds a desired number of anchors ->> saves compute time
# 4. to replicate the full NodePiece tokenization with k anchors and m relational context.

['OMOP_21600001' '0' 'OMOP_21600509']


You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out
No random seed is specified. This may lead to non-reproducible results.


sampling:   0%|          | 0.00/14.5k [00:00<?, ?it/s]



In [None]:
# test out the triples factory
test_tf = TriplesFactory.from_labeled_triples(owndata, create_inverse_triples=False)
print(test_tf.num_relations, test_tf.num_entities)

202 700380


In [None]:
# SIMPLE 2

# try own pipeline with simple model and Nations dataset:

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=model2.get_grad_params())

# Pick a training approach that was imported !! contains the losses, choose between SLCWATrainingLoop and LCWATrainingLoop
training_loop = SLCWATrainingLoop(    
    model=model2,
    triples_factory=Nations_dataset.training,
    optimizer=optimizer,
    automatic_memory_optimization=True, 
)

simple_losses = training_loop.train(
    
    triples_factory=Nations_dataset.training,
    num_epochs=4,
    batch_size=None, #256, # if None -> automatic search for the best and greatest
    checkpoint_name= "First_7NodePiece", # for example TransE_t2.pt
    checkpoint_frequency=0,
    checkpoint_directory='/sc-scratch/sc-scratch-ukb-cvd/checkpoints_pykeen_leonard', # new checkpoint dir bc of massive storage needs
    
    sub_batch_size=None, # not for SLCWA and not supported bc of batch normalization!!
    slice_size=None, # not for SLCWA
)


Training epochs on cpu:   0%|          | 0/4 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/13 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/13 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/13 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/13 [00:00<?, ?batch/s]

In [145]:
# SIMPLE 3
# TRY to accses embeddings out of simple_model nations dataset
#entit_rep=simple_model.entity_representations[0]
print(entit_rep.token_representations[0].vocabulary_size, entit_rep.token_representations[0].vocabulary,type(entit_rep.token_representations[1]))
vocab = entit_rep.token_representations[0].vocabulary
print('vocab Type:',type(vocab)) # the token representations (15x64)
# THATS IT !!
print(type(entit_rep),'aggreg:',entit_rep.aggregation, 'token_rep: ',entit_rep.token_representations ,'shape: ', entit_rep.shape)
print('######:',entit_rep([13]))
######################
# relation representation
relat_rep=simple_model.relation_representations[0]
print(type(relat_rep), relat_rep)

# check out IDS of tripleFactory!
entity_dict=Nations_dataset.training.entity_to_id
relation_dict=Nations_dataset.training.relation_to_id
print(entity_dict,'\n')
print(relation_dict)
print(entit_rep(entity_dict['china']) )

# check difference to model2
entit2_rep=model2.entity_representations[0]
print(type(entit2_rep), entit2_rep)
print('######:',entit_rep([13]))


entity_id_dict = Nations_dataset.training.entity_to_id
entity_ids_as_tensors = torch.as_tensor([int(v) for v in entity_id_dict.values()], dtype=torch.long)
entity_embeddings =entit_rep(indices=entity_ids_as_tensors).detach().numpy()

df_dict = pd.DataFrame(dict(nodes= entity_id_dict.keys(), embeddings=list(entity_embeddings)))
print(df_dict.head())

15 Embedding(
  (_embeddings): Embedding(15, 64)
) <class 'pykeen.nn.node_piece.representations.TokenizationRepresentation'>
vocab Type: <class 'pykeen.nn.representation.Embedding'>
<class 'pykeen.nn.node_piece.representations.NodePieceRepresentation'> aggreg: <built-in method mean of type object at 0x7fd2e0e14ee0> token_rep:  ModuleList(
  (0): TokenizationRepresentation(
    max_id=14,
    num_tokens=8,
    vocabulary_size=15,
    (vocabulary): Embedding(
      (_embeddings): Embedding(15, 64)
    )
  )
  (1): TokenizationRepresentation(
    max_id=14,
    num_tokens=10,
    vocabulary_size=111,
    (vocabulary): Embedding(
      (_embeddings): Embedding(111, 64)
    )
  )
) shape:  (64,)
######: tensor([[ 0.2257, -0.4834,  0.1831, -0.2359,  0.2199,  0.3202, -0.1954, -0.3852,
         -0.2191,  0.1879,  0.3445, -0.1156, -0.0418,  0.0073,  0.0801,  0.3840,
          0.0636,  0.1547,  0.3005,  0.0387,  0.0819,  0.2374,  0.4449,  0.2294,
         -0.2591, -0.1377, -0.0305, -0.0385,  0.1

In [8]:
# for 15k nodes better use 100 anchors, selected with the top degree-strategy by sending tokenizer_kwargs list:
model2 = NodePiece(
    triples_factory=dataset.training, # Nations_dataset.training
    tokenizers=["AnchorTokenizer", "RelationTokenizer"],
    num_tokens=[20, 12], # fitst Anchor, then Relation  #8,10
    tokenizers_kwargs=[
        dict(
            selection="Degree", # node selecting strategy
            selection_kwargs=dict(
                num_anchors=500, #10
            ),
            searcher="CSGraph",
        ),
        dict(),  # empty dict for the RelationTokenizer - it doesn't need any kwargs
    ],
    embedding_dim=64,
)

No random seed is specified. This may lead to non-reproducible results.


sampling:   0%|          | 0.00/14.5k [00:00<?, ?it/s]

In [4]:
# Let’s create a model with 500 top-pagerank anchors selected with the BFS strategy - we’ll just modify the selection and searcher args:

big_Dat_model= NodePiece(
    triples_factory=dataset.training,
    tokenizers=["AnchorTokenizer", "RelationTokenizer"],
    num_tokens=[20, 12],
    tokenizers_kwargs=[
        dict(
            selection="PageRank",
            selection_kwargs=dict(
                num_anchors=500,
            ),
            searcher="ScipySparse", #breadth-first search!
        ),
        dict(),  # empty dict for the RelationTokenizer - it doesn't need any kwargs
    ],
    embedding_dim=64,
)

class DeepSet(torch.nn.Module):
    def __init__(self, hidden_dim=64):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
        )

    def forward(self, x, dim=-2):
        x = self.encoder(x).mean(dim)
        x = self.decoder(x)
        return x
# if you want to use several anchor selection strategies sequentially to select more diverse anchors use the "pykeen.nn.node_piece.MixtureAnchorSelection " class:
# from the NodePiece Paper:
"""tokenizers_kwargs=[
        dict(
            selection="MixtureAnchorSelection",    #!!
            selection_kwargs=dict(
                selections=["degree", "pagerank", "random"], #!!  last "random" is avaiable also
                ratios=[0.4, 0.4, 0.2],                 #!!
                num_anchors=500,
            ),
            searcher="ScipySparse",
        ),
"""

No random seed is specified. This may lead to non-reproducible results.
9/14505 (0.06%) do not have any anchor.


sampling:   0%|          | 0.00/14.5k [00:00<?, ?it/s]

'tokenizers_kwargs=[\n        dict(\n            selection="MixtureAnchorSelection",    #!!\n            selection_kwargs=dict(\n                selections=["degree", "pagerank", "random"], #!!  last "random" is avaiable also\n                ratios=[0.4, 0.4, 0.2],                 #!!\n                num_anchors=500,\n            ),\n            searcher="ScipySparse",\n        ),\n'

In [6]:
# build nodepiece into the pipeline:



own_NodePiece=NodePiece(
    triples_factory=dataset.training,
    tokenizers=["AnchorTokenizer", "RelationTokenizer"],
        num_tokens=[20, 12],
        tokenizers_kwargs=[
            dict(
                selection="MixtureAnchorSelection",
                selection_kwargs=dict(
                    selections=["degree", "pagerank", "random"],
                    ratios=[0.4, 0.4, 0.2],
                    num_anchors=500,
                ),
                searcher="ScipySparse",
            ),
            dict(),  # empty dict for the RelationTokenizer - it doesn't need any kwargs
        ],
        embedding_dim=1024,
        interaction="distmult",
        relation_initializer="init_phases",
        relation_constrainer="complex_normalize",
        entity_initializer="xavier_uniform_",
        aggregation=DeepSet(hidden_dim=1024),
)

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=own_NodePiece.get_grad_params())

# Pick a training approach that was imported !! contains the losses, choose between SLCWATrainingLoop and LCWATrainingLoop
training_loop = SLCWATrainingLoop(    
    model=own_NodePiece,
    triples_factory=dataset.training,
    optimizer=optimizer,
    automatic_memory_optimization=True, 
)

losses = training_loop.train(
    
    triples_factory=dataset.training,
    num_epochs=4,
    batch_size=None, #256, # if None -> automatic search for the best and greatest
    checkpoint_name= "First_33NodePiece", # for example TransE_t2.pt
    checkpoint_frequency=0,
    checkpoint_directory='/sc-scratch/sc-scratch-ukb-cvd/checkpoints_pykeen_leonard', # new checkpoint dir bc of massive storage needs
    
    sub_batch_size=None, # not for SLCWA and not supported bc of batch normalization!!
    slice_size=None, # not for SLCWA
)


"""
result = pipeline(
    dataset=own_tf,
    model=NodePiece,
    model_kwargs=dict(
        tokenizers=["AnchorTokenizer", "RelationTokenizer"],
        num_tokens=[20, 12],
        tokenizers_kwargs=[
            dict(
                selection="MixtureAnchorSelection",
                selection_kwargs=dict(
                    selections=["degree", "pagerank", "random"],
                    ratios=[0.4, 0.4, 0.2],
                    num_anchors=500,
                ),
                searcher="ScipySparse",
            ),
            dict(),  # empty dict for the RelationTokenizer - it doesn't need any kwargs
        ],
        embedding_dim=64,
        interaction="rotate",
        relation_initializer="init_phases",
        relation_constrainer="complex_normalize",
        entity_initializer="xavier_uniform_",
        aggregation=DeepSet(hidden_dim=64),
    ),
)
result.plot()
"""

No random seed is specified. This may lead to non-reproducible results.
9/14505 (0.06%) do not have any anchor.


sampling:   0%|          | 0.00/14.5k [00:00<?, ?it/s]

Training epochs on cpu:   0%|          | 0/4 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/2126 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [60]:
tensor=torch.tensor([1.]) # dimension 
print(tensor)
trainedM =own_NodePiece
print(trainedM)
moduleList=trainedM.entity_representations
#print(moduleList[0])
NodePieceRepresentation = moduleList[0]
x =NodePieceRepresentation._plain_forward()
print(x.size())
#print(NodePieceRepresentation.aggregation.forward(tensor))

tensor([1.])
NodePiece(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): RotatEInteraction()
  (entity_representations): ModuleList(
    (0): NodePieceRepresentation(
      aggregation=DeepSet(
        (encoder): Sequential(
          (0): Linear(in_features=64, out_features=64, bias=True)
          (1): ReLU()
          (2): Linear(in_features=64, out_features=64, bias=True)
          (3): ReLU()
          (4): Linear(in_features=64, out_features=64, bias=True)
        )
        (decoder): Sequential(
          (0): Linear(in_features=64, out_features=64, bias=True)
          (1): ReLU()
          (2): Linear(in_features=64, out_features=64, bias=True)
          (3): ReLU()
          (4): Linear(in_features=64, out_features=64, bias=True)
        )
      ), 
      (token_representations): ModuleList(
        (0): TokenizationRepresentation(
          max_id=14505,
          num_tokens=20,
          vocabulary_size=501,
          (vocabulary): Embedding