In [14]:
! pip install --upgrade pip
! python -c "import pykeen" || pip install git+https://github.com/pykeen/pykeen.git



In [15]:
import torch
import pykeen
import pandas as pd
from pykeen import predict
from pykeen.datasets import Nations
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

In [16]:
from rdflib import Graph

g = Graph()
g.parse("B2_research_publications_abox.ttl", format="ttl")
with open("B2_research_publications_abox.tsv", "w", encoding="utf-8") as f:
    f.write("subject\tpredicate\tobject\n")  # Header row
    for subj, pred, obj in g:
        f.write(f"{subj}\t{pred}\t{obj}\n")


In [17]:
file_path = 'B2_research_publications_abox.tsv'
tf = TriplesFactory.from_path(file_path)
training, testing = tf.split()
device = "cuda" if torch.cuda.is_available() else "cpu"

result = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs=dict(
        embedding_dim=128,
    ),
    training_kwargs=dict(
        num_epochs=20
    ),
    optimizer_kwargs=dict(
        lr=0.01,
    ),
    negative_sampler_kwargs=dict(
        num_negs_per_pos=1,
    ),
    random_seed=2025,
    device = device
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [27915, 12477]
INFO:pykeen.pipeline.api:Using device: cpu
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cpu:   0%|          | 0/20 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/195 [00:00<?, ?batch/s]



Evaluating on cpu:   0%|          | 0.00/12.5k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 76.50s seconds


In [18]:
evaluation = result.metric_results.to_dict()
evaluation['both']['realistic']

{'standard_deviation': 5326.91552734375,
 'adjusted_inverse_harmonic_mean_rank': 0.054702501270861806,
 'median_rank': 1149.0,
 'adjusted_arithmetic_mean_rank_index': 0.766989898347425,
 'z_arithmetic_mean_rank': 209.29264369724882,
 'median_absolute_deviation': 1687.201324659375,
 'z_geometric_mean_rank': 150.15800581909184,
 'adjusted_arithmetic_mean_rank': 0.23306387193856312,
 'inverse_geometric_mean_rank': 0.0018918965943157673,
 'arithmetic_mean_rank': 3324.46875,
 'variance': 28376030.0,
 'harmonic_mean_rank': 18.160871211575362,
 'inverse_median_rank': 0.0008703219937160611,
 'adjusted_geometric_mean_rank_index': 0.9495903207284776,
 'inverse_harmonic_mean_rank': 0.05506343767046929,
 'count': 24954.0,
 'inverse_arithmetic_mean_rank': 0.0003007999330293387,
 'z_inverse_harmonic_mean_rank': 1135.5436723651328,
 'geometric_mean_rank': 528.5701293945312,
 'hits_at_1': 0.007012903742886912,
 'hits_at_3': 0.08171034703855093,
 'hits_at_5': 0.10459245010819909,
 'hits_at_10': 0.13308

In [19]:
evaluation = result.metric_results.to_dict()
evaluation['both']['realistic']

{'standard_deviation': 5326.91552734375,
 'adjusted_inverse_harmonic_mean_rank': 0.054702501270861806,
 'median_rank': 1149.0,
 'adjusted_arithmetic_mean_rank_index': 0.766989898347425,
 'z_arithmetic_mean_rank': 209.29264369724882,
 'median_absolute_deviation': 1687.201324659375,
 'z_geometric_mean_rank': 150.15800581909184,
 'adjusted_arithmetic_mean_rank': 0.23306387193856312,
 'inverse_geometric_mean_rank': 0.0018918965943157673,
 'arithmetic_mean_rank': 3324.46875,
 'variance': 28376030.0,
 'harmonic_mean_rank': 18.160871211575362,
 'inverse_median_rank': 0.0008703219937160611,
 'adjusted_geometric_mean_rank_index': 0.9495903207284776,
 'inverse_harmonic_mean_rank': 0.05506343767046929,
 'count': 24954.0,
 'inverse_arithmetic_mean_rank': 0.0003007999330293387,
 'z_inverse_harmonic_mean_rank': 1135.5436723651328,
 'geometric_mean_rank': 528.5701293945312,
 'hits_at_1': 0.007012903742886912,
 'hits_at_3': 0.08171034703855093,
 'hits_at_5': 0.10459245010819909,
 'hits_at_10': 0.13308

In [20]:
result.training.relation_to_id

{'': 0,
 'http://example.org/research#cites': 1,
 'http://example.org/research#correspondingAuthor': 2,
 'http://example.org/research#endDate': 3,
 'http://example.org/research#hasEdition': 4,
 'http://example.org/research#hasKeyword': 5,
 'http://example.org/research#hasReview': 6,
 'http://example.org/research#heldDuring': 7,
 'http://example.org/research#heldInCity': 8,
 'http://example.org/research#publishedIn': 9,
 'http://example.org/research#reviewer': 10,
 'http://example.org/research#reviewsPaper': 11,
 'http://example.org/research#startDate': 12,
 'http://example.org/research#title': 13,
 'http://example.org/research#writes': 14,
 'http://example.org/research#year': 15,
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type': 16,
 'predicate': 17}

In [21]:
result.training.entity_to_id

{'': 0,
 ' POSTCARDIOTOMY SYNDROME , ANTICOAGULANTS, AND HÆMOPERICARDIUM': 1,
 '$B_s \\rightarrow D_s^*$ Form Factors for the full $q^2$ range from Lattice QCD': 2,
 "'. L. Nee": 3,
 "'. Sallyguttormsen": 4,
 "'市客厅'的感悟--上海人民广场评析": 5,
 '(Anti-)de Sitter Black Hole Entropy and Generalized Uncertainty Principle': 6,
 '-asymmetries based on an ': 7,
 '-flavor analysis': 8,
 '02社-27-口-20 中学校における武道必修化によって期待される教育効果 : 教員の立場から(02.体育社会学,一般研究発表抄録)': 9,
 '1,1′-Binaphthol annulated perylene diimides: Aggregation-induced emission enhancement and chirality inversion': 10,
 '1247 The Doppler Velocimetry in Twin Pregnancy Complicated by Intrauterine Growth Restriction is Predictive of Neonatal Outcome|[quest]|': 11,
 '124P Phase II study of neoadjuvant concurrent chemo-immuno-radiation therapy followed by surgery and adjuvant immunotherapy for resectable stage IIIA-B N2 non-small cell lung cancer: SQUAT trial (WJOG 12119L)': 12,
 '1542P First real-world outcome data of SCLC in Germany: Data from the Cl

In [22]:

model_name = "KGE_B2_research_publications_model1"
result.save_to_directory(model_name)


INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=29570, num_relations=18, create_inverse_triples=False, num_triples=49904, path="/Users/ceciliaperez/Documents/UPC-MD/Semestre 4/SDM/P2_sdm/sdm_knowledge_graphs/B2_research_publications_abox.tsv") to file:///Users/ceciliaperez/Documents/UPC-MD/Semestre%204/SDM/P2_sdm/sdm_knowledge_graphs/KGE_B2_research_publications_model1/training_triples
INFO:pykeen.pipeline.api:Saved to directory: /Users/ceciliaperez/Documents/UPC-MD/Semestre 4/SDM/P2_sdm/sdm_knowledge_graphs/KGE_B2_research_publications_model1
