In [5]:
! pip install --upgrade pip
! python -c "import pykeen" || pip install git+https://github.com/pykeen/pykeen.git



In [6]:
import torch
import pykeen
from pathlib import Path
import pandas as pd
from pykeen import predict
from pykeen.datasets import Nations
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

In [None]:
from rdflib import Graph, Namespace

g = Graph()
g.parse("B/B2_research_publications_abox.ttl", format="turtle")

NS = Namespace("http://example.org/research#")

predicates_to_keep = {
    NS.writes,
    NS.publishedIn,
    NS.hasKeyword,
    NS.year,
    NS.cites
}

with open("../C/C_filtered_data.tsv", "w") as f:
    for s, p, o in g:
        if p in predicates_to_keep:
            f.write(f"{s}\t{p}\t{o}\n")


In [None]:
#from rdflib import Graph
#g = Graph()
#g.parse("../B/B2_research_publications_abox.ttl", format="ttl")
#with open("../B/B2_research_publications_abox.tsv", "w", encoding="utf-8") as f:
#    f.write("subject\tpredicate\tobject\n")  # Header row
#    for subj, pred, obj in g:
#        f.write(f"{subj}\t{pred}\t{obj}\n")


In [19]:
file_path = '../C/C_filtered_data.tsv'
tf = TriplesFactory.from_path(file_path)
training, testing = tf.split(ratios=[0.8, 0.2])
device = "cuda" if torch.cuda.is_available() else "cpu"

result = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs=dict(
        embedding_dim=128,
    ),
    training_kwargs=dict(
        num_epochs=20
    ),
    optimizer_kwargs=dict(
        lr=0.01,
    ),
    negative_sampler_kwargs=dict(
        num_negs_per_pos=1,
    ),
    random_seed=2025,
    device = device
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [5190, 3220]
INFO:pykeen.pipeline.api:Using device: cpu
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cpu:   0%|          | 0/20 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0.00/51.0 [00:00<?, ?batch/s]



Evaluating on cpu:   0%|          | 0.00/3.22k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 6.69s seconds


In [20]:
evaluation = result.metric_results.to_dict()
evaluation['both']['realistic']

{'variance': 4862940.5,
 'harmonic_mean_rank': 54.23564680245406,
 'inverse_median_rank': 0.001254705130122602,
 'inverse_harmonic_mean_rank': 0.018438057973980904,
 'inverse_arithmetic_mean_rank': 0.0005929255275987089,
 'z_inverse_harmonic_mean_rank': 107.50707803293896,
 'geometric_mean_rank': 481.8456115722656,
 'adjusted_arithmetic_mean_rank_index': 0.6510328387048618,
 'count': 6440.0,
 'standard_deviation': 2205.20751953125,
 'adjusted_inverse_harmonic_mean_rank': 0.01744606289989593,
 'median_rank': 797.0,
 'z_arithmetic_mean_rank': 90.47696527498167,
 'adjusted_arithmetic_mean_rank': 0.3491019195164068,
 'median_absolute_deviation': 1107.5038572236847,
 'z_geometric_mean_rank': 69.56442768821654,
 'inverse_geometric_mean_rank': 0.0020753536373376846,
 'arithmetic_mean_rank': 1686.552490234375,
 'adjusted_geometric_mean_rank_index': 0.8647514730661274,
 'hits_at_1': 0.0027950310559006213,
 'hits_at_3': 0.011180124223602485,
 'hits_at_5': 0.021739130434782608,
 'hits_at_10': 0.0

In [21]:
evaluation = result.metric_results.to_dict()
evaluation['both']['realistic']

{'variance': 4862940.5,
 'harmonic_mean_rank': 54.23564680245406,
 'inverse_median_rank': 0.001254705130122602,
 'inverse_harmonic_mean_rank': 0.018438057973980904,
 'inverse_arithmetic_mean_rank': 0.0005929255275987089,
 'z_inverse_harmonic_mean_rank': 107.50707803293896,
 'geometric_mean_rank': 481.8456115722656,
 'adjusted_arithmetic_mean_rank_index': 0.6510328387048618,
 'count': 6440.0,
 'standard_deviation': 2205.20751953125,
 'adjusted_inverse_harmonic_mean_rank': 0.01744606289989593,
 'median_rank': 797.0,
 'z_arithmetic_mean_rank': 90.47696527498167,
 'adjusted_arithmetic_mean_rank': 0.3491019195164068,
 'median_absolute_deviation': 1107.5038572236847,
 'z_geometric_mean_rank': 69.56442768821654,
 'inverse_geometric_mean_rank': 0.0020753536373376846,
 'arithmetic_mean_rank': 1686.552490234375,
 'adjusted_geometric_mean_rank_index': 0.8647514730661274,
 'hits_at_1': 0.0027950310559006213,
 'hits_at_3': 0.011180124223602485,
 'hits_at_5': 0.021739130434782608,
 'hits_at_10': 0.0

In [22]:
result.training.relation_to_id

{'http://example.org/research#cites': 0,
 'http://example.org/research#hasKeyword': 1,
 'http://example.org/research#publishedIn': 2,
 'http://example.org/research#writes': 3,
 'http://example.org/research#year': 4}

In [23]:
result.training.entity_to_id

{'1887': 0,
 '1890': 1,
 '1921': 2,
 '1946': 3,
 '1949': 4,
 '1951': 5,
 '1952': 6,
 '1953': 7,
 '1955': 8,
 '1956': 9,
 '1957': 10,
 '1959': 11,
 '1960': 12,
 '1961': 13,
 '1962': 14,
 '1964': 15,
 '1965': 16,
 '1966': 17,
 '1967': 18,
 '1968': 19,
 '1969': 20,
 '1970': 21,
 '1971': 22,
 '1972': 23,
 '1973': 24,
 '1974': 25,
 '1975': 26,
 '1976': 27,
 '1978': 28,
 '1979': 29,
 '1980': 30,
 '1981': 31,
 '1982': 32,
 '1983': 33,
 '1984': 34,
 '1985': 35,
 '1986': 36,
 '1987': 37,
 '1988': 38,
 '1989': 39,
 '1990': 40,
 '1991': 41,
 '1992': 42,
 '1993': 43,
 '1994': 44,
 '1995': 45,
 '1996': 46,
 '1997': 47,
 '1998': 48,
 '1999': 49,
 '2000': 50,
 '2001': 51,
 '2002': 52,
 '2003': 53,
 '2004': 54,
 '2005': 55,
 '2006': 56,
 '2007': 57,
 '2008': 58,
 '2009': 59,
 '2010': 60,
 '2011': 61,
 '2012': 62,
 '2013': 63,
 '2014': 64,
 '2015': 65,
 '2016': 66,
 '2017': 67,
 '2018': 68,
 '2019': 69,
 '2020': 70,
 '2021': 71,
 '2022': 72,
 '2023': 73,
 '2024': 74,
 '2025': 75,
 'http://example.org/r

In [24]:

model_name = "KGE_B2_research_publications_model1"
result.save_to_directory(model_name)


INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=9703, num_relations=5, create_inverse_triples=False, num_triples=12878, path="/Users/ceciliaperez/Documents/UPC-MD/Semestre 4/SDM/P2_sdm/sdm_knowledge_graphs/C/C_filtered_data.tsv") to file:///Users/ceciliaperez/Documents/UPC-MD/Semestre%204/SDM/P2_sdm/sdm_knowledge_graphs/C/KGE_B2_research_publications_model1/training_triples
INFO:pykeen.pipeline.api:Saved to directory: /Users/ceciliaperez/Documents/UPC-MD/Semestre 4/SDM/P2_sdm/sdm_knowledge_graphs/C/KGE_B2_research_publications_model1
