In [1]:
!pip install pykeen

Collecting pykeen
  Downloading pykeen-1.11.1-py3-none-any.whl.metadata (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click_default_group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting more_click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.7.0-py3-none-any.whl.metadata (17 kB)
Collecting docdata>=0.0.5 (from pykeen)
  Downloading docdata-0.0.5-py3-none-any.whl.metadata (13 kB)
Collecting class_resolver>=0.6.0 (from 

In [2]:
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from sklearn.model_selection import train_test_split

INFO:pykeen.utils:Using opt_einsum


In [4]:
# Load and parse triplets
triplets = pd.read_csv("relationships.csv")['triplets'][:1000]
triplets = [tuple(triplet.split(" ; ")) for triplet in triplets]
triples_df = pd.DataFrame(triplets, columns=["head", "relation", "tail"])

# Train-test split
train_df, test_df = train_test_split(triples_df, test_size=0.2, random_state=42)

# Create TriplesFactory
training_triples_factory = TriplesFactory.from_labeled_triples(
    np.array(train_df.values.tolist(), dtype=str)
)
testing_triples_factory = TriplesFactory.from_labeled_triples(
    np.array(test_df.values.tolist(), dtype=str),
    entity_to_id=training_triples_factory.entity_to_id,
    relation_to_id=training_triples_factory.relation_to_id
)

# Train the model
result = pipeline(
    model='TransE',
    training=training_triples_factory,
    testing=testing_triples_factory,
    training_loop='slcwa',
    model_kwargs=dict(embedding_dim=64),
    training_kwargs=dict(num_epochs=10),
)

# Access trained model
model = result.model

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cuda:0:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/4.00 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.00 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.28s seconds


In [5]:
import torch

# Get mappings
entity_to_id = training_triples_factory.entity_to_id
relation_to_id = training_triples_factory.relation_to_id

# Define triplet to test
triplet_to_test = ("Jack Dempsey", "place of death", "New York City")

try:
    # Convert to indices
    head_index = entity_to_id[triplet_to_test[0]]
    relation_index = relation_to_id[triplet_to_test[1]]
    tail_index = entity_to_id[triplet_to_test[2]]

    # Create tensor
    triplet_tensor = torch.tensor([[head_index, relation_index, tail_index]], dtype=torch.long)

    # Score the triplet
    score = model.score_hrt(triplet_tensor)
    print("Triplet plausibility score:", score.item())

except KeyError as e:
    print(f"Error: The entity or relation '{e}' in your triplet was not found in the training data.")
    print("Please ensure that all entities and relations in your test triplet are present in the relationships.csv file used for training.")


Error: The entity or relation ''Jack Dempsey'' in your triplet was not found in the training data.
Please ensure that all entities and relations in your test triplet are present in the relationships.csv file used for training.


In [11]:
import pandas as pd

# Load your CSV
df = pd.read_csv('relationships.csv')

# Sample entry in df['triplets']: (Jack Dempsey, place of death, New York City)
def parse_triplet(triplet_str):
    # Remove parentheses and extra spaces
    triplet_str = triplet_str.strip("()").strip()

    # Now split by comma, but assume exactly 3 parts
    parts = triplet_str.split(",", 2)  # maxsplit=2 to preserve commas in final part

    # Strip whitespace from each part
    parts = [p.strip().lower() for p in parts]

    if len(parts) != 3:
        raise ValueError(f"Triplet parsing failed: {triplet_str}")

    return tuple(parts)

# Apply the parser
df[['head', 'relation', 'tail']] = df['triplets'].apply(parse_triplet).apply(pd.Series)

# Preview
print(df[['head', 'relation', 'tail']].head())

ValueError: Triplet parsing failed: Jan Ingenhousz ; award received ; Fellow of the Royal Society