In [1]:
import random
import torch
import numpy as np
from rdflib import Graph, URIRef, RDF
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline


## Loading the knowledge graph

In [2]:
reference_knowledge_graph = Graph()
reference_knowledge_graph.parse("data/reference-kg.nt")

<Graph identifier=N37520aeaf52a4808bd31672a6acdc2f1 (<class 'rdflib.graph.Graph'>)>

## Converting the graph to PyKeen triples

In [3]:
# reference_data_numpy = np.array(list(reference_knowledge_graph), dtype=str)
# reference_data_pykeen = TriplesFactory.from_labeled_triples(reference_data_numpy)
# reference_data_pykeen.create_inverse_triples = True

### to create a new model, execute the lines above instead of this one (requires a lot of RAM)
reference_data_pykeen = TriplesFactory.from_path_binary("trans-e-embeddings/training_triples")

  data = dict(torch.load(path.joinpath(cls.base_file_name)))


## Learning the entity/relation embeddings

In [4]:
# training, testing, validation = reference_data_pykeen.split([0.8, 0.1, 0.1])

# result = pipeline(
#     training=training,
#     testing=testing,
#     validation=validation,
#     model='TransE',
#     model_kwargs={
#         'embedding_dim': 50
#     },
#     epochs=25
# )

# result.save_to_directory("trans-e-embeddings")
# model = result.model

### to create a new model, execute the lines above instead of this one
model = torch.load("trans-e-embeddings/trained_model.pkl", map_location=torch.device('cpu'))

  model = torch.load("trans-e-embeddings/trained_model.pkl", map_location=torch.device('cpu'))


## Creating the training/testing data

In [12]:
training_graph = Graph()
training_graph.parse("data/fokg-sw-train-2024.nt")

training_triples = []
training_labels = []

for statement in training_graph.subjects(RDF.type, RDF.Statement):
    
    subject = training_graph.value(statement, RDF.subject)
    predicate = training_graph.value(statement, RDF.predicate)
    obj = training_graph.value(statement, RDF.object)

    subject_id = reference_data_pykeen.entity_to_id[subject.n3().strip("<>")]
    predicate_id = reference_data_pykeen.relation_to_id[predicate.n3().strip("<>")]
    obj_id = reference_data_pykeen.entity_to_id[obj.n3().strip("<>")]

    subject_tensor = model.entity_representations[0](torch.LongTensor([subject_id]))
    predicate_tensor = model.relation_representations[0](torch.LongTensor([predicate_id]))
    obj_tensor = model.entity_representations[0](torch.LongTensor([obj_id]))

    veracity_score = training_graph.value(statement, URIRef("http://swc2017.aksw.org/hasTruthValue"))
    
    training_triples.append(torch.cat((subject_tensor, predicate_tensor, obj_tensor), dim=1))
    training_labels.append(float(veracity_score))

# generate 20000 additional training triples (10k true triples, 10k false triples)
for s, p, o in random.choices(list(reference_knowledge_graph), k=10000):

    subject_id = reference_data_pykeen.entity_to_id[str(s)]
    predicate_id = reference_data_pykeen.relation_to_id[str(p)]
    obj_id = reference_data_pykeen.entity_to_id[str(o)]

    subject_tensor = model.entity_representations[0](torch.LongTensor([subject_id]))
    predicate_tensor = model.relation_representations[0](torch.LongTensor([predicate_id]))
    obj_tensor = model.entity_representations[0](torch.LongTensor([obj_id]))

    training_triples.append(torch.cat((subject_tensor, predicate_tensor, obj_tensor), dim=1))
    training_labels.append(1.0)

    while True:

        replacement_choice = random.choice(["subject", "predicate", "object"])

        if replacement_choice == "subject":
            s_prime = random.choice(list(reference_knowledge_graph.subjects(unique=True)))
            if (s_prime, p, o) not in reference_knowledge_graph:
                s_prime_id = reference_data_pykeen.entity_to_id[str(s_prime)]
                s_prime_tensor = model.entity_representations[0](torch.LongTensor([s_prime_id]))
                training_triples.append(torch.cat((s_prime_tensor, predicate_tensor, obj_tensor), dim=1))
                break
        elif replacement_choice == "predicate":
            p_prime = random.choice(list(reference_knowledge_graph.predicates(unique=True)))
            if (s, p_prime, o) not in reference_knowledge_graph:
                p_prime_id = reference_data_pykeen.relation_to_id[str(p_prime)]
                p_prime_tensor = model.relation_representations[0](torch.LongTensor([p_prime_id]))
                training_triples.append(torch.cat((subject_tensor, p_prime_tensor, obj_tensor), dim=1))
                break
        elif replacement_choice == "object":
            o_prime = random.choice(list(reference_knowledge_graph.objects(unique=True)))
            if (s, p, o_prime) not in reference_knowledge_graph:
                o_prime_id = reference_data_pykeen.entity_to_id[str(o_prime)]
                o_prime_tensor = model.entity_representations[0](torch.LongTensor([o_prime_id]))
                training_triples.append(torch.cat((subject_tensor, predicate_tensor, o_prime_tensor), dim=1))
                break

    training_labels.append(0.0)

split_index = int(len(training_triples) * 0.8)

X_train = torch.stack(training_triples[:split_index])
y_train = torch.Tensor(training_labels[:split_index])

X_test = torch.stack(training_triples[split_index:])
y_test = torch.Tensor(training_labels[split_index:])


KeyboardInterrupt: 

## Building the classifier

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

classifier = torch.nn.Sequential(
    torch.nn.Linear(in_features=150, out_features=300),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=300, out_features=100),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=100, out_features=1),
).to(device)

loss_function = torch.nn.BCEWithLogitsLoss()

def accuracy_function(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100 
    return acc

optimizer = torch.optim.Adam(params=classifier.parameters(), lr=0.01, weight_decay=0.001)

## Training the classifier

In [11]:
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

epochs = 50

for epoch in range(epochs):
    # set model to training mode
    classifier.train()

    # forward pass
    y_logits = classifier(X_train).squeeze() # squeeze to remove extra dimensions
    y_pred = torch.round(torch.sigmoid(y_logits)) # logits -> prediction probabilities -> prediction labels
  
    # calculate loss and accuracy
    loss = loss_function(y_logits, y_train)
    acc = accuracy_function(y_true=y_train, y_pred=y_pred)

    # set gradients to zero
    optimizer.zero_grad()

    # backpropagation
    loss.backward(retain_graph=True)

    # update weights
    optimizer.step()

    # set model to testing mode
    classifier.eval()
    with torch.inference_mode():

        # predict test labels
        test_logits = classifier(X_test).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))

        # calculate loss and accuracy
        test_loss = loss_function(test_logits, y_test)
        test_acc = accuracy_function(y_true=y_test, y_pred=test_pred)

    # print stats
    print(f"Epoch: {epoch} | Training Loss: {loss:.5f}, Training Accuracy: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Accuracy: {test_acc:.2f}%")


NameError: name 'X_train' is not defined

## Predicting test labels example

In [9]:
# set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# define classifier architecture
classifier = torch.nn.Sequential(
    torch.nn.Linear(in_features=150, out_features=300),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=300, out_features=100),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=100, out_features=1),
).to(device)

# load the saved model parameters
classifier.load_state_dict(torch.load('classifier/fact_checking_model.pt', map_location=torch.device('cpu')))

# load test graph
test_graph = Graph()
test_graph.parse("data/fokg-sw-test-2024.nt")

  classifier.load_state_dict(torch.load('fact_checking_model.pt', map_location=torch.device('cpu')))


<Graph identifier=N2238622ff8644a7daca6d2406858fe96 (<class 'rdflib.graph.Graph'>)>

In [10]:
test_triple_iris = []
test_triples = []
test_labels = []

# process test graph similar to training data
for statement in test_graph.subjects(RDF.type, RDF.Statement):

    subject = test_graph.value(statement, RDF.subject)
    predicate = test_graph.value(statement, RDF.predicate)
    obj = test_graph.value(statement, RDF.object)

    subject_id = reference_data_pykeen.entity_to_id[str(subject)]
    predicate_id = reference_data_pykeen.relation_to_id[str(predicate)]
    obj_id = reference_data_pykeen.entity_to_id[str(obj)]

    subject_tensor = model.entity_representations[0](torch.LongTensor([subject_id]))
    predicate_tensor = model.relation_representations[0](torch.LongTensor([predicate_id]))
    obj_tensor = model.entity_representations[0](torch.LongTensor([obj_id]))

    test_triple_iris.append(statement)
    test_triples.append(torch.cat((subject_tensor, predicate_tensor, obj_tensor), dim=1))

X_test = torch.stack(test_triples)

In [11]:
# set classifier to evaluation
classifier.eval()

test_logits = classifier(X_test).squeeze()
#test_pred = torch.round(torch.sigmoid(test_logits))
test_pred = torch.sigmoid(test_logits)

In [12]:
# save predictions as list
test_pred_as_list = [x.item() for x in list(test_pred)]

# write predictions to file
result_file = open('result.ttl', 'w')
for i in range(len(test_pred_as_list)):
    print(f"<{test_triple_iris[i]}> <http://swc2017.aksw.org/hasTruthValue> \"{test_pred_as_list[i]}\"^^<http://www.w3.org/2001/XMLSchema#double> .", file=result_file)
result_file.close()