In [None]:
import torch
from torch import nn
import numpy as np
import rdflib
from rdflib import URIRef
from rdflib import RDF
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

## Loading knowledge graph

In [None]:
# reference_knowledge_graph = rdflib.Graph()
# reference_knowledge_graph.parse("data/reference-kg.nt")

## Converting to pykeen format

In [None]:
# reference_data_numpy = np.array([(s, p, o) for s, p, o in reference_knowledge_graph])
# reference_data_pykeen = TriplesFactory.from_labeled_triples(reference_data_numpy)
# reference_data_pykeen.create_inverse_triples = True

In [None]:
# Reuse the triples factory stored under transe-embeddings/.
# To build a new model instead, skip this cell and run the two
# (commented-out) cells above.
reference_data_pykeen = TriplesFactory.from_path_binary(
    "transe-embeddings/training_triples"
)

## Learning the embeddings

In [None]:
# training, testing, validation = reference_data_pykeen.split([0.8, 0.1, 0.1])

# result = pipeline(
#     training=training,
#     testing=testing,
#     validation=validation,
#     model='TransE',
#     model_kwargs={
#         'embedding_dim': 50,
#     },
#     epochs=10
# )

In [None]:
# Load the previously trained model instead of retraining.
# To create a new model, skip this cell and run the (commented-out)
# pipeline cell above.
#
# SECURITY: torch.load unpickles arbitrary Python objects. Only load
# checkpoint files that you produced yourself or otherwise trust.
model = torch.load(
    "transe-embeddings/trained_model.pkl",
    # Keep the model on the CPU: the feature-extraction cell indexes its
    # embeddings with CPU LongTensors, and this also lets a checkpoint saved
    # on a GPU machine load on a CPU-only one.
    map_location="cpu",
    # The file holds a complete pickled model object (it is used directly as
    # a module below), not a bare state_dict, so full unpickling is required.
    # Recent PyTorch releases default to weights_only=True and would refuse it.
    weights_only=False,
)

## Creating the training/testing data

In [None]:
# Parse the labelled statements used to fit and evaluate the classifier.
train = rdflib.Graph()
train.parse("data/fokg-sw-train-2024.nt")

# Parsed here as well; not used further inside this cell.
test = rdflib.Graph()
test.parse("data/fokg-sw-test-2024.nt")

train_data = []
train_labels = []

# Loop invariants: the embedding modules and the label predicate.
entity_embeddings = model.entity_representations[0]
relation_embeddings = model.relation_representations[0]
truth_value_predicate = URIRef("http://swc2017.aksw.org/hasTruthValue")

# The embeddings are fixed input features for the classifier, so look them up
# without autograd tracking. Otherwise every feature tensor stays attached to
# the embedding parameters' graph, which wastes memory and forces the training
# loop to call backward(retain_graph=True).
# torch.no_grad() is used rather than torch.inference_mode() because inference
# tensors cannot be saved for backward by the classifier's layers.
with torch.no_grad():
    for statement in train.subjects(RDF.type, RDF.Statement):

        # Each reified statement carries its subject / predicate / object.
        subject = train.value(statement, RDF.subject)
        predicate = train.value(statement, RDF.predicate)
        obj = train.value(statement, RDF.object)

        # Map the IRIs (angle brackets stripped) to the ids of the triples factory.
        # A KeyError here means the term is missing from the reference graph.
        subject_id = reference_data_pykeen.entity_to_id[subject.n3().strip("<>")]
        predicate_id = reference_data_pykeen.relation_to_id[predicate.n3().strip("<>")]
        obj_id = reference_data_pykeen.entity_to_id[obj.n3().strip("<>")]

        subject_tensor = entity_embeddings(torch.LongTensor([subject_id]))
        predicate_tensor = relation_embeddings(torch.LongTensor([predicate_id]))
        obj_tensor = entity_embeddings(torch.LongTensor([obj_id]))

        veracity_score = train.value(statement, truth_value_predicate)

        # One feature row per statement: [subject | predicate | object].
        train_data.append(torch.cat((subject_tensor, predicate_tensor, obj_tensor), dim=1))
        train_labels.append(float(veracity_score))

# Hold out the last 20% of the statements (in graph iteration order, no
# shuffling) for evaluation.
# NOTE(review): rdflib's iteration order may differ between runs, so this split
# is presumably not reproducible -- confirm whether a seeded shuffle is wanted.
split_index = int(len(train_data) * 0.8)

X_train = torch.stack(train_data[:split_index])
y_train = torch.Tensor(train_labels[:split_index])

X_test = torch.stack(train_data[split_index:])
y_test = torch.Tensor(train_labels[split_index:])


## Building the classifier

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Feed-forward binary classifier: 150 inputs -> 300 -> 100 -> one raw logit.
# (150 presumably equals three concatenated 50-dimensional embeddings -- confirm
# against the embedding size of the loaded model.)
classifier = nn.Sequential(
    nn.Linear(150, 300),
    nn.ReLU(),
    nn.Linear(300, 100),
    nn.ReLU(),
    nn.Linear(100, 1),
)
classifier = classifier.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
loss_function = nn.BCEWithLogitsLoss()

def accuracy_function(y_true, y_pred):
    """Return the percentage of predictions that equal their target.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, comparable element-wise with
            ``y_true``.

    Returns:
        Accuracy as a float in ``[0, 100]``. Returns ``0.0`` when ``y_pred``
        contains no elements, instead of dividing by zero.
    """
    # numel() rather than len(): it also works for 0-d tensors (which is what
    # .squeeze() yields for a single sample) and counts every element that
    # torch.eq compares, not only the first dimension.
    total = y_pred.numel()
    if total == 0:
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / total) * 100

# Adam over every parameter of the classifier, with a learning rate of 0.05.
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.05)

In [None]:
# Move features and labels to the device selected for the classifier.
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

## Training the classifier

In [None]:
epochs = 50

# Train on features that are cut off from any upstream autograd graph (the
# embedding lookups that produced them). Without this, every backward pass would
# also traverse that graph, which is why retain_graph=True used to be required.
# detach() is a no-op if the features are already graph-free.
train_features = X_train.detach()

for epoch in range(epochs):
    # set model to training mode
    classifier.train()

    # forward pass; flatten() gives one logit per sample with shape (N,), and
    # unlike a bare squeeze() it keeps that shape for a single-sample batch
    y_logits = classifier(train_features).flatten()
    # logits -> prediction probabilities -> prediction labels
    y_pred = torch.round(torch.sigmoid(y_logits))

    # calculate loss and accuracy (measured before this epoch's weight update)
    loss = loss_function(y_logits, y_train)
    acc = accuracy_function(y_true=y_train, y_pred=y_pred)

    # set gradients to zero
    optimizer.zero_grad()

    # backpropagation through the classifier only
    loss.backward()

    # update weights
    optimizer.step()

    # set model to testing mode
    classifier.eval()
    with torch.inference_mode():
        # predict labels
        test_logits = classifier(X_test).flatten()
        test_pred = torch.round(torch.sigmoid(test_logits))
        # calculate loss and accuracy
        test_loss = loss_function(test_logits, y_test)
        test_acc = accuracy_function(y_true=y_test, y_pred=test_pred)

    # print stats
    print(f"Epoch: {epoch} | Training Loss: {loss:.5f}, Training Accuracy: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Accuracy: {test_acc:.2f}%")
