In [None]:
import pandas as pd
import torch
from torch import nn
import numpy as np
import rdflib
from rdflib import URIRef
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

## Loading knowledge graph

In [None]:
# Create both graph containers first, then fill each one from its N-Triples file.
reference_knowledge_graph, reference_knowledge_graph_trimmed = rdflib.Graph(), rdflib.Graph()

# Full reference knowledge graph.
reference_knowledge_graph.parse("data/reference-kg.nt")
# Second graph read from the "-1000" file.
reference_knowledge_graph_trimmed.parse("data/reference-kg-1000.nt")

## Converting to pykeen format

In [None]:
# Materialise every (subject, predicate, object) triple of the reference graph
# as a row of plain strings; str() turns each rdflib term into its label.
reference_data_numpy = np.array(
    [(str(s), str(p), str(o)) for s, p, o in reference_knowledge_graph],
    dtype=str,
)
if reference_data_numpy.size == 0:
    # An empty graph would yield a 1-D array that the factory cannot interpret.
    raise ValueError("reference knowledge graph contains no triples")

# Request inverse triples through the factory's keyword argument instead of
# mutating the attribute after construction, so the factory is built
# consistently from the start.
reference_data_pykeen = TriplesFactory.from_labeled_triples(
    reference_data_numpy,
    create_inverse_triples=True,
)

## Learning the embeddings

In [None]:
# Fixed seed so the split and the embedding training are reproducible
# across "Restart Kernel -> Run All".
RANDOM_SEED = 42

# 80 % training, 10 % testing, 10 % validation.
training, testing, validation = reference_data_pykeen.split(
    [0.8, 0.1, 0.1],
    random_state=RANDOM_SEED,
)

# Train a TransE model with 150-dimensional embeddings on the split above.
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='TransE',
    model_kwargs={
        'embedding_dim': 150,
    },
    random_seed=RANDOM_SEED,
)

## Creating the training/testing data

In [None]:
train = rdflib.Graph()
train.parse("data/fokg-sw-train-2024.nt")

test = rdflib.Graph()
test.parse("data/fokg-sw-test-2024.nt")

# Vocabulary of the reified statements and their veracity annotation.
RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
RDF_TYPE = URIRef(RDF_NS + "type")
RDF_STATEMENT = URIRef(RDF_NS + "Statement")
RDF_SUBJECT = URIRef(RDF_NS + "subject")
RDF_PREDICATE = URIRef(RDF_NS + "predicate")
RDF_OBJECT = URIRef(RDF_NS + "object")
HAS_TRUTH_VALUE = URIRef("http://swc2017.aksw.org/hasTruthValue")

entity_to_id = reference_data_pykeen.entity_to_id
relation_to_id = reference_data_pykeen.relation_to_id

subject_ids, predicate_ids, object_ids, label_values = [], [], [], []
skipped_statements = 0

for statement in train.subjects(RDF_TYPE, RDF_STATEMENT):
    subject = train.value(statement, RDF_SUBJECT)
    predicate = train.value(statement, RDF_PREDICATE)
    obj = train.value(statement, RDF_OBJECT)
    veracity_score = train.value(statement, HAS_TRUTH_VALUE)

    # Compare with None instead of relying on truthiness: an rdflib literal
    # whose value is 0.0 is falsy, so a plain `if veracity_score` would
    # silently discard every statement labelled as false.
    if any(term is None for term in (subject, predicate, obj, veracity_score)):
        skipped_statements += 1
        continue

    try:
        subject_id = entity_to_id[str(subject)]
        # Predicates live in the relation vocabulary, not the entity one.
        predicate_id = relation_to_id[str(predicate)]
        obj_id = entity_to_id[str(obj)]
    except KeyError:
        # The reference graph has no embedding for one of the terms.
        skipped_statements += 1
        continue

    subject_ids.append(subject_id)
    predicate_ids.append(predicate_id)
    object_ids.append(obj_id)
    label_values.append(float(veracity_score))

if skipped_statements:
    print(f"Skipped {skipped_statements} statements with missing parts or unknown terms")
if not label_values:
    raise ValueError("no usable training statements were found")

entity_embeddings = result.model.entity_representations[0]
relation_embeddings = result.model.relation_representations[0]
# Index tensors must live on the same device as the trained model.
embedding_device = next(result.model.parameters()).device

# NOTE(review): when the model was trained with inverse triples, PyKEEN's
# default relation inverter is expected to store the original relation r at
# internal index 2 * r -- confirm against the installed PyKEEN version.
relation_id_scale = 2 if getattr(result.model, "use_inverse_triples", False) else 1

result.model.eval()
with torch.no_grad():  # the embeddings are fixed features; no gradients needed
    subject_vectors = entity_embeddings(
        torch.tensor(subject_ids, dtype=torch.long, device=embedding_device)
    )
    predicate_vectors = relation_embeddings(
        torch.tensor(predicate_ids, dtype=torch.long, device=embedding_device) * relation_id_scale
    )
    object_vectors = entity_embeddings(
        torch.tensor(object_ids, dtype=torch.long, device=embedding_device)
    )

# One row per statement: [subject | predicate | object] embeddings side by side.
train_data = torch.cat([subject_vectors, predicate_vectors, object_vectors], dim=-1).cpu()
train_labels = torch.tensor(label_values, dtype=torch.float32)

# Hold out the last 20 % of the labelled statements for evaluation.
split_index = int(len(train_data) * 0.8)

X_train = train_data[:split_index]
y_train = train_labels[:split_index]

X_test = train_data[split_index:]
y_test = train_labels[split_index:]


## Building the classifier

In [None]:
# Each sample is the concatenation of three 150-dimensional embeddings
# (subject, predicate, object), so the first layer must accept 3 * 150 inputs.
TRIPLE_FEATURE_DIM = 3 * 150

# Binary classifier producing one raw logit per statement. The ReLU layers
# matter: without a non-linearity the stacked Linear layers would collapse
# into a single linear map.
classifier = nn.Sequential(
    nn.Linear(in_features=TRIPLE_FEATURE_DIM, out_features=300),
    nn.ReLU(),
    nn.Linear(in_features=300, out_features=100),
    nn.ReLU(),
    nn.Linear(in_features=100, out_features=1),
)

# Expects raw logits (applies the sigmoid internally).
loss_function = nn.BCEWithLogitsLoss()

def accuracy_fn(y_true, y_pred):
    """Return the percentage of positions where `y_pred` equals `y_true`.

    Both arguments are tensors compared element-wise; the result is a
    Python float between 0 and 100.
    """
    n_matching = int(torch.eq(y_true, y_pred).sum())
    fraction = n_matching / len(y_pred)
    return fraction * 100

# Adam over all parameters of the classifier, with a learning rate of 0.1.
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.1)

## Training the classifier

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
epochs = 100


def _as_float_tensor(values, target_device):
    """Return `values` as a float32 tensor on `target_device`.

    Accepts either a tensor or a sequence of tensors/numbers; a sequence is
    stacked along a new leading dimension.
    """
    if not torch.is_tensor(values):
        values = torch.stack([torch.as_tensor(value) for value in values])
    return values.to(device=target_device, dtype=torch.float32)


# Model and data must live on the same device.
classifier.to(device)

# Fit on the training split only; the held-out split is used purely for
# evaluation so the reported test metrics are not contaminated.
X_train, y_train = _as_float_tensor(X_train, device), _as_float_tensor(y_train, device)
X_test, y_test = _as_float_tensor(X_test, device), _as_float_tensor(y_test, device)

# Build training and evaluation loop
for epoch in range(epochs):
    ### Training
    classifier.train()

    # 1. Forward pass (model outputs raw logits). squeeze(-1) drops only the
    #    trailing size-1 dimension, so a batch of one sample keeps its batch axis.
    y_logits = classifier(X_train).squeeze(-1)
    y_pred = torch.round(torch.sigmoid(y_logits))  # logits -> probabilities -> labels

    # 2. Calculate loss/accuracy (BCEWithLogitsLoss works on raw logits)
    loss = loss_function(y_logits, y_train)
    acc = accuracy_fn(y_true=y_train,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    classifier.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = classifier(X_test).squeeze(-1)
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Calculate loss/accuracy (the loss function, not the model itself)
        test_loss = loss_function(test_logits, y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")