# Libraries

In [1]:
import pandas as pd
import numpy as np
import gzip
import networkx as nx
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Show the installed PyTorch version string.
print(torch.__version__)

1.13.1


In [3]:
# Show the installed PyTorch Geometric version string.
print(torch_geometric.__version__)

2.4.0


# ORE

In [4]:
# One entry per ORE split: the dataset directory plus the train/test file
# names that are appended to it when loading.
ORE_dbs = [
    dict(path=directory, train_file=train_name, test_file=test_name)
    for directory, train_name, test_name in (
        ('./datasets/ORE/ORE1/', '_train_ORE1', '_test_ORE1'),
        ('./datasets/ORE/ORE2/', '_train_ORE2', '_test_ORE2'),
        ('./datasets/ORE/ORE3/', '_train_ORE3', '_test_ORE3'),
    )
]

In [5]:
# For every ORE split: load the train and test triples, build one graph from
# each, then train and evaluate a GraphSAGE model followed by a GAT model.
for db_ in ORE_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']

    print('Running...', train_file, test_file)

    # Training split: keep only rows whose 'p' column is 'ClassAssertion'.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, '
          f'# Nodes: {g_train.number_of_nodes()}, '
          f'# Edges: {g_train.number_of_edges()}')

    # Test split: same filter, separate graph.
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, '
          f'# Nodes: {g_test.number_of_nodes()}, '
          f'# Edges: {g_test.number_of_edges()}')
    print()

    # A new GNN instance is created for each architecture.
    for architecture in ('GraphSAGE', 'GAT'):
        model = GNN()
        model._train(g_train, architecture)
        model._eval(g_test)
        print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 53048, # Nodes: 8060, # Edges: 53048
# Test - Triplets: 15157, # Nodes: 5847, # Edges: 15157

GraphSAGE:
Epoch: 0, Loss: 71.6715
Epoch: 50, Loss: 0.1873
Epoch: 100, Loss: 0.2405
Epoch: 150, Loss: 0.1640
Epoch: 200, Loss: 0.1553
Precision: 0.8063
Recall: 0.6041
F1-Score: 0.6907

GAT:
Epoch: 0, Loss: 651.8486
Epoch: 50, Loss: 0.1575
Epoch: 100, Loss: 0.1262
Epoch: 150, Loss: 0.1170
Epoch: 200, Loss: 0.1125
Precision: 1.0000
Recall: 0.0020
F1-Score: 0.0040

Running... _train_ORE2 _test_ORE2
# Train - Triplets: 53081, # Nodes: 8064, # Edges: 53081
# Test - Triplets: 15166, # Nodes: 5845, # Edges: 15166

GraphSAGE:
Epoch: 0, Loss: 68.9358
Epoch: 50, Loss: 0.2077
Epoch: 100, Loss: 0.1614
Epoch: 150, Loss: 0.1456
Epoch: 200, Loss: 0.1400
Precision: 0.8671
Recall: 0.2160
F1-Score: 0.3459

GAT:
Epoch: 0, Loss: 562.7711
Epoch: 50, Loss: 0.1939
Epoch: 100, Loss: 0.1491
Epoch: 150, Loss: 0.1309
Epoch: 200, Loss: 0.1223
Precision: 0.7405
Recall:

# OWL2Bench

In [6]:
# One entry per OWL2Bench split: the dataset directory plus the train/test
# file names that are appended to it when loading.
OWL2Bench_dbs = [
    dict(path=directory, train_file=train_name, test_file=test_name)
    for directory, train_name, test_name in (
        ('./datasets/OWL2Bench/OWL2Bench1/', '_train_OWL2Bench1', '_test_OWL2Bench1'),
        ('./datasets/OWL2Bench/OWL2Bench2/', '_train_OWL2Bench2', '_test_OWL2Bench2'),
    )
]

In [7]:
# For every OWL2Bench split: load the train and test triples, build one graph
# from each, then train and evaluate a GraphSAGE model followed by a GAT model.
for db_ in OWL2Bench_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']

    print('Running...', train_file, test_file)

    # Training split: keep only rows whose 'p' column is 'ClassAssertion'.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, '
          f'# Nodes: {g_train.number_of_nodes()}, '
          f'# Edges: {g_train.number_of_edges()}')

    # Test split: same filter, separate graph.
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, '
          f'# Nodes: {g_test.number_of_nodes()}, '
          f'# Edges: {g_test.number_of_edges()}')
    print()

    # A new GNN instance is created for each architecture.
    for architecture in ('GraphSAGE', 'GAT'):
        model = GNN()
        model._train(g_train, architecture)
        model._eval(g_test)
        print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 7989, # Nodes: 3633, # Edges: 7989
# Test - Triplets: 2283, # Nodes: 1836, # Edges: 2283

GraphSAGE:
Epoch: 0, Loss: 79.2252
Epoch: 50, Loss: 0.0703
Epoch: 100, Loss: 0.0344
Epoch: 150, Loss: 0.0374
Epoch: 200, Loss: 0.0259
Precision: 0.9210
Recall: 0.7963
F1-Score: 0.8541

GAT:
Epoch: 0, Loss: 625.2977
Epoch: 50, Loss: 0.0887
Epoch: 100, Loss: 0.0412
Epoch: 150, Loss: 0.0321
Epoch: 200, Loss: 0.0283
Precision: 1.0000
Recall: 0.0285
F1-Score: 0.0554

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 15526, # Nodes: 7080, # Edges: 15526
# Test - Triplets: 4437, # Nodes: 3612, # Edges: 4437

GraphSAGE:
Epoch: 0, Loss: 22.6412
Epoch: 50, Loss: 0.1954
Epoch: 100, Loss: 0.0337
Epoch: 150, Loss: 0.0122
Epoch: 200, Loss: 0.0099
Precision: 0.9863
Recall: 0.0971
F1-Score: 0.1769

GAT:
Epoch: 0, Loss: 115.0795
Epoch: 50, Loss: 0.0586
Epoch: 100, Loss: 0.0301
Epoch: 150, Loss: 0.0189
Epoch: 200, Loss: 0.0149
Precisi

# CaLiGraph

In [8]:
# One entry per CaLiGraph split: the dataset directory plus the train/test
# N-Triples file names that are appended to it when loading.
CLG_dbs = [
    dict(path=directory, train_file=train_name, test_file=test_name)
    for directory, train_name, test_name in (
        ('./datasets/clg/clg_10e4/', 'clg_10e4-train.nt', 'clg_10e4-test.nt'),
        ('./datasets/clg/clg_10e5/', 'clg_10e5-train.nt', 'clg_10e5-test.nt'),
    )
]

In [9]:
# For every CaLiGraph split: load the train and test triples, build one graph
# from each, then train and evaluate a GraphSAGE model followed by a GAT model.
for db_ in CLG_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']

    print('Running...', train_file, test_file)

    # Only rows whose 'p' column equals this rdf:type predicate are kept.
    rdf_type_predicate = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'

    # Training split.
    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == rdf_type_predicate]
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, '
          f'# Nodes: {g_train.number_of_nodes()}, '
          f'# Edges: {g_train.number_of_edges()}')

    # Test split: same filter, separate graph.
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == rdf_type_predicate]
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, '
          f'# Nodes: {g_test.number_of_nodes()}, '
          f'# Edges: {g_test.number_of_edges()}')
    print()

    # A new GNN instance is created for each architecture.
    for architecture in ('GraphSAGE', 'GAT'):
        model = GNN()
        model._train(g_train, architecture)
        model._eval(g_test)
        print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 51577, # Nodes: 9611, # Edges: 51577
# Test - Triplets: 14738, # Nodes: 7081, # Edges: 14738

GraphSAGE:
Epoch: 0, Loss: 59.9409
Epoch: 50, Loss: 0.2271
Epoch: 100, Loss: 0.2196
Epoch: 150, Loss: 0.2104
Epoch: 200, Loss: 0.2033
Precision: 0.6763
Recall: 0.2533
F1-Score: 0.3685

GAT:
Epoch: 0, Loss: 381.6029
Epoch: 50, Loss: 0.2128
Epoch: 100, Loss: 0.2001
Epoch: 150, Loss: 0.1876
Epoch: 200, Loss: 0.1768
Precision: 0.6978
Recall: 0.5860
F1-Score: 0.6370

Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 29973, # Nodes: 27882, # Edges: 29973
# Test - Triplets: 8565, # Nodes: 9276, # Edges: 8565

GraphSAGE:
Epoch: 0, Loss: 15.5782
Epoch: 50, Loss: 0.2200
Epoch: 100, Loss: 0.1611
Epoch: 150, Loss: 0.1509
Epoch: 200, Loss: 0.1503
Precision: 0.8152
Recall: 0.7284
F1-Score: 0.7694

GAT:
Epoch: 0, Loss: 92.8990
Epoch: 50, Loss: 0.1679
Epoch: 100, Loss: 0.1548
Epoch: 150, Loss: 0.1448
Epoch: 200, Loss: 0.1400
Pre