# Librairies

In [1]:
import pandas as pd
import numpy as np
import gzip
import networkx as nx
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
print(torch.__version__)

1.13.1


In [3]:
print(torch_geometric.__version__)

2.4.0


# ORE

In [4]:
ORE_dbs = [{'path' : './datasets/ORE/ORE1/',
            'train_file' : '_train_ORE1',
            'test_file' : '_test_ORE1'},
           {'path' : './datasets/ORE/ORE2/',
            'train_file' : '_train_ORE2',
            'test_file' : '_test_ORE2'},
           {'path' : './datasets/ORE/ORE3/',
            'train_file' : '_train_ORE3',
            'test_file' : '_test_ORE3'}]

In [5]:
for db_ in ORE_dbs:    
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    model = GNN()
    model._train(g_train, 'GraphSAGE')
    model._eval(g_test)
    print()

    model = GNN()
    model._train(g_train, 'GAT')
    model._eval(g_test)
    print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 8194, # Nodes: 6654, # Edges: 8194
# Test - Triplets: 2342, # Nodes: 3052, # Edges: 2342

GraphSAGE:
Epoch: 0, Loss: 15.8869
Epoch: 50, Loss: 0.2226
Epoch: 100, Loss: 0.2173
Epoch: 150, Loss: 0.2129
Epoch: 200, Loss: 0.2115
Precision: 0.6715
Recall: 0.8083
F1-Score: 0.7336

GAT:
Epoch: 0, Loss: 254.7015
Epoch: 50, Loss: 0.2215
Epoch: 100, Loss: 0.2126
Epoch: 150, Loss: 0.2081
Epoch: 200, Loss: 0.2057
Precision: 0.6668
Recall: 0.9927
F1-Score: 0.7977

Running... _train_ORE2 _test_ORE2
# Train - Triplets: 8204, # Nodes: 6650, # Edges: 8204
# Test - Triplets: 2344, # Nodes: 3102, # Edges: 2344

GraphSAGE:
Epoch: 0, Loss: 39.8058
Epoch: 50, Loss: 0.2223
Epoch: 100, Loss: 0.2182
Epoch: 150, Loss: 0.2156
Epoch: 200, Loss: 0.2135
Precision: 0.6727
Recall: 0.5981
F1-Score: 0.6332

GAT:
Epoch: 0, Loss: 295.9423
Epoch: 50, Loss: 0.2273
Epoch: 100, Loss: 0.2145
Epoch: 150, Loss: 0.2094
Epoch: 200, Loss: 0.2052
Precision: 0.6671
Recall: 0.9834


# OWL2Bench

In [6]:
OWL2Bench_dbs = [{'path' : './datasets/OWL2Bench/OWL2Bench1/',
                  'train_file' : '_train_OWL2Bench1',
                  'test_file' : '_test_OWL2Bench1'},
                 {'path' : './datasets/OWL2Bench/OWL2Bench2/',
                  'train_file' : '_train_OWL2Bench2',
                  'test_file' : '_test_OWL2Bench2'}]

In [7]:
for db_ in OWL2Bench_dbs:    
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    model = GNN()
    model._train(g_train, 'GraphSAGE')
    model._eval(g_test)
    print()

    model = GNN()
    model._train(g_train, 'GAT')
    model._eval(g_test)
    print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 105, # Nodes: 113, # Edges: 105
# Test - Triplets: 30, # Nodes: 45, # Edges: 30

GraphSAGE:
Epoch: 0, Loss: 20.3852
Epoch: 50, Loss: 0.1075
Epoch: 100, Loss: 0.0635
Epoch: 150, Loss: 0.0481
Epoch: 200, Loss: 0.0335
Precision: 0.4545
Recall: 0.1667
F1-Score: 0.2439

GAT:
Epoch: 0, Loss: 149.1002
Epoch: 50, Loss: 0.0901
Epoch: 100, Loss: 0.0349
Epoch: 150, Loss: 0.0122
Epoch: 200, Loss: 0.0016
Precision: 0.7895
Recall: 0.5000
F1-Score: 0.6122

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 105, # Nodes: 115, # Edges: 105
# Test - Triplets: 30, # Nodes: 43, # Edges: 30

GraphSAGE:
Epoch: 0, Loss: 33.1489
Epoch: 50, Loss: 0.1263
Epoch: 100, Loss: 0.0766
Epoch: 150, Loss: 0.0733
Epoch: 200, Loss: 0.0392
Precision: 0.5000
Recall: 0.3000
F1-Score: 0.3750

GAT:
Epoch: 0, Loss: 251.9325
Epoch: 50, Loss: 0.0818
Epoch: 100, Loss: 0.0385
Epoch: 150, Loss: 0.0217
Epoch: 200, Loss: 0.0141
Precision: 0.6897
Recall: 0

# CaLiGraph

In [8]:
CLG_dbs = [{'path' : './datasets/clg/clg_10e4/',
            'train_file' : 'clg_10e4-train.nt',
            'test_file' : 'clg_10e4-test.nt'},
           {'path' : './datasets/clg/clg_10e5/',
            'train_file' : 'clg_10e5-train.nt',
            'test_file' : 'clg_10e5-test.nt'}]

In [9]:
for db_ in CLG_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    model = GNN()
    model._train(g_train, 'GraphSAGE')
    model._eval(g_test)
    print()

    model = GNN()
    model._train(g_train, 'GAT')
    model._eval(g_test)
    print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 59956, # Nodes: 10311, # Edges: 59956
# Test - Triplets: 17132, # Nodes: 7866, # Edges: 17132

GraphSAGE:
Epoch: 0, Loss: 49.5768
Epoch: 50, Loss: 0.2233
Epoch: 100, Loss: 0.1904
Epoch: 150, Loss: 0.1869
Epoch: 200, Loss: 0.1858
Precision: 0.6265
Recall: 0.1422
F1-Score: 0.2318

GAT:
Epoch: 0, Loss: 369.5024
Epoch: 50, Loss: 0.2227
Epoch: 100, Loss: 0.2192
Epoch: 150, Loss: 0.2181
Epoch: 200, Loss: 0.2165
Precision: 0.6571
Recall: 0.6901
F1-Score: 0.6732

Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 96273, # Nodes: 75195, # Edges: 96273
# Test - Triplets: 27508, # Nodes: 26675, # Edges: 27508

GraphSAGE:
Epoch: 0, Loss: 21.4759
Epoch: 50, Loss: 0.1517
Epoch: 100, Loss: 0.1225
Epoch: 150, Loss: 0.1137
Epoch: 200, Loss: 0.1097
Precision: 0.8162
Recall: 0.6387
F1-Score: 0.7166

GAT:
Epoch: 0, Loss: 142.9769
Epoch: 50, Loss: 0.1257
Epoch: 100, Loss: 0.1068
Epoch: 150, Loss: 0.0990
Epoch: 200, Loss: 0.095