# Librairies

In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
print(torch.__version__)

2.0.1+cu118


In [3]:
print(torch_geometric.__version__)

2.4.0


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# OWL2Bench

In [5]:
OWL2Bench_dbs = [{'path' : './datasets/OWL2Bench/OWL2Bench1/',
                  'train_file' : '_train_OWL2Bench1',
                  'test_file' : '_test_OWL2Bench1'},
                 {'path' : './datasets/OWL2Bench/OWL2Bench2/',
                  'train_file' : '_train_OWL2Bench2',
                  'test_file' : '_test_OWL2Bench2'}]

In [6]:
for db_ in OWL2Bench_dbs:    
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()
    
    st = time.time()
    model = GNN()
    model._train(device, 'GAT', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()

    st = time.time()
    model = GNN()
    model._train(device, 'GAT_2hops', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT_2hops', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 105, # Nodes/Classes: 113, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 45, # Edges: 30

GAT:
Epoch: 0, Loss: 0.3927
Epoch: 400, Loss: 0.0604
Epoch: 800, Loss: 0.0450
head, relation -> tail?
hits@1: 0.133, hits@10: 0.267
-------------------------------------------
Run time: 8 seconds, 0 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4211
Epoch: 400, Loss: 0.1903
Epoch: 800, Loss: 0.1565
head, relation -> tail?
hits@1: 0.133, hits@10: 0.267
-------------------------------------------
Run time: 8 seconds, 0 minutes

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 105, # Nodes/Classes: 115, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 43, # Edges: 30

GAT:
Epoch: 0, Loss: 0.3822
Epoch: 400, Loss: 0.0621
Epoch: 800, Loss: 0.0337
head, relation -> tail?
hits@1: 0.200, hits@10: 0.400
-------------------------------------------
Run time: 5 seconds, 0 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4263
Epoch: 400

# ORE

In [7]:
ORE_dbs = [{'path' : './datasets/ORE/ORE1/',
            'train_file' : '_train_ORE1',
            'test_file' : '_test_ORE1'},
           {'path' : './datasets/ORE/ORE2/',
            'train_file' : '_train_ORE2',
            'test_file' : '_test_ORE2'},
           {'path' : './datasets/ORE/ORE3/',
            'train_file' : '_train_ORE3',
            'test_file' : '_test_ORE3'}]

In [8]:
for db_ in ORE_dbs:    
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()
    
    st = time.time()
    model = GNN()
    model._train(device, 'GAT', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()

    st = time.time()
    model = GNN()
    model._train(device, 'GAT_2hops', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT_2hops', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 8194, # Nodes/Classes: 6654, # Edges: 8194
# Test - Triplets: 2342, # Nodes/Classes: 3052, # Edges: 2342

GAT:
Epoch: 0, Loss: 0.3249
Epoch: 400, Loss: 0.2091
Epoch: 800, Loss: 0.2064
head, relation -> tail?
hits@1: 0.018, hits@10: 0.089
-------------------------------------------
Run time: 89 seconds, 1 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3891
Epoch: 400, Loss: 0.3305
Epoch: 800, Loss: 0.2435
head, relation -> tail?
hits@1: 0.030, hits@10: 0.115
-------------------------------------------
Run time: 93 seconds, 2 minutes

Running... _train_ORE2 _test_ORE2
# Train - Triplets: 8204, # Nodes/Classes: 6650, # Edges: 8204
# Test - Triplets: 2344, # Nodes/Classes: 3102, # Edges: 2344

GAT:
Epoch: 0, Loss: 0.3074
Epoch: 400, Loss: 0.2127
Epoch: 800, Loss: 0.2082
head, relation -> tail?
hits@1: 0.003, hits@10: 0.029
-------------------------------------------
Run time: 89 seconds, 1 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3920
Epoch: 400, L

# CaLiGraph

In [9]:
CLG_dbs = [{'path' : './datasets/clg/clg_10e4/',
            'train_file' : 'clg_10e4-train.nt',
            'test_file' : 'clg_10e4-test.nt',
            'file' : 'clg_10e4'},
           {'path' : './datasets/clg/clg_10e5/',
            'train_file' : 'clg_10e5-train.nt',
            'test_file' : 'clg_10e5-test.nt',
            'file' : 'clg_10e5'}]

In [10]:
for db_ in CLG_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']
    file = db_['file']

    print('Running...', train_file, test_file)

    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()
    
    st = time.time()
    model = GNN()
    model._train(device, 'GAT', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()
    #torch.save(model, f'Models/subclass_reasoner/{file}_GAT')

    st = time.time()
    model = GNN()
    model._train(device, 'GAT_2hops', g_train)
    model._eval(min(g_test.number_of_nodes(),100), 'GAT_2hops', g_test)
    et = time.time()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()
    #torch.save(model, f'Models/subclass_reasoner/{file}_TransGAT')

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 59956, # Nodes/Classes: 10311, # Edges: 59956
# Test - Triplets: 17132, # Nodes/Classes: 7866, # Edges: 17132

GAT:
Epoch: 0, Loss: 0.3406
Epoch: 400, Loss: 0.1603
Epoch: 800, Loss: 0.1599
head, relation -> tail?
hits@1: 0.020, hits@10: 0.202
-------------------------------------------
Run time: 675 seconds, 11 minutes

GAT_2hops:
Epoch: 0, Loss: 0.2158
Epoch: 400, Loss: 0.1809
Epoch: 800, Loss: 0.1829
head, relation -> tail?
hits@1: 0.538, hits@10: 0.705
-------------------------------------------
Run time: 670 seconds, 11 minutes

Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 96273, # Nodes/Classes: 75195, # Edges: 96273
# Test - Triplets: 27508, # Nodes/Classes: 26675, # Edges: 27508

GAT:
Epoch: 0, Loss: 0.3334
Epoch: 400, Loss: 0.1158
Epoch: 800, Loss: 0.1010
head, relation -> tail?
hits@1: 0.239, hits@10: 0.244
-------------------------------------------
Run time: 1399 seconds, 23 minutes

GAT_2