# Libraries

In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Log the PyTorch build in use so the recorded results can be tied to a version.
print(f'{torch.__version__}')

2.0.1+cu118


In [3]:
# Log the PyTorch Geometric build in use alongside the PyTorch one.
print(f'{torch_geometric.__version__}')

2.4.0


In [4]:
# The import cell only calls random.seed(10), which seeds Python's `random`
# module and nothing else. Seed NumPy and PyTorch with the same value so that
# any randomness drawn from them during training/evaluation is repeatable
# across "Restart & Run All".
# NOTE(review): which generators src.gnn / src.utils actually use is not
# visible from this notebook — confirm.
np.random.seed(10)
torch.manual_seed(10)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# OWL2Bench

In [5]:
# OWL2Bench splits to benchmark. Each entry gives the dataset directory
# (`path`) and the train/test file names that get appended to it when loading.
OWL2Bench_dbs = [
    dict(path='./datasets/OWL2Bench/OWL2Bench1/',
         train_file='_train_OWL2Bench1',
         test_file='_test_OWL2Bench1'),
    dict(path='./datasets/OWL2Bench/OWL2Bench2/',
         train_file='_train_OWL2Bench2',
         test_file='_test_OWL2Bench2'),
]

In [6]:
# Benchmark both GNN variants on every OWL2Bench split.
# Per dataset: load the train/test triples, keep only the rows whose predicate
# column `p` is 'ClassAssertion', build one graph per split, then train and
# evaluate each architecture on those graphs and print the elapsed time of
# each train+eval pair.
for db_ in OWL2Bench_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Training split: filter to class assertions before building the graph.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

    # Test split: same filtering and graph construction.
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # A fresh model is created for each architecture so no state carries over
    # between the two runs. The same architecture name is passed to both
    # _train and _eval.
    for arch in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments (unlike time.time()).
        st = time.perf_counter()
        model = GNN()
        model._train(device, arch, g_train)
        # NOTE(review): the first _eval argument is capped at 100; its exact
        # meaning is defined in src.gnn — confirm.
        model._eval(min(g_test.number_of_nodes(),100), arch, g_test)
        elapsed_time = time.perf_counter() - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 7989, # Nodes: 3633, # Edges: 7989
# Test - Triplets: 2283, # Nodes: 1836, # Edges: 2283

GAT:
Epoch: 0, Loss: 0.3118
Epoch: 400, Loss: 0.0131
Epoch: 800, Loss: 0.0115
head, relation -> tail?
hits@1: 0.307, hits@10: 0.329
-------------------------------------------
Run time: 63 seconds, 1 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4181
Epoch: 400, Loss: 0.0281
Epoch: 800, Loss: 0.0274
head, relation -> tail?
hits@1: 0.106, hits@10: 0.127
-------------------------------------------
Run time: 62 seconds, 1 minutes

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 15526, # Nodes: 7080, # Edges: 15526
# Test - Triplets: 4437, # Nodes: 3612, # Edges: 4437

GAT:
Epoch: 0, Loss: 0.4405
Epoch: 400, Loss: 0.0177
Epoch: 800, Loss: 0.0068
head, relation -> tail?
hits@1: 0.314, hits@10: 0.323
-------------------------------------------
Run time: 123 seconds, 2 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4508
Epoch: 400, Loss: 

# ORE

In [7]:
# ORE splits to benchmark. Each entry gives the dataset directory (`path`)
# and the train/test file names that get appended to it when loading.
ORE_dbs = [
    dict(path='./datasets/ORE/ORE1/',
         train_file='_train_ORE1',
         test_file='_test_ORE1'),
    dict(path='./datasets/ORE/ORE2/',
         train_file='_train_ORE2',
         test_file='_test_ORE2'),
    dict(path='./datasets/ORE/ORE3/',
         train_file='_train_ORE3',
         test_file='_test_ORE3'),
]

In [8]:
# Benchmark both GNN variants on every ORE split.
# Per dataset: load the train/test triples, keep only the rows whose predicate
# column `p` is 'ClassAssertion', build one graph per split, then train and
# evaluate each architecture on those graphs and print the elapsed time of
# each train+eval pair.
for db_ in ORE_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Training split: filter to class assertions before building the graph.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

    # Test split: same filtering and graph construction.
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # A fresh model is created for each architecture so no state carries over
    # between the two runs. The same architecture name is passed to both
    # _train and _eval.
    for arch in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments (unlike time.time()).
        st = time.perf_counter()
        model = GNN()
        model._train(device, arch, g_train)
        # NOTE(review): the first _eval argument is capped at 100; its exact
        # meaning is defined in src.gnn — confirm.
        model._eval(min(g_test.number_of_nodes(),100), arch, g_test)
        elapsed_time = time.perf_counter() - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 53048, # Nodes: 8060, # Edges: 53048
# Test - Triplets: 15157, # Nodes: 5847, # Edges: 15157

GAT:
Epoch: 0, Loss: 0.2640
Epoch: 400, Loss: 0.1178
Epoch: 800, Loss: 0.1135
head, relation -> tail?
hits@1: 0.031, hits@10: 0.199
-------------------------------------------
Run time: 547 seconds, 9 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3215
Epoch: 400, Loss: 0.1437
Epoch: 800, Loss: 0.1309
head, relation -> tail?
hits@1: 0.036, hits@10: 0.125
-------------------------------------------
Run time: 545 seconds, 9 minutes

Running... _train_ORE2 _test_ORE2
# Train - Triplets: 53081, # Nodes: 8064, # Edges: 53081
# Test - Triplets: 15166, # Nodes: 5845, # Edges: 15166

GAT:
Epoch: 0, Loss: 0.3930
Epoch: 400, Loss: 0.1235
Epoch: 800, Loss: 0.1092
head, relation -> tail?
hits@1: 0.016, hits@10: 0.105
-------------------------------------------
Run time: 541 seconds, 9 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3278
Epoch: 400, Loss: 0.1569
Epoch: 80

# CaLiGraph

In [9]:
# CaLiGraph splits to benchmark. Each entry gives the dataset directory
# (`path`), the train/test N-Triples file names appended to it when loading,
# and a short `file` tag for the dataset.
CLG_dbs = [
    dict(path='./datasets/clg/clg_10e4/',
         train_file='clg_10e4-train.nt',
         test_file='clg_10e4-test.nt',
         file='clg_10e4'),
    dict(path='./datasets/clg/clg_10e5/',
         train_file='clg_10e5-train.nt',
         test_file='clg_10e5-test.nt',
         file='clg_10e5'),
]

In [10]:
# Benchmark both GNN variants on every CaLiGraph split.
# Per dataset: load the train/test triples, keep only the rows whose predicate
# column `p` is rdf:type, build one graph per split, then train and evaluate
# each architecture on those graphs and print the elapsed time of each
# train+eval pair.
for db_ in CLG_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']
    file = db_['file']  # only referenced by the disabled checkpoint line below

    print('Running...', train_file, test_file)

    # Training split: filter to rdf:type triples before building the graph.
    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

    # Test split: same filtering and graph construction.
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # A fresh model is created for each architecture so no state carries over
    # between the two runs. The same architecture name is passed to both
    # _train and _eval.
    for arch in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments (unlike time.time()).
        st = time.perf_counter()
        model = GNN()
        model._train(device, arch, g_train)
        # NOTE(review): the first _eval argument is capped at 100; its exact
        # meaning is defined in src.gnn — confirm.
        model._eval(min(g_test.number_of_nodes(),100), arch, g_test)
        elapsed_time = time.perf_counter() - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()
        # Checkpointing is disabled. To persist a run, save the model here,
        # after it has been trained (the earlier revision used the suffixes
        # 'GAT' and 'TransGAT' for the two architectures):
        #torch.save(model, f'Models/assertion_reasoner/{file}_{arch}')

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 51577, # Nodes: 9611, # Edges: 51577
# Test - Triplets: 14738, # Nodes: 7081, # Edges: 14738

GAT:
Epoch: 0, Loss: 0.3351
Epoch: 400, Loss: 0.1571
Epoch: 800, Loss: 0.1500
head, relation -> tail?
hits@1: 0.068, hits@10: 0.189
-------------------------------------------
Run time: 562 seconds, 9 minutes

GAT_2hops:
Epoch: 0, Loss: 0.2003
Epoch: 400, Loss: 0.1831
Epoch: 800, Loss: 0.1825
head, relation -> tail?
hits@1: 0.125, hits@10: 0.293
-------------------------------------------
Run time: 562 seconds, 9 minutes

Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 29973, # Nodes: 27882, # Edges: 29973
# Test - Triplets: 8565, # Nodes: 9276, # Edges: 8565

GAT:
Epoch: 0, Loss: 0.3522
Epoch: 400, Loss: 0.1440
Epoch: 800, Loss: 0.1246
head, relation -> tail?
hits@1: 0.003, hits@10: 0.019
-------------------------------------------
Run time: 360 seconds, 6 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4206
Epoch: 400,