In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random

# One seed shared by every random number generator seeded in this notebook.
# Seeding only Python's `random` leaves NumPy and PyTorch unseeded, so any
# randomness drawn from them would differ from run to run.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

# NOTE(review): presumably GNN relies on PyTorch randomness (e.g. weight
# initialisation) - confirm in src.gnn. Seeding here is harmless either way.
torch.manual_seed(SEED)

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
# NOTE: `Linear` imported here rebinds the `torch.nn.Linear` imported above.
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

# Project helpers used by the cells below (load_ore_files, load_clg_files,
# create_graph, GNN) come from these star imports.
from src.utils import *
from src.gnn import *

In [2]:
# Log the installed PyTorch version so the run environment is recorded with the results.
print(f'{torch.__version__}')

2.0.1+cu118


In [3]:
# Log the installed PyTorch Geometric version so the run environment is recorded with the results.
print(f'{torch_geometric.__version__}')

2.4.0


In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is passed to GNN._train in the experiment cells below.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# OWL2Bench

In [5]:
# One entry per OWL2Bench dataset: the folder holding it plus the train/test
# file names, all derived from the dataset index.
OWL2Bench_dbs = [
    {
        'path': f'./datasets/OWL2Bench/OWL2Bench{idx}/',
        'train_file': f'_train_OWL2Bench{idx}',
        'test_file': f'_test_OWL2Bench{idx}',
    }
    for idx in (1, 2)
]

In [6]:
# For every OWL2Bench dataset: load the train/test triples, build their graphs,
# then train and evaluate three model variants, timing each one.
for db_ in OWL2Bench_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Only the 'SubClassOf' triples are kept from each split.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # First argument of GNN._eval, capped at 100 and identical for all runs.
    # NOTE(review): presumably the size of the candidate set being ranked -
    # confirm against GNN._eval.
    eval_k = min(g_test.number_of_nodes(), 100)

    # (model variant, whether the filter graphs are passed) for each run.
    # The runs were previously three copy-pasted stanzas; order is unchanged.
    run_configs = (('GAT', False), ('GAT_2hops', False), ('GAT_2hops', True))
    for conv_name, use_filter in run_configs:
        if use_filter:
            # The frames are already restricted to 'SubClassOf' above, so this
            # filter keeps every row; it is retained so the filter graphs stay
            # correct if that earlier restriction is ever relaxed.
            df_train_filter = df_train[df_train['p'] == 'SubClassOf']
            g_train_filter, _, _ = create_graph(df_train_filter)
            df_test_filter = df_test[df_test['p'] == 'SubClassOf']
            g_test_filter, _, _ = create_graph(df_test_filter)
            train_extra, eval_extra = (g_train_filter,), (g_test_filter,)
        else:
            # No filter graph: call _train/_eval without the optional argument.
            train_extra, eval_extra = (), ()

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long run.
        st = time.perf_counter()
        model = GNN()
        model._train(device, conv_name, g_train, *train_extra)
        model._eval(eval_k, conv_name, g_test, *eval_extra)
        elapsed_time = time.perf_counter() - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 105, # Nodes/Classes: 113, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 45, # Edges: 30

GAT:
Epoch: 0, Loss: 0.3927
Epoch: 400, Loss: 0.0852
Epoch: 800, Loss: 0.0425
head, relation -> tail?
hits@1: 0.167, hits@10: 0.400
-------------------------------------------
Run time: 7 seconds, 0 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4211
Epoch: 400, Loss: 0.2562
Epoch: 800, Loss: 0.3192
head, relation -> tail?
hits@1: 0.000, hits@10: 0.000
-------------------------------------------
Run time: 7 seconds, 0 minutes

GAT_2hops:
+ Filter...
Epoch: 0, Loss: 0.4163
Epoch: 400, Loss: 0.2455
Epoch: 800, Loss: 0.2457
head, relation -> tail?
hits@1: 0.167, hits@10: 0.500
-------------------------------------------
Run time: 7 seconds, 0 minutes

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 105, # Nodes/Classes: 115, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 43, # Edges: 30

GAT:
Epoch: 0, Loss: 0.373

# ORE

In [7]:
# One entry per ORE dataset: the folder holding it plus the train/test file
# names, all derived from the dataset index.
ORE_dbs = [
    {
        'path': f'./datasets/ORE/ORE{idx}/',
        'train_file': f'_train_ORE{idx}',
        'test_file': f'_test_ORE{idx}',
    }
    for idx in (1, 2, 3)
]

In [8]:
# For every ORE dataset: load the train/test triples, build their graphs,
# then train and evaluate three model variants, timing each one.
for db_ in ORE_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Only the 'SubClassOf' triples are kept from each split.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # First argument of GNN._eval, capped at 100 and identical for all runs.
    # NOTE(review): presumably the size of the candidate set being ranked -
    # confirm against GNN._eval.
    eval_k = min(g_test.number_of_nodes(), 100)

    # (model variant, whether the filter graphs are passed) for each run.
    # The runs were previously three copy-pasted stanzas; order is unchanged.
    run_configs = (('GAT', False), ('GAT_2hops', False), ('GAT_2hops', True))
    for conv_name, use_filter in run_configs:
        if use_filter:
            # The frames are already restricted to 'SubClassOf' above, so this
            # filter keeps every row; it is retained so the filter graphs stay
            # correct if that earlier restriction is ever relaxed.
            df_train_filter = df_train[df_train['p'] == 'SubClassOf']
            g_train_filter, _, _ = create_graph(df_train_filter)
            df_test_filter = df_test[df_test['p'] == 'SubClassOf']
            g_test_filter, _, _ = create_graph(df_test_filter)
            train_extra, eval_extra = (g_train_filter,), (g_test_filter,)
        else:
            # No filter graph: call _train/_eval without the optional argument.
            train_extra, eval_extra = (), ()

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long run.
        st = time.perf_counter()
        model = GNN()
        model._train(device, conv_name, g_train, *train_extra)
        model._eval(eval_k, conv_name, g_test, *eval_extra)
        elapsed_time = time.perf_counter() - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 8194, # Nodes/Classes: 6654, # Edges: 8194
# Test - Triplets: 2342, # Nodes/Classes: 3052, # Edges: 2342

GAT:
Epoch: 0, Loss: 0.3266
Epoch: 400, Loss: 0.2142
Epoch: 800, Loss: 0.2110
head, relation -> tail?
hits@1: 0.019, hits@10: 0.089
-------------------------------------------
Run time: 86 seconds, 1 minutes

GAT_2hops:
Epoch: 0, Loss: 0.4032
Epoch: 400, Loss: 0.3321
Epoch: 800, Loss: 0.3057
head, relation -> tail?
hits@1: 0.046, hits@10: 0.104
-------------------------------------------
Run time: 92 seconds, 2 minutes

GAT_2hops:
+ Filter...
Epoch: 0, Loss: 0.3495
Epoch: 400, Loss: 0.2313
Epoch: 800, Loss: 0.2364
head, relation -> tail?
hits@1: 0.072, hits@10: 0.277
-------------------------------------------
Run time: 93 seconds, 2 minutes

Running... _train_ORE2 _test_ORE2
# Train - Triplets: 8204, # Nodes/Classes: 6650, # Edges: 8204
# Test - Triplets: 2344, # Nodes/Classes: 3102, # Edges: 2344

GAT:
Epoch: 0, Loss: 0.3291
E

# CaLiGraph

In [5]:
# One entry per CaLiGraph sample: its folder, the train/test N-Triples file
# names and the bare dataset name, all derived from that name.
CLG_dbs = [
    {
        'path': f'./datasets/clg/{name}/',
        'train_file': f'{name}-train.nt',
        'test_file': f'{name}-test.nt',
        'file': name,
    }
    for name in ('clg_10e4', 'clg_10e5')
]

## CaLiGraph 10e4

In [10]:
# Select the CaLiGraph 10e4 entry and unpack its file locations.
db_ = CLG_dbs[0]
path, train_file, test_file = (db_[key] for key in ('path', 'train_file', 'test_file'))

print('Running...', train_file, test_file)

# Training split: load the triples, build the graph, report its size.
df_train = load_clg_files(f'{path}{train_file}')
g_train, nodes_train, edges_train = create_graph(df_train)
print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

# Test split: same steps; g_test and df_test are reused by the evaluation cells below.
df_test = load_clg_files(f'{path}{test_file}')
g_test, nodes_test, edges_test = create_graph(df_test)
print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 127801, # Nodes: 24556, # Edges: 127801
# Test - Triplets: 36519, # Nodes: 13752, # Edges: 36519



**GAT**

In [None]:
# Evaluate the saved GAT model for CaLiGraph 10e4 on the test graph and time it.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_gat = torch.load('Models/reasoner/clg_10e4_GAT', map_location=device, weights_only=False)
model_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT', g_test)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**GAT_2hops**

In [None]:
# Evaluate the saved GAT_2hops model for CaLiGraph 10e4 on the test graph and time it.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_trans_gat = torch.load('Models/reasoner/clg_10e4_TransGAT', map_location=device, weights_only=False)
model_trans_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT_2hops', g_test)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**GAT_2hops + Filter**

In [None]:
# Keep only the rdfs:subClassOf triples of each split and rebuild the graphs
# from them; g_test_filter is passed to _eval in the next cell.
df_train_filter = df_train.loc[df_train['p'].eq('<http://www.w3.org/2000/01/rdf-schema#subClassOf>')]
g_train_filter, _, _ = create_graph(df_train_filter)

df_test_filter = df_test.loc[df_test['p'].eq('<http://www.w3.org/2000/01/rdf-schema#subClassOf>')]
g_test_filter, _, _ = create_graph(df_test_filter)

In [None]:
# Evaluate the saved GAT_2hops + Filter model for CaLiGraph 10e4, passing the
# subClassOf-only test graph as the filter graph, and time the run.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_filter_trans_gat = torch.load('Models/reasoner/clg_10e4_FilterTransGAT', map_location=device, weights_only=False)
model_filter_trans_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT_2hops', g_test, g_test_filter)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

## CaLiGraph 10e5

In [7]:
# Select the CaLiGraph 10e5 entry and unpack its file locations.
db_ = CLG_dbs[1]
path, train_file, test_file = (db_[key] for key in ('path', 'train_file', 'test_file'))

print('Running...', train_file, test_file)

# Training split: load the triples, build the graph, report its size.
df_train = load_clg_files(f'{path}{train_file}')
g_train, nodes_train, edges_train = create_graph(df_train)
print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

# Test split: same steps; g_test and df_test are reused by the evaluation cells below.
df_test = load_clg_files(f'{path}{test_file}')
g_test, nodes_test, edges_test = create_graph(df_test)
print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
print()

Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 265139, # Nodes: 198896, # Edges: 265139
# Test - Triplets: 75757, # Nodes: 73461, # Edges: 75757



**GAT**

In [None]:
# Evaluate the saved GAT model for CaLiGraph 10e5 on the test graph and time it.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_gat = torch.load('Models/reasoner/clg_10e5_GAT', map_location=device, weights_only=False)
model_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT', g_test)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**GAT_2hops**

In [None]:
# Evaluate the saved GAT_2hops model for CaLiGraph 10e5 on the test graph and time it.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_trans_gat = torch.load('Models/reasoner/clg_10e5_TransGAT', map_location=device, weights_only=False)
model_trans_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT_2hops', g_test)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

**GAT_2hops + Filter**

In [8]:
# Keep only the rdfs:subClassOf triples of each split and rebuild the graphs
# from them; g_test_filter is passed to _eval in the next cell.
df_train_filter = df_train.loc[df_train['p'].eq('<http://www.w3.org/2000/01/rdf-schema#subClassOf>')]
g_train_filter, _, _ = create_graph(df_train_filter)

df_test_filter = df_test.loc[df_test['p'].eq('<http://www.w3.org/2000/01/rdf-schema#subClassOf>')]
g_test_filter, _, _ = create_graph(df_test_filter)

In [10]:
# Evaluate the saved GAT_2hops + Filter model for CaLiGraph 10e5, passing the
# subClassOf-only test graph as the filter graph, and time the run.
st = time.perf_counter()  # monotonic clock: unaffected by system clock changes
# SECURITY NOTE: torch.load unpickles a complete model object here (its _eval
# method is called below), and unpickling can execute arbitrary code - only
# load checkpoints from a trusted source.
# weights_only=False states that full-object load explicitly, because newer
# PyTorch releases default to weights-only loading. map_location=device lets a
# checkpoint saved on a GPU be opened on a CPU-only machine.
model_filter_trans_gat = torch.load('Models/reasoner/clg_10e5_FilterTransGAT', map_location=device, weights_only=False)
model_filter_trans_gat._eval(min(g_test.number_of_nodes(), 100), 'GAT_2hops', g_test, g_test_filter)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')

head, relation -> tail?
hits@1: 0.528, hits@10: 0.709
-------------------------------------------
Run time: 4996 seconds, 83 minutes
