In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random
random.seed(10)

from node2vec import Node2Vec

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Log the installed PyTorch version so the recorded results can be tied to it.
print(torch.__version__)

2.0.1+cu118


In [3]:
# Log the installed PyTorch Geometric version alongside the PyTorch one.
print(torch_geometric.__version__)

2.4.0


In [4]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# OWL2Bench

In [None]:
# Train/test file locations for the two OWL2Bench datasets. Each entry exposes
# the 'path', 'train_file' and 'test_file' keys read by the experiment loop.
OWL2Bench_dbs = [
    dict(path='./datasets/OWL2Bench/OWL2Bench1/',
         train_file='_train_OWL2Bench1',
         test_file='_test_OWL2Bench1'),
    dict(path='./datasets/OWL2Bench/OWL2Bench2/',
         train_file='_train_OWL2Bench2',
         test_file='_test_OWL2Bench2'),
]

In [None]:
# Run the GAT and GAT_2hops experiments on every OWL2Bench dataset.
for db_ in OWL2Bench_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Both splits are restricted to SubClassOf triples before the graphs are built.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # One train/eval/timing stanza shared by both GNN variants instead of two
    # copy-pasted ones; a fresh GNN() is created for each variant.
    for gnn_variant in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        st = time.perf_counter()
        model = GNN()
        model._train(gnn_variant, g_train)
        # The first _eval argument is capped at 100 (or the number of test
        # nodes if smaller); presumably a sample count -- TODO confirm in src.gnn.
        model._eval(min(g_test.number_of_nodes(), 100), gnn_variant, g_test)
        et = time.perf_counter()
        elapsed_time = et - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

# ORE

In [None]:
# Train/test file locations for the three ORE datasets. Each entry exposes
# the 'path', 'train_file' and 'test_file' keys read by the experiment loop.
ORE_dbs = [
    dict(path='./datasets/ORE/ORE1/',
         train_file='_train_ORE1',
         test_file='_test_ORE1'),
    dict(path='./datasets/ORE/ORE2/',
         train_file='_train_ORE2',
         test_file='_test_ORE2'),
    dict(path='./datasets/ORE/ORE3/',
         train_file='_train_ORE3',
         test_file='_test_ORE3'),
]

In [None]:
# Run the GAT and GAT_2hops experiments on every ORE dataset.
for db_ in ORE_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Both splits are restricted to SubClassOf triples before the graphs are built.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # One train/eval/timing stanza shared by both GNN variants instead of two
    # copy-pasted ones; a fresh GNN() is created for each variant.
    for gnn_variant in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        st = time.perf_counter()
        model = GNN()
        model._train(gnn_variant, g_train)
        # The first _eval argument is capped at 100 (or the number of test
        # nodes if smaller); presumably a sample count -- TODO confirm in src.gnn.
        model._eval(min(g_test.number_of_nodes(), 100), gnn_variant, g_test)
        et = time.perf_counter()
        elapsed_time = et - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

# CaLiGraph

In [5]:
# Train/test N-Triples file locations for the two CaLiGraph samples. Each
# entry exposes the 'path', 'train_file' and 'test_file' keys.
CLG_dbs = [
    dict(path='./datasets/clg/clg_10e4/',
         train_file='clg_10e4-train.nt',
         test_file='clg_10e4-test.nt'),
    dict(path='./datasets/clg/clg_10e5/',
         train_file='clg_10e5-train.nt',
         test_file='clg_10e5-test.nt'),
]

In [6]:
# Work on the first CaLiGraph entry only (clg_10e4) and unpack its locations.
db_ = CLG_dbs[0]
path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']

print('Running...', train_file, test_file)

# Training split: load the triples, build the graph, report its size.
# Unlike the OWL2Bench/ORE cells, no predicate filter is applied here.
df_train = load_clg_files(path + train_file)
g_train, nodes_train, edges_train = create_graph(df_train)
print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')

# Test split: same pipeline as the training split.
df_test = load_clg_files(path + test_file)
g_test, nodes_test, edges_test = create_graph(df_test)
print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 127801, # Nodes: 24556, # Edges: 127801
# Test - Triplets: 36519, # Nodes: 13752, # Edges: 36519



In [6]:
# Train and evaluate both GNN variants on the CaLiGraph graphs (g_train and
# g_test) built in the loading cell. One shared stanza replaces two
# copy-pasted ones; a fresh GNN() is created for each variant.
for gnn_variant in ('GAT', 'GAT_2hops'):
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during these long training runs.
    st = time.perf_counter()
    model = GNN()
    model._train(gnn_variant, g_train)
    # The first _eval argument is capped at 100 (or the number of test nodes
    # if smaller); presumably a sample count -- TODO confirm in src.gnn.
    model._eval(min(g_test.number_of_nodes(), 100), gnn_variant, g_test)
    et = time.perf_counter()
    elapsed_time = et - st
    print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
    print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 127801, # Nodes: 24556, # Edges: 127801
# Test - Triplets: 36519, # Nodes: 13752, # Edges: 36519

GAT:
Epoch: 0, Loss: 0.3039
Epoch: 400, Loss: 0.1590
Epoch: 800, Loss: 0.1560
head, relation -> tail?
hits@1: 0.392, hits@10: 0.703
-------------------------------------------
Run time: 1373 seconds, 23 minutes

GAT_2hops:
Epoch: 0, Loss: 0.2418
Epoch: 400, Loss: 0.2169
Epoch: 800, Loss: 0.1896
head, relation -> tail?
hits@1: 0.103, hits@10: 0.539
-------------------------------------------
Run time: 1866 seconds, 31 minutes



In [8]:
# Predicate IRI used to select the subclass triples; bound once so the train
# and test filters cannot drift apart.
SUBCLASS_OF_IRI = '<http://www.w3.org/2000/01/rdf-schema#subClassOf>'

# Build subClassOf-only graphs from the already loaded CaLiGraph frames.
df_train_filter = df_train[df_train['p'] == SUBCLASS_OF_IRI]
g_train_filter, _, _ = create_graph(df_train_filter)
df_test_filter = df_test[df_test['p'] == SUBCLASS_OF_IRI]
g_test_filter, _, _ = create_graph(df_test_filter)

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during the long training run.
st = time.perf_counter()
model = GNN()
# The filtered graphs are passed as an extra argument next to the full graphs;
# how _train/_eval use them is defined in src.gnn -- TODO confirm there.
model._train('GAT_2hops', g_train, g_train_filter)
# The evaluation size is still derived from the full test graph, not the
# filtered one.
model._eval(min(g_test.number_of_nodes(), 100), 'GAT_2hops', g_test, g_test_filter)
et = time.perf_counter()
elapsed_time = et - st
print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
print()

GAT_2hops:
+ Filter...
Epoch: 0, Loss: 0.3014
Epoch: 400, Loss: 0.3652
Epoch: 800, Loss: 0.1907
head, relation -> tail?
hits@1: 0.561, hits@10: 0.751
-------------------------------------------
Run time: 1401 seconds, 23 minutes

