# Libraries

In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random
random.seed(10)

from node2vec import Node2Vec

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Record the installed PyTorch version alongside the results for reproducibility.
print(torch.__version__)

1.13.1


In [3]:
# Record the installed torch_geometric version alongside the results for reproducibility.
print(torch_geometric.__version__)

2.4.0


In [4]:
# Pick the compute device: CUDA when PyTorch reports one available, CPU otherwise.
# NOTE(review): `device` is not referenced by the cells visible in this notebook —
# confirm whether src.gnn selects its own device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


# OWL2Bench

In [5]:
# One entry per OWL2Bench split: the dataset directory plus the train/test file
# names (joined onto 'path' by the loading loop).
OWL2Bench_dbs = [
    {
        'path': f'./datasets/OWL2Bench/OWL2Bench{split_id}/',
        'train_file': f'_train_OWL2Bench{split_id}',
        'test_file': f'_test_OWL2Bench{split_id}',
    }
    for split_id in (1, 2)
]

In [6]:
# Train and evaluate both GAT variants on every OWL2Bench split.
for db_ in OWL2Bench_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Keep only the rows whose predicate column 'p' is SubClassOf, then build
    # one graph per split from the filtered triples.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant, run in the original
    # order; a fresh GNN is created per variant so no state is shared.
    for variant in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, variant)
        # The second argument is capped at 100 — presumably a sample/candidate
        # count; confirm against GNN._eval in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), variant)
        et = time.perf_counter()
        elapsed_time = et - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 105, # Nodes/Classes: 113, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 45, # Edges: 30

GAT:
Epoch: 0, Loss: 0.3855
Epoch: 50, Loss: 0.1080
Epoch: 100, Loss: 0.0991
Epoch: 150, Loss: 0.0976
Epoch: 200, Loss: 0.0859
Epoch: 250, Loss: 0.0900
Epoch: 300, Loss: 0.0717
Epoch: 350, Loss: 0.0687
Epoch: 400, Loss: 0.0610
Epoch: 450, Loss: 0.0653
Epoch: 500, Loss: 0.0763
Epoch: 550, Loss: 0.0634
Epoch: 600, Loss: 0.0584
Epoch: 650, Loss: 0.0617
Epoch: 700, Loss: 0.0643
Epoch: 750, Loss: 0.0506
Epoch: 800, Loss: 0.0522
Precision: 0.500, Recall: 0.133, F1-Score: 0.211

head, relation -> tail?
hits@1: 0.100, hits@10: 0.333
-------------------------------------------
Run time: 4 seconds, 0 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3506
Epoch: 50, Loss: 0.1675
Epoch: 100, Loss: 0.1327
Epoch: 150, Loss: 0.1503
Epoch: 200, Loss: 0.0972
Epoch: 250, Loss: 0.0937
Epoch: 300, Loss: 0.0795
Epoch: 350, Loss: 0.0788
Epoch: 400, Loss: 0.09

# ORE

In [7]:
# One entry per ORE split: the dataset directory plus the train/test file
# names (joined onto 'path' by the loading loop).
ORE_dbs = [
    {
        'path': f'./datasets/ORE/ORE{split_id}/',
        'train_file': f'_train_ORE{split_id}',
        'test_file': f'_test_ORE{split_id}',
    }
    for split_id in (1, 2, 3)
]

In [8]:
# Train and evaluate both GAT variants on every ORE split.
for db_ in ORE_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Keep only the rows whose predicate column 'p' is SubClassOf, then build
    # one graph per split from the filtered triples.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant, run in the original
    # order; a fresh GNN is created per variant so no state is shared.
    for variant in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, variant)
        # The second argument is capped at 100 — presumably a sample/candidate
        # count; confirm against GNN._eval in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), variant)
        et = time.perf_counter()
        elapsed_time = et - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 8194, # Nodes/Classes: 6654, # Edges: 8194
# Test - Triplets: 2342, # Nodes/Classes: 3052, # Edges: 2342

GAT:
Epoch: 0, Loss: 0.3287
Epoch: 50, Loss: 0.2361
Epoch: 100, Loss: 0.2224
Epoch: 150, Loss: 0.2202
Epoch: 200, Loss: 0.2172
Epoch: 250, Loss: 0.2196
Epoch: 300, Loss: 0.2120
Epoch: 350, Loss: 0.2144
Epoch: 400, Loss: 0.2115
Epoch: 450, Loss: 0.2111
Epoch: 500, Loss: 0.2106
Epoch: 550, Loss: 0.2114
Epoch: 600, Loss: 0.2074
Epoch: 650, Loss: 0.2099
Epoch: 700, Loss: 0.2101
Epoch: 750, Loss: 0.2101
Epoch: 800, Loss: 0.2071
Precision: 0.572, Recall: 0.267, F1-Score: 0.364

head, relation -> tail?
hits@1: 0.013, hits@10: 0.091
-------------------------------------------
Run time: 118 seconds, 2 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3406
Epoch: 50, Loss: 0.2720
Epoch: 100, Loss: 0.2490
Epoch: 150, Loss: 0.2283
Epoch: 200, Loss: 0.2240
Epoch: 250, Loss: 0.2227
Epoch: 300, Loss: 0.2212
Epoch: 350, Loss: 0.2201
Epoch: 400, Loss: 0.219

# CaLiGraph

In [9]:
# One entry per CaLiGraph sample: the dataset directory plus the train/test
# N-Triples file names (joined onto 'path' by the loading loop).
CLG_dbs = [
    {
        'path': f'./datasets/clg/{sample}/',
        'train_file': f'{sample}-train.nt',
        'test_file': f'{sample}-test.nt',
    }
    for sample in ('clg_10e4', 'clg_10e5')
]

In [10]:
# Train and evaluate both GAT variants on every CaLiGraph sample.
for db_ in CLG_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Keep only the rows whose predicate column 'p' is rdfs:subClassOf, then
    # build one graph per split from the filtered triples.
    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes/Classes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes/Classes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant, run in the original
    # order; a fresh GNN is created per variant so no state is shared.
    for variant in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, variant)
        # The second argument is capped at 100 — presumably a sample/candidate
        # count; confirm against GNN._eval in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), variant)
        et = time.perf_counter()
        elapsed_time = et - st
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.0f} minutes')
        print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 59956, # Nodes/Classes: 10311, # Edges: 59956
# Test - Triplets: 17132, # Nodes/Classes: 7866, # Edges: 17132

GAT:
Epoch: 0, Loss: 0.3498
Epoch: 50, Loss: 0.1954
Epoch: 100, Loss: 0.1636
Epoch: 150, Loss: 0.1653
Epoch: 200, Loss: 0.1612
Epoch: 250, Loss: 0.1625
Epoch: 300, Loss: 0.1573
Epoch: 350, Loss: 0.1558
Epoch: 400, Loss: 0.1577
Epoch: 450, Loss: 0.1583
Epoch: 500, Loss: 0.1572
Epoch: 550, Loss: 0.1552
Epoch: 600, Loss: 0.1680
Epoch: 650, Loss: 0.1542
Epoch: 700, Loss: 0.1523
Epoch: 750, Loss: 0.1533
Epoch: 800, Loss: 0.1550
Precision: 0.814, Recall: 0.648, F1-Score: 0.721

head, relation -> tail?
hits@1: 0.306, hits@10: 0.523
-------------------------------------------
Run time: 599 seconds, 10 minutes

GAT_2hops:
Epoch: 0, Loss: 0.2596
Epoch: 50, Loss: 0.1841
Epoch: 100, Loss: 0.1748
Epoch: 150, Loss: 0.1771
Epoch: 200, Loss: 0.1759
Epoch: 250, Loss: 0.1751
Epoch: 300, Loss: 0.1728
Epoch: 350, Loss: 0.1723
Epoch