# Libraries

In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import random
# Seed Python's built-in `random` generator with a fixed value.
# Only the stdlib RNG is seeded here; NumPy and PyTorch generators are not
# touched by this cell.
# NOTE(review): confirm whether src.gnn seeds torch/numpy itself — if not,
# the 'Random Init' runs below may not be reproducible across kernels.
random.seed(10)

from node2vec import Node2Vec

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Record the installed PyTorch version alongside the results below.
print(torch.__version__)

1.13.1


In [3]:
# Record the installed PyTorch Geometric version alongside the results below.
print(torch_geometric.__version__)

2.4.0


# OWL2Bench

In [4]:
# One entry per OWL2Bench split: the dataset directory plus the train/test
# file names found inside it (the directory and file names share the split index).
OWL2Bench_dbs = [
    {
        'path': f'./datasets/OWL2Bench/OWL2Bench{split}/',
        'train_file': f'_train_OWL2Bench{split}',
        'test_file': f'_test_OWL2Bench{split}',
    }
    for split in (1, 2)
]

In [5]:
# For every OWL2Bench split: build the train and test graphs from the
# SubClassOf triples only, then train a fresh GNN ('GAT', 'Random Init')
# and evaluate it on the test graph.
for db_ in OWL2Bench_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']
    print('Running...', train_file, test_file)

    # --- training split: restrict to rows whose predicate is SubClassOf ---
    df_train = load_ore_files(f'{path}{train_file}')
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(
        f'# Train - Triplets: {len(df_train)}, '
        f'# Nodes/Classes: {g_train.number_of_nodes()}, '
        f'# Edges: {g_train.number_of_edges()}'
    )

    # --- test split: same filtering as the training split ---
    df_test = load_ore_files(f'{path}{test_file}')
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(
        f'# Test - Triplets: {len(df_test)}, '
        f'# Nodes/Classes: {g_test.number_of_nodes()}, '
        f'# Edges: {g_test.number_of_edges()}'
    )
    print()

    # A new model per split, so nothing carries over between datasets.
    model = GNN()
    model._train(g_train, 'GAT', 'Random Init')
    # The second argument is capped at 100, or at the test-graph node count
    # when the graph has fewer than 100 nodes.
    model._eval(
        g_test,
        min(100, g_test.number_of_nodes()),
    )
    print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 105, # Nodes/Classes: 113, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 45, # Edges: 30

GAT + Random Init:
Epoch: 0, Loss: 0.4806
Epoch: 50, Loss: 0.0795
Epoch: 100, Loss: 0.0711
Epoch: 150, Loss: 0.0632
Epoch: 200, Loss: 0.0644
Precision: 0.684, Recall: 0.433, F1-Score: 0.531

head, relation -> tail?
hits@1: 0.033, hits@10: 0.300, hits@100: 0.300


Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 105, # Nodes/Classes: 115, # Edges: 105
# Test - Triplets: 30, # Nodes/Classes: 43, # Edges: 30

GAT + Random Init:
Epoch: 0, Loss: 0.4984
Epoch: 50, Loss: 0.0952
Epoch: 100, Loss: 0.0689
Epoch: 150, Loss: 0.0467
Epoch: 200, Loss: 0.0383
Precision: 0.724, Recall: 0.700, F1-Score: 0.712

head, relation -> tail?
hits@1: 0.067, hits@10: 0.300, hits@100: 0.300




# ORE

In [6]:
# One entry per ORE split: the dataset directory plus the train/test file
# names found inside it (the directory and file names share the split index).
ORE_dbs = [
    {
        'path': f'./datasets/ORE/ORE{split}/',
        'train_file': f'_train_ORE{split}',
        'test_file': f'_test_ORE{split}',
    }
    for split in (1, 2, 3)
]

In [7]:
# For every ORE split: build the train and test graphs from the SubClassOf
# triples only, then train a fresh GNN ('GAT', 'Random Init') and evaluate
# it on the test graph.
for db_ in ORE_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']
    print('Running...', train_file, test_file)

    # --- training split: restrict to rows whose predicate is SubClassOf ---
    df_train = load_ore_files(f'{path}{train_file}')
    df_train = df_train[df_train['p'] == 'SubClassOf']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(
        f'# Train - Triplets: {len(df_train)}, '
        f'# Nodes/Classes: {g_train.number_of_nodes()}, '
        f'# Edges: {g_train.number_of_edges()}'
    )

    # --- test split: same filtering as the training split ---
    df_test = load_ore_files(f'{path}{test_file}')
    df_test = df_test[df_test['p'] == 'SubClassOf']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(
        f'# Test - Triplets: {len(df_test)}, '
        f'# Nodes/Classes: {g_test.number_of_nodes()}, '
        f'# Edges: {g_test.number_of_edges()}'
    )
    print()

    # A new model per split, so nothing carries over between datasets.
    model = GNN()
    model._train(g_train, 'GAT', 'Random Init')
    # The second argument is capped at 100, or at the test-graph node count
    # when the graph has fewer than 100 nodes.
    model._eval(
        g_test,
        min(100, g_test.number_of_nodes()),
    )
    print()

Running... _train_ORE1 _test_ORE1
# Train - Triplets: 8194, # Nodes/Classes: 6654, # Edges: 8194
# Test - Triplets: 2342, # Nodes/Classes: 3052, # Edges: 2342

GAT + Random Init:
Epoch: 0, Loss: 0.3831
Epoch: 50, Loss: 0.2063
Epoch: 100, Loss: 0.1985
Epoch: 150, Loss: 0.1954
Epoch: 200, Loss: 0.1963
Precision: 0.718, Recall: 0.518, F1-Score: 0.601

head, relation -> tail?
hits@1: 0.003, hits@10: 0.015, hits@100: 0.015


Running... _train_ORE2 _test_ORE2
# Train - Triplets: 8204, # Nodes/Classes: 6650, # Edges: 8204
# Test - Triplets: 2344, # Nodes/Classes: 3102, # Edges: 2344

GAT + Random Init:
Epoch: 0, Loss: 0.3822
Epoch: 50, Loss: 0.2064
Epoch: 100, Loss: 0.1989
Epoch: 150, Loss: 0.1985
Epoch: 200, Loss: 0.1926
Precision: 0.678, Recall: 0.676, F1-Score: 0.677

head, relation -> tail?
hits@1: 0.003, hits@10: 0.014, hits@100: 0.014


Running... _train_ORE3 _test_ORE3
# Train - Triplets: 8187, # Nodes/Classes: 6673, # Edges: 8187
# Test - Triplets: 2340, # Nodes/Classes: 3045, # Edges

# CaLiGraph

In [8]:
# One entry per CaLiGraph sample: the dataset directory plus the N-Triples
# train/test file names found inside it (all keyed by the same size tag).
CLG_dbs = [
    {
        'path': f'./datasets/clg/clg_{size}/',
        'train_file': f'clg_{size}-train.nt',
        'test_file': f'clg_{size}-test.nt',
    }
    for size in ('10e4', '10e5')
]

In [9]:
# For every CaLiGraph sample: build the train and test graphs from the
# rdfs:subClassOf triples only, then train a fresh GNN ('GAT', 'Random Init')
# and evaluate it on the test graph.
for db_ in CLG_dbs:
    path, train_file, test_file = db_['path'], db_['train_file'], db_['test_file']
    print('Running...', train_file, test_file)

    # --- training split: restrict to rows whose predicate is rdfs:subClassOf ---
    df_train = load_clg_files(f'{path}{train_file}')
    df_train = df_train[
        df_train['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>'
    ]
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(
        f'# Train - Triplets: {len(df_train)}, '
        f'# Nodes/Classes: {g_train.number_of_nodes()}, '
        f'# Edges: {g_train.number_of_edges()}'
    )

    # --- test split: same filtering as the training split ---
    df_test = load_clg_files(f'{path}{test_file}')
    df_test = df_test[
        df_test['p'] == '<http://www.w3.org/2000/01/rdf-schema#subClassOf>'
    ]
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(
        f'# Test - Triplets: {len(df_test)}, '
        f'# Nodes/Classes: {g_test.number_of_nodes()}, '
        f'# Edges: {g_test.number_of_edges()}'
    )
    print()

    # A new model per sample, so nothing carries over between datasets.
    model = GNN()
    model._train(g_train, 'GAT', 'Random Init')
    # The second argument is capped at 100, or at the test-graph node count
    # when the graph has fewer than 100 nodes.
    model._eval(
        g_test,
        min(100, g_test.number_of_nodes()),
    )
    print()

Running... clg_10e4-train.nt clg_10e4-test.nt
# Train - Triplets: 59956, # Nodes/Classes: 10311, # Edges: 59956
# Test - Triplets: 17132, # Nodes/Classes: 7866, # Edges: 17132

GAT + Random Init:
Epoch: 0, Loss: 0.2529
Epoch: 50, Loss: 0.1712
Epoch: 100, Loss: 0.1641
Epoch: 150, Loss: 0.1616
Epoch: 200, Loss: 0.1714
Precision: 0.741, Recall: 0.651, F1-Score: 0.693

head, relation -> tail?
hits@1: 0.011, hits@10: 0.057, hits@100: 0.057


Running... clg_10e5-train.nt clg_10e5-test.nt
# Train - Triplets: 96273, # Nodes/Classes: 75195, # Edges: 96273
# Test - Triplets: 27508, # Nodes/Classes: 26675, # Edges: 27508

GAT + Random Init:
Epoch: 0, Loss: 0.4921
Epoch: 50, Loss: 0.1234
Epoch: 100, Loss: 0.1134
Epoch: 150, Loss: 0.1163
Epoch: 200, Loss: 0.1144
Precision: 0.848, Recall: 0.906, F1-Score: 0.876

head, relation -> tail?
hits@1: 0.014, hits@10: 0.080, hits@100: 0.080


