# Libraries

In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import time
import random
random.seed(10)

from node2vec import Node2Vec

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
# Show the installed PyTorch version so the recorded results can be tied to it.
print(torch.__version__)

1.13.1


In [3]:
# Show the installed PyTorch Geometric version so the recorded results can be tied to it.
print(torch_geometric.__version__)

2.4.0


In [4]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
# NOTE(review): `device` is not referenced by any later cell visible in this notebook;
# confirm whether GNN() selects its own device internally.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


# OWL2Bench

In [5]:
# One entry per OWL2Bench split: the dataset directory plus the train/test
# file names that the experiment loop concatenates onto that directory.
OWL2Bench_dbs = [
    dict(zip(('path', 'train_file', 'test_file'), entry))
    for entry in (
        ('./datasets/OWL2Bench/OWL2Bench1/', '_train_OWL2Bench1', '_test_OWL2Bench1'),
        ('./datasets/OWL2Bench/OWL2Bench2/', '_train_OWL2Bench2', '_test_OWL2Bench2'),
    )
]

In [6]:
# Train and evaluate both GAT variants on every OWL2Bench split.
# `load_ore_files`, `create_graph` and `GNN` are provided by the star imports
# at the top of the notebook (presumably src.utils / src.gnn — confirm there).
for db_ in OWL2Bench_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Only the ClassAssertion triples are kept before each graph is built.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant; a fresh GNN is
    # created per variant so no state carries over between the two runs.
    for model_name in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, model_name)
        # The second argument is capped at 100 and never exceeds the number of
        # test nodes — presumably a sample/candidate count; verify in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), model_name)
        et = time.perf_counter()
        elapsed_time = et - st
        # One decimal for minutes: whole-minute rounding reported a 37 s run as "1 minutes".
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.1f} minutes')
        print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
# Train - Triplets: 7989, # Nodes: 3633, # Edges: 7989
# Test - Triplets: 2283, # Nodes: 1836, # Edges: 2283

GAT:
Epoch: 0, Loss: 0.2979
Epoch: 50, Loss: 0.0485
Epoch: 100, Loss: 0.0291
Epoch: 150, Loss: 0.0158
Epoch: 200, Loss: 0.0114
Epoch: 250, Loss: 0.0156
Epoch: 300, Loss: 0.0166
Precision: 0.667, Recall: 0.252, F1-Score: 0.366

head, relation -> tail?
hits@1: 0.004, hits@10: 0.041
-------------------------------------------
Run time: 37 seconds, 1 minutes

GAT_2hops:
Epoch: 0, Loss: 0.3345
Epoch: 50, Loss: 0.0917
Epoch: 100, Loss: 0.0647


KeyboardInterrupt: 

# ORE

In [None]:
# One entry per ORE split: the dataset directory plus the train/test
# file names that the experiment loop concatenates onto that directory.
ORE_dbs = [
    dict(zip(('path', 'train_file', 'test_file'), entry))
    for entry in (
        ('./datasets/ORE/ORE1/', '_train_ORE1', '_test_ORE1'),
        ('./datasets/ORE/ORE2/', '_train_ORE2', '_test_ORE2'),
        ('./datasets/ORE/ORE3/', '_train_ORE3', '_test_ORE3'),
    )
]

In [None]:
# Train and evaluate both GAT variants on every ORE split.
# `load_ore_files`, `create_graph` and `GNN` are provided by the star imports
# at the top of the notebook (presumably src.utils / src.gnn — confirm there).
for db_ in ORE_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Only the ClassAssertion triples are kept before each graph is built.
    df_train = load_ore_files(path+train_file)
    df_train = df_train[df_train['p'] == 'ClassAssertion']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_ore_files(path+test_file)
    df_test = df_test[df_test['p'] == 'ClassAssertion']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant; a fresh GNN is
    # created per variant so no state carries over between the two runs.
    for model_name in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, model_name)
        # The second argument is capped at 100 and never exceeds the number of
        # test nodes — presumably a sample/candidate count; verify in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), model_name)
        et = time.perf_counter()
        elapsed_time = et - st
        # One decimal for minutes: whole-minute rounding would report a 37 s run as "1 minutes".
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.1f} minutes')
        print()

# CaLiGraph

In [None]:
# One entry per CaLiGraph sample: the dataset directory plus the train/test
# N-Triples file names that the experiment loop concatenates onto that directory.
CLG_dbs = [
    dict(zip(('path', 'train_file', 'test_file'), entry))
    for entry in (
        ('./datasets/clg/clg_10e4/', 'clg_10e4-train.nt', 'clg_10e4-test.nt'),
        ('./datasets/clg/clg_10e5/', 'clg_10e5-train.nt', 'clg_10e5-test.nt'),
    )
]

In [None]:
# Train and evaluate both GAT variants on every CaLiGraph sample.
# `load_clg_files`, `create_graph` and `GNN` are provided by the star imports
# at the top of the notebook (presumably src.utils / src.gnn — confirm there).
for db_ in CLG_dbs:
    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Only triples whose predicate is rdf:type are kept before each graph is built.
    df_train = load_clg_files(path+train_file)
    df_train = df_train[df_train['p'] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    g_train, nodes_train, edges_train = create_graph(df_train)
    print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
    df_test = load_clg_files(path+test_file)
    df_test = df_test[df_test['p'] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    g_test, nodes_test, edges_test = create_graph(df_test)
    print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
    print()

    # Same train/eval/timing procedure for each variant; a fresh GNN is
    # created per variant so no state carries over between the two runs.
    for model_name in ('GAT', 'GAT_2hops'):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long run.
        st = time.perf_counter()
        model = GNN()
        model._train(g_train, model_name)
        # The second argument is capped at 100 and never exceeds the number of
        # test nodes — presumably a sample/candidate count; verify in src.gnn.
        model._eval(g_test, min(g_test.number_of_nodes(), 100), model_name)
        et = time.perf_counter()
        elapsed_time = et - st
        # One decimal for minutes: whole-minute rounding would report a 37 s run as "1 minutes".
        print(f'Run time: {elapsed_time:.0f} seconds, {elapsed_time/60:.1f} minutes')
        print()