In [1]:
import pandas as pd
import numpy as np
import operator
import gzip
import networkx as nx
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, Linear, to_hetero
from torch_geometric.utils import negative_sampling

from sklearn.metrics import precision_score, recall_score, f1_score

from src.utils import *
from src.gnn import *

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
OWL2Bench_dbs = [{'path' : './datasets/OWL2Bench/OWL2Bench1/',
                  'train_file' : '_train_OWL2Bench1',
                  'test_file' : '_test_OWL2Bench1'},
                 {'path' : './datasets/OWL2Bench/OWL2Bench2/',
                  'train_file' : '_train_OWL2Bench2',
                  'test_file' : '_test_OWL2Bench2'}]

In [4]:
db_ = OWL2Bench_dbs[1]    
path = db_['path']
train_file = db_['train_file']
test_file = db_['test_file']

print('Running...', train_file, test_file)

df_train = load_ore_files(path+train_file)
df_train = df_train[df_train['p'] == 'SubClassOf']
g_train, nodes_train, edges_train = create_graph(df_train)
print(f'# Train - Triplets: {len(df_train)}, # Nodes: {g_train.number_of_nodes()}, # Edges: {g_train.number_of_edges()}')
df_test = load_ore_files(path+test_file)
df_test = df_test[df_test['p'] == 'SubClassOf']
g_test, nodes_test, edges_test = create_graph(df_test)
print(f'# Test - Triplets: {len(df_test)}, # Nodes: {g_test.number_of_nodes()}, # Edges: {g_test.number_of_edges()}')
print()

Running... _train_OWL2Bench2 _test_OWL2Bench2
# Train - Triplets: 105, # Nodes: 115, # Edges: 105
# Test - Triplets: 30, # Nodes: 43, # Edges: 30



In [5]:
adj = nx.to_scipy_sparse_array(g_train)
adj_2hops = adj.dot(adj)

pos_edge_index = torch_geometric.utils.from_scipy_sparse_matrix(adj)[0]
pos_edge_index_2hops = torch_geometric.utils.from_scipy_sparse_matrix(adj_2hops)[0]

neg_edge_index = negative_sampling(pos_edge_index)
neg_edge_index_2hops = negative_sampling(pos_edge_index_2hops)

edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
edge_index_2hops = torch.cat([pos_edge_index_2hops, neg_edge_index_2hops], dim=1).to(device)

num_nodes = g_train.number_of_nodes()
node_embed = torch.rand(num_nodes, 200).to(device)

model = GAT_2hops(200, 200).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
targets = torch.cat([torch.ones(pos_edge_index.shape[1]), torch.zeros(neg_edge_index.shape[1])])
edge_index, targets = shuffle_predictions_targets(edge_index, targets, device)

In [6]:
for i in range(1 + 1):
    model.train()
    optimizer.zero_grad()

    output = model(node_embed, edge_index, edge_index_2hops).to(device)
    
    u = torch.index_select(output, 0, edge_index[0, :])
    v = torch.index_select(output, 0, edge_index[1, :])
    pred = torch.sum(u * v, dim=-1)
    pred = ((pred - pred.min()) / (pred.max() - pred.min()))

    loss = mse_loss(pred, targets)
    loss.backward()
    optimizer.step()

    if i % 50 == 0:
        hits1, hits10 = eval_hits(tail_pred=1, g_test=g_test, pos_edge_index=pos_edge_index.to(device), output=output, max_num=min(g_test.number_of_nodes(),100), device=device)
        print(f'Epoch: {i}, Loss: {loss:.4f}, Hits@1: {hits1:.3f}, Hits@10: {hits10:.3f}')

Epoch: 0, Loss: 0.3647, Hits@1: 0.010, Hits@10: 0.276


In [7]:
with torch.no_grad():
    model.eval()
    
    adj = nx.to_scipy_sparse_array(g_test)
    adj_2hops = adj.dot(adj)
    
    pos_edge_index = torch_geometric.utils.from_scipy_sparse_matrix(adj)[0]
    pos_edge_index_2hops = torch_geometric.utils.from_scipy_sparse_matrix(adj_2hops)[0]
    
    neg_edge_index = pos_edge_index_2hops
    neg_edge_index_2hops = negative_sampling(pos_edge_index_2hops)
    
    edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1).to(device)
    edge_index_2hops = torch.cat([pos_edge_index_2hops, neg_edge_index_2hops], dim=1)
    edge_index_2hops = edge_index_2hops.to(torch.int64).to(device)

    output = model(node_embed, edge_index, edge_index_2hops).to(device)

In [8]:
print(f'head, relation -> tail?')
hits1, hits10 = eval_hits(tail_pred=1, g_test=g_test, pos_edge_index=pos_edge_index.to(device), output=output, max_num=min(g_test.number_of_nodes(),100), device=device)
print(f'hits@1: {hits1:.3f}, hits@10: {hits10:.3f}')
print()

head, relation -> tail?
hits@1: 0.033, hits@10: 0.367



In [11]:
def subclass_transitivity_penalty(pos_edge_index, output):    
    '''SubClass Transitivity: A is a subclass of B, B is a subclass of C, then A must be a subclass of C. 
    If A is not a subclass of C in the first 10 predictions, we assign a penalty.'''

    count = 0
    penalty = 0

    for i in range(pos_edge_index.size(1)):
        A = pos_edge_index[0][i]
        B = pos_edge_index[1][i]

        indices = ((pos_edge_index[0] == B).nonzero(as_tuple=True)[0])
        for idx in indices:
            C = pos_edge_index[1][idx]

            dist = torch.cdist(output, torch.index_select(output, 0, A), p=2)
            dist_dict = {i: dist[i] for i in range(0, len(dist))}

            sorted_dict = dict(sorted(dist_dict.items(), key=operator.itemgetter(1), reverse=True))
            sorted_keys = list(sorted_dict.keys())

            ranks_dict = {sorted_keys[i]: i for i in range(0, len(sorted_keys))}
            rank = ranks_dict[C.item()]

            count += 1
            if rank > 100:
                penalty += 1
    return count, penalty

In [13]:
count, penalty = subclass_transitivity_penalty(pos_edge_index.to(device), output)
print(f'penalty: {penalty:.4f}, count: {count:.4f}, penalty/count: {(penalty/count):.4f}')

penalty: 1.0000, count: 5.0000, penalty/count: 0.2000
