In [1]:
import pandas as pd
import numpy as np
import random
random.seed(10)

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F

import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import MetaPath2Vec

from src.utils import *
from src.gnn import *
from src.metapath2vec import *

# OWL2Bench

In [2]:
# Descriptors for the OWL2Bench datasets: one dict per variant holding the
# dataset directory, the train/test file names and the dataset name.
OWL2Bench_dbs = [
    {
        'path': './datasets/OWL2Bench/OWL2Bench1/',
        'train_file': '_train_OWL2Bench1',
        'test_file': '_test_OWL2Bench1',
        'file': 'OWL2Bench1',
    },
    {
        'path': './datasets/OWL2Bench/OWL2Bench2/',
        'train_file': '_train_OWL2Bench2',
        'test_file': '_test_OWL2Bench2',
        'file': 'OWL2Bench2',
    },
]

In [3]:
# ORE
# Descriptors for the three ORE datasets, using the same four keys as the
# other dataset lists in this notebook.
ORE_dbs = [
    {
        'path': './datasets/ORE/ORE1/',
        'train_file': '_train_ORE1',
        'test_file': '_test_ORE1',
        'file': 'ORE1',
    },
    {
        'path': './datasets/ORE/ORE2/',
        'train_file': '_train_ORE2',
        'test_file': '_test_ORE2',
        'file': 'ORE2',
    },
    {
        'path': './datasets/ORE/ORE3/',
        'train_file': '_train_ORE3',
        'test_file': '_test_ORE3',
        'file': 'ORE3',
    },
]

In [4]:
# CaLiGraph
# Descriptors for the two CaLiGraph datasets; here the train/test entries are
# `.nt` file names.
CLG_dbs = [
    {
        'path': './datasets/clg/clg_10e4/',
        'train_file': 'clg_10e4-train.nt',
        'test_file': 'clg_10e4-test.nt',
        'file': 'clg_10e4',
    },
    {
        'path': './datasets/clg/clg_10e5/',
        'train_file': 'clg_10e5-train.nt',
        'test_file': 'clg_10e5-test.nt',
        'file': 'clg_10e5',
    },
]

In [5]:
# Choose the dataset for this run (the second ORE entry) and unpack its
# descriptor into the names used by the following cells.
db_ = ORE_dbs[1]
path, train_file, test_file, file = (
    db_[key] for key in ('path', 'train_file', 'test_file', 'file')
)

In [6]:
# Load the train and test splits of the selected dataset and turn them into
# a pair of heterogeneous graphs.
df_train = load_ore_files(path + train_file)
df_test = load_ore_files(path + test_file)
g_train, g_test = get_heterodata(df_train, df_test)

node_types, edge_types = g_train.metadata()

# Attach random 10-dimensional features to every node type of the training
# graph. They are drawn from a dedicated, seeded generator: `random.seed(10)`
# in the import cell only seeds Python's `random` module, not torch, so
# without this the features would differ on every kernel restart.
feature_rng = torch.Generator().manual_seed(10)
for node in node_types:
    g_train[node].x = torch.rand(g_train[node].num_nodes, 10,
                                 generator=feature_rng)

In [7]:
# Re-read the node and edge type lists of the training graph.
node_types, edge_types = g_train.node_types, g_train.edge_types

In [13]:
# Build the metapath only from relations that actually carry an `edge_index`.
# The edge type list can also contain stores that exist but are empty (the
# printed graph shows `(Class, rev_SubClassOf, Class)={}`), and MetaPath2Vec
# only receives adjacency for the entries of `edge_index_dict`.
edge_index_dict = g_train.edge_index_dict
metapath = [edge_type for edge_type in edge_types if edge_type in edge_index_dict]
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pass the real node counts instead of letting the model infer them from the
# largest index that appears in an edge, so nodes without edges are covered.
num_nodes_dict = {node_type: g_train[node_type].num_nodes
                  for node_type in node_types}

# NOTE(review): the IndexError recorded in the cells below is raised inside
# torch_geometric's metapath2vec `sample` (`col = col[rand]`). Presumably it is
# triggered by nodes that have no outgoing edge for some relation in the
# installed torch_geometric version -- TODO confirm, e.g. by retrying with a
# newer torch_geometric release.
model = MetaPath2Vec(edge_index_dict, embedding_dim=128,
                     metapath=metapath, walk_length=10, context_size=10,
                     walks_per_node=5, num_negative_samples=5,
                     num_nodes_dict=num_nodes_dict,
                     sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True)
# Sparse embeddings (sparse=True above) are paired with SparseAdam.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [14]:
# Inspect the edge store of the (Class, SubClassOf, Class) relation.
g_train[('Class', 'SubClassOf', 'Class')]

{'edge_index': tensor([[   0,    2,    4,  ..., 6648, 2478, 2582],
        [   1,    3,    5,  ..., 6649,  685, 6423]])}

In [15]:
# Display the summary of the training graph (its node and edge stores).
g_train

HeteroData(
  Class={ num_nodes=9200 },
  (Class, SubClassOf, Class)={ edge_index=[2, 8204] },
  (Class, ClassAssertion, Class)={ edge_index=[2, 53081] },
  (Class, EquivalentClasses, Class)={ edge_index=[2, 4] },
  (Class, rev_ SubClassOf, Class)={ edge_index=[2, 8204] },
  (Class, rev_ ClassAssertion, Class)={ edge_index=[2, 53081] },
  (Class, rev_ EquivalentClasses, Class)={ edge_index=[2, 4] },
  (Class, rev_SubClassOf, Class)={}
)

In [16]:
# Inspect the reversed SubClassOf relation. The key is 'rev_ SubClassOf', with
# a space after the underscore; the graph summary also lists a separate
# 'rev_SubClassOf' store (no space) that has no edge_index.
g_train[('Class', 'rev_ SubClassOf', 'Class')]

{'edge_index': tensor([[   1,    3,    5,  ..., 6649,  685, 6423],
        [   0,    2,    4,  ..., 6648, 2478, 2582]])}

In [17]:
# Pull a single batch from the loader to check that sampling works.
next(iter(loader))

IndexError: index 8204 is out of bounds for dimension 0 with size 8204

In [13]:
# Print the index and the positive / negative random walks of the first batch
# only, then stop iterating.
indexed_batches = enumerate(loader)
for i, (pos_rw, neg_rw) in indexed_batches:
    print(i, pos_rw, neg_rw, sep='\n')
    break

IndexError: index 105 is out of bounds for dimension 0 with size 105

In [30]:
def train(epoch, log_steps=100, eval_steps=2000):
    """Run one pass over ``loader``, optimising the notebook-level ``model``.

    Uses the notebook globals ``model``, ``loader``, ``optimizer`` and
    ``device``, and calls a zero-argument ``test()`` that is not defined in
    this cell.

    Args:
        epoch: Epoch number; only used in the printed progress messages.
        log_steps: After every ``log_steps`` batches, print the loss summed
            since the last report divided by ``log_steps`` and reset the sum.
        eval_steps: After every ``eval_steps`` batches, call ``test()`` and
            print the value it returns.
    """
    model.train()

    total_loss = 0
    for i, batch in enumerate(loader):
        pos_rw, neg_rw = batch

        # One optimisation step on this batch of positive / negative walks.
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Batches are counted from 1 when deciding whether to report.
        if not (i + 1) % log_steps:
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, Loss: {total_loss / log_steps:.4f}')
            total_loss = 0

        if not (i + 1) % eval_steps:
            acc = test()
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, Acc: {acc:.4f}')

In [31]:
# Iterate over the whole loader and print every (index, batch) pair;
# note that `i` is the full tuple produced by `enumerate`, not just the index.
for i in enumerate(loader):
    print(i)

IndexError: index 105 is out of bounds for dimension 0 with size 105

In [12]:
# Train for five epochs and report the accuracy after each one.
# `train` takes only the epoch number (it reads `model` and `loader` from the
# notebook namespace) and returns nothing, so the accuracy is obtained here
# from `test()` -- the same zero-argument call `train` uses for its periodic
# evaluation.
for epoch in range(1, 6):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\julie\anaconda3\lib\site-packages\torch\utils\data\_utils\worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "C:\Users\julie\anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\julie\anaconda3\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 193, in _sample
    return self._pos_sample(batch), self._neg_sample(batch)
  File "C:\Users\julie\anaconda3\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 151, in _pos_sample
    batch = sample(
  File "C:\Users\julie\anaconda3\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 248, in sample
    col = col[rand]
IndexError: index 105 is out of bounds for dimension 0 with size 105
