#### Analyzing Neighbor Sampler and HGT Loader  
- neighbor_sampler docs: [here](https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.NeighborLoader)  
- hgt_loader docs: [here](https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.HGTLoader)  

In [1]:
import os
import gc

import torch

from torch_geometric.data import HeteroData
from torch_geometric.datasets import DBLP
from torch_geometric.loader import NeighborLoader, HGTLoader

from utils import *

In [6]:
# Pilot test: load the DBLP dataset from ./data/DBLP
# (resolved relative to the current working directory)
path = os.path.join(os.getcwd(), 'data/DBLP')
dataset = DBLP(path)
# the first element of the dataset is the graph used in the rest of this notebook
data = dataset[0]

In [7]:
# Simplify the graph: keep only the "author" and "paper" node types
hetero_data = HeteroData()

for node_type in ('author', 'paper'):
    node_x = data[node_type].x
    # each node type holds its feature matrix ...
    hetero_data[node_type].x = node_x
    # ... and a train_mask (necessary for train/test split);
    # here every node is marked as a training node
    hetero_data[node_type].train_mask = torch.full((node_x.size(0), ), True)

In [8]:
# Create the edge types and build the graph connectivity.
# Every edge_index has shape (2, num_edges).
# For the edge type ('paper', 'written_by', 'author') the edge_index looks like
# [[paper, paper, paper, ...],
#  [author, author, author, ...]]
# i.e. the 1st row of edge_index holds the source nodes
# and the 2nd row holds the target nodes (flow: source_to_target)
hetero_data['paper', 'written_by', 'author'].edge_index = data['paper', 'to', 'author']['edge_index']
hetero_data['author', 'write', 'paper'].edge_index = data['author', 'to', 'paper']['edge_index']

# the original DBLP graph is not used below in this notebook, so drop it and collect garbage
del data
_ = gc.collect()

In [9]:
# show the reduced graph: its node types and edge types with their tensor shapes
print(hetero_data)

HeteroData(
  author={
    x=[4057, 334],
    train_mask=[4057]
  },
  paper={
    x=[14328, 4231],
    train_mask=[14328]
  },
  (paper, written_by, author)={ edge_index=[2, 19645] },
  (author, write, paper)={ edge_index=[2, 19645] }
)


In [11]:
# paper -> author connectivity: row 0 holds the source (paper) indices,
# row 1 holds the target (author) indices
print(hetero_data[('paper', 'written_by', 'author')].edge_index)

tensor([[    0,     1,     2,  ..., 14327, 14327, 14327],
        [  262,   263,   263,  ...,   324,  1068,  3647]])


In [12]:
# author -> paper connectivity: row 0 holds the source (author) indices,
# row 1 holds the target (paper) indices
print(hetero_data[('author', 'write', 'paper')].edge_index)

tensor([[    0,     0,     1,  ...,  4054,  4055,  4056],
        [ 2364,  6457,  2365,  ..., 13891, 13891, 13892]])


#### Neighbor Sampler

In [19]:
# neighbor sampler
# seed nodes: the authors selected by the author train_mask
train_input_nodes = ('author', hetero_data['author'].train_mask)
# alternative: use the papers as seed nodes instead
# train_input_nodes = ('paper', hetero_data['paper'].train_mask)
kwargs = {'batch_size': 5}

train_loader = NeighborLoader(
    hetero_data,
    # Sample 1 neighbor for each node, for 1 iteration.
    # The number of iterations can be thought of as the number of
    # graph convolutional layers (num_layers).
    num_neighbors=[1] * 1,
    shuffle=True,
    input_nodes=train_input_nodes,
    **kwargs
)


In [20]:
# Fetch a single mini-batch from the loader so that it can be inspected below
# (no model is defined in the visible part of this notebook)
# NOTE(review): get_device is presumably provided by `from utils import *` — confirm
device = get_device()
batch = next(iter(train_loader))
# NOTE(review): the extra 'edge_index' argument presumably restricts the device
# transfer to that attribute — confirm against the PyG `to()` documentation
batch = batch.to(device, 'edge_index')

In [21]:
# you can see that the shape of the (paper, written_by, author) edge_index is (2, 5)
# because batch_size=5 and num_neighbors=1:
# one paper neighbor was sampled for each of the 5 seed authors
print(batch)

HeteroData(
  author={
    x=[5, 334],
    train_mask=[5],
    batch_size=5
  },
  paper={
    x=[5, 4231],
    train_mask=[5]
  },
  (paper, written_by, author)={ edge_index=[2, 5] },
  (author, write, paper)={ edge_index=[2, 0] }
)


In [22]:
# let's change the settings a little:
# ask for 10 neighbors per node (still 1 iteration) and use 2 seed authors per batch
train_loader = NeighborLoader(
    hetero_data,
    num_neighbors=[10] * 1,
    shuffle=True,
    input_nodes=train_input_nodes,
    batch_size=2,
)

# fetch one mini-batch from the new loader and inspect it
batch = next(iter(train_loader))
batch = batch.to(device, 'edge_index')
print(batch)

HeteroData(
  author={
    x=[2, 334],
    train_mask=[2],
    batch_size=2
  },
  paper={
    x=[8, 4231],
    train_mask=[8]
  },
  (paper, written_by, author)={ edge_index=[2, 8] },
  (author, write, paper)={ edge_index=[2, 0] }
)


In [23]:
# the exact shape of edge_index cannot be predicted in advance,
# because a sampled author may have fewer neighbors than num_neighbors asks for
# (in the recorded output the 2 seed authors contribute 3 and 5 edges,
# i.e. 8 edges instead of 2 * 10 = 20)
print(batch[('paper', 'written_by', 'author')].edge_index)

tensor([[0, 1, 2, 3, 4, 5, 6, 7],
        [0, 0, 0, 1, 1, 1, 1, 1]], device='cuda:0')


#### HGT Loader

`HGTLoader` is a subclass of `BaseDataLoader`:  

```python
class BaseDataLoader(DataLoader):
    r"""Extends the :class:`torch.utils.data.DataLoader` by integrating a
    custom :meth:`self.transform_fn` function to allow transformation of a
    returned mini-batch directly inside the main process.
    """
    def _get_iterator(self) -> Iterator:
        iterator = super()._get_iterator()
        if hasattr(self, 'transform_fn'):
            iterator = DataLoaderIterator(iterator, self.transform_fn)
        return iterator
```

**num_samples**  
The number of nodes to sample in each iteration and for each node type.  
If given as a list, the same number of nodes is sampled for each node type.

**input_nodes**  
The indices of nodes for which neighbors are sampled to create mini-batches.  
Needs to be passed as a tuple that holds the node type and corresponding node indices.  
If the node indices are set to `None`, all nodes of this specific type will be considered.  

In [72]:
# let's find out how many papers "author 0" wrote:
# look up the columns of the paper -> author edge_index whose target (row 1) is author 0.
# Note that the printed values are edge positions (column indices into edge_index),
# not paper ids; the number of printed values is the number of papers.
e = hetero_data[('paper', 'written_by', 'author')].edge_index
print(list(torch.where(e[1, :] == 0)[0].numpy()))

[3022, 8517]


In [73]:
# since "author 0" has 2 neighbors (the 2 edges found above),
# we can predict that the shape of the batch's paper -> author edge_index
# will be (2, 2), regardless of the num_samples argument
# NOTE(review): this presumably relies on no other paper -> author edges being
# reachable from author 0 within the sampled nodes — verify for other seed nodes

train_input_nodes = ('author', hetero_data['author'].train_mask)

train_loader = HGTLoader(
    hetero_data,
    # a list: the same number of nodes (32) is sampled for each node type,
    # one list entry per iteration (4 iterations)
    num_samples=[32] * 4,
    # note that shuffle is set to False here
    # (presumably so that the first batch is seeded with author 0 — confirm)
    shuffle=False,
    input_nodes=train_input_nodes,
    batch_size=1
    )

In [76]:
# take the first mini-batch of the HGTLoader and print its paper -> author edges
# (edge_index is moved back to the CPU so that it can be converted to a NumPy array)
batch = next(iter(train_loader))
batch = batch.to(device, 'edge_index')
print(batch[('paper', 'written_by', 'author')].edge_index.cpu().numpy())

[[0 1]
 [0 0]]
