<a href="https://colab.research.google.com/github/kbrezinski/CS224W-GraphML/blob/main/notebooks/practice_deep_encoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
print(torch.__version__)

from platform import python_version
print(python_version())

1.11.0+cu113
3.7.13


In [2]:
!pip install torch-scatter torch-sparse \
 torch-cluster torch-spline-conv torch-geometric \
-f https://data.pyg.org/whl/torch-1.11.0+cu113.html -q
#!pip install ogb

[K     |████████████████████████████████| 7.9 MB 5.3 MB/s 
[K     |████████████████████████████████| 3.5 MB 40.8 MB/s 
[K     |████████████████████████████████| 2.5 MB 39.8 MB/s 
[K     |████████████████████████████████| 750 kB 48.7 MB/s 
[K     |████████████████████████████████| 407 kB 5.4 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [3]:
import os
import numpy as np

from torch_geometric.data import Dataset, Data, Batch
from torch_geometric.loader import ClusterData, NeighborLoader, DataLoader

In [None]:
# torch_geometric.data.batch.Batch 
# -creates batch of disconencted graphs from list

# torch_geometric.data.data.Data
# -creates single graph object

# torch_geometric.data.cluster.ClusterData/ClusterLoader
# -group nodes into smaller subgraphs and load them in batches for faster computation

# torch_geometric.data.sampler.NeighborSampler
# -samples specific number of nodes in neighbor
# -sample training nodes only using node_idx

In [15]:
import pickle

rows = np.random.choice(100, 500)
cols = np.random.choice(100, 500)

data = dict(x = torch.rand((100, 16), dtype=torch.float), # 100 nodes, 16 features)
            edge_index = torch.tensor([rows, cols]), # (2, 500) random edges
            edge_attr = np.random.choice(3, 500), # 500 edges, choose from 0, 1 or 2
            y = torch.rand(100).round().long(),
)  

os.makedirs('data/raw', exist_ok=True)

with open('./data/g0.pickle', 'wb') as f:
  pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
g = Data(**data)
g

Data(x=[100, 16], edge_index=[2, 500], edge_attr=[500], y=[100])

In [17]:
# create batch of graphs using from_data_list
g2 = g
batch = Batch().from_data_list([g, g2])

In [18]:
# cluster data in 5 partitions, loop through N/5 nodes and retrieve edges
cluster = ClusterData(g, 5)
for c in cluster:
  print(c)
  break

Data(x=[20, 16], edge_attr=[500], y=[20], edge_index=[2, 36])


Computing METIS partitioning...
Done!


In [19]:
# for 4 nodes in the graph, sample (3 * 4), (10 * 40) neighbors for each iteration
sampler = NeighborLoader(g, num_neighbors=[3, 10],
                         batch_size=4, shuffle=False,
                         input_nodes=None) #data.train_mask,)
for s in sampler:
  print(s)
  break

Data(x=[54, 16], edge_index=[2, 64], edge_attr=[500], y=[54], batch_size=4)


In [41]:
import torch_geometric.transforms as transforms
# torch_geometric.transforms - list of functions transformations for graphs
# example transforms
# pre-transform - does so once its downloaded
# tranform - does so after its downloaded and retreived

transform = transforms.Compose([
            transforms.RandomNodeSplit('train_rest', num_val=50, num_test=50),
            transforms.TargetIndegree(),
])

In [49]:
import pickle

class SampleDataset(Dataset):
  def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
    super().__init__(root, transform, pre_transform, pre_filter)

  @property
  def raw_file_names(self):
    return ['g0.pickle']
    
  @property
  def processed_file_names(self):
    return ['g0.pt']

  def download(self):   
    rows = np.random.choice(100, 500)
    cols = np.random.choice(100, 500)

    data = dict(x = torch.rand((100, 16), dtype=torch.float), # 100 nodes, 16 features)
                edge_index = torch.tensor([rows, cols]), # (2, 500) random edges
                edge_attr = np.random.choice(3, 500), # 500 edges, choose from 0, 1 or 2
                y = torch.rand(100).round().long(),
    )  

    with open('data/raw/g0.pickle', 'wb') as f:
      pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

  def process(self):

    # create data object
    for file_name in self.raw_file_names:
      with open(os.path.join('data/raw', file_name), 'rb') as f:
        data = pickle.load(f)

        self.graph = Data(**data)

        if self.pre_filter is not None:
          continue

        if self.pre_transform is not None:
          self.graph = self.pre_transform(self.graph)

      torch.save(self.graph, os.path.join('data/processed',  file_name.split('.')[0] + '.pt'))

  def len(self):
    return len(self.processed_file_names)

  def get(self, idx):
    return torch.load(os.path.join('data/processed', f'g{idx}.pt'))
    
dataset = SampleDataset(root='data', transform=None, pre_transform=None)

In [51]:
#dataloader = DataLoader(dataset, batch_size=1)

data = next(iter(dataset))

# for 4 nodes in the graph, sample (3 * 4), (10 * 40) neighbors for each iteration
sampler = NeighborLoader(data, num_neighbors=[10] * 2,
                         batch_size=16, shuffle=True,
                         input_nodes=None) # pass in training/validation mask

In [53]:
for i in sampler:
  print(i)

Data(x=[93, 16], edge_index=[2, 270], edge_attr=[500], y=[93], batch_size=16)
Data(x=[97, 16], edge_index=[2, 315], edge_attr=[500], y=[97], batch_size=16)
Data(x=[96, 16], edge_index=[2, 298], edge_attr=[500], y=[96], batch_size=16)
Data(x=[97, 16], edge_index=[2, 324], edge_attr=[500], y=[97], batch_size=16)
Data(x=[95, 16], edge_index=[2, 326], edge_attr=[500], y=[95], batch_size=16)
Data(x=[95, 16], edge_index=[2, 287], edge_attr=[500], y=[95], batch_size=16)
Data(x=[62, 16], edge_index=[2, 87], edge_attr=[500], y=[62], batch_size=4)
