In [1]:
import os
import os.path as osp

import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid

from ogb.nodeproppred import PygNodePropPredDataset


def prep_data(dataset_name:str, K:int, root=None):
    """Download a node-classification dataset and return it in a standard format.

    The dataset is built with a ``T.SIGN(K)`` transform; for 'arxiv' the graph
    is additionally passed through ``T.ToUndirected()`` and
    ``T.AddSelfLoops()`` before SIGN.

    Args:
        dataset_name: one of 'cora', 'pubmed', 'products', 'arxiv'
            (case-insensitive).
        K: value forwarded to ``T.SIGN``.
        root: directory handed to the dataset class as ``root``. When left as
            ``None`` the notebook-level variable ``path`` is used, which is
            what this function always did before the parameter existed.

    Returns:
        ``dataset[0]`` with these attributes added or overwritten:
        ``dataset_name``, ``num_nodes``, ``num_feats``, ``num_classes``,
        ``n_id`` and ``train_mask`` / ``val_mask`` / ``test_mask``.
        Despite their names, the three ``*_mask`` attributes are not boolean
        masks: for Planetoid data they are the indices where the boolean masks
        are True, and for OGB data they are whatever ``get_idx_split()``
        returns under 'train' / 'valid' / 'test'.

    Raises:
        AssertionError: if ``dataset_name`` is not one of the supported names.
    """
    possible_datasets = ['cora', 'pubmed', 'products', 'arxiv']
    dataset_name = dataset_name.lower()
    assert dataset_name in possible_datasets, f'Dataset {dataset_name} not available'

    # Fall back to the notebook-level `path` so existing two-argument calls
    # keep working exactly as before.
    if root is None:
        root = path

    # Build the transform pipeline. The original compared against the
    # misspelling 'arixv', so this branch was unreachable and arxiv silently
    # got the plain SIGN transform.
    if dataset_name == 'arxiv':
        transform = T.Compose([
            T.ToUndirected(),
            T.AddSelfLoops(),
            T.SIGN(K)
        ])
    else:
        transform = T.Compose([
            T.SIGN(K)
        ])

    # download data
    if dataset_name in ['arxiv', 'products']:
        dataset = PygNodePropPredDataset(
            f'ogbn-{dataset_name}',
            root=root,
            transform=transform
            )
    else:
        dataset = Planetoid(
            root=root,
            name=dataset_name.title(),  # e.g. 'cora' -> 'Cora'
            transform=transform,
            split='full'
            )

    # extract relevant information
    data = dataset[0]
    data.dataset_name = dataset_name  # already lower-cased above
    data.num_nodes, data.num_feats = data.x.shape
    data.num_classes = dataset.num_classes
    data.n_id = torch.arange(data.num_nodes)  # global node id

    # standardize the split representation to index tensors
    if hasattr(dataset, 'get_idx_split'):
        # OGB-style dataset: splits come from get_idx_split()
        masks = dataset.get_idx_split()
        data.train_mask = masks['train']
        data.val_mask = masks['valid']
        data.test_mask = masks['test']
        data.y = data.y.flatten()
    else:
        # Planetoid-style dataset: convert boolean masks to index tensors
        data.train_mask = torch.where(data.train_mask)[0]
        data.val_mask = torch.where(data.val_mask)[0]
        data.test_mask = torch.where(data.test_mask)[0]
    return data


# Root folder for everything this notebook downloads and saves: <cwd>/data
folder_path = osp.join(os.getcwd(), 'data')

# Attempt the creation directly instead of checking first, so there is no
# window between the existence check and makedirs. The message is still
# printed only when the directory was actually created by this run.
try:
    os.makedirs(folder_path)
except FileExistsError:
    pass
else:
    print("Directory '% s' created" % folder_path)


  from .autonotebook import tqdm as notebook_tqdm
Using backend: pytorch


Directory '/home/jharris/Desktop/approx_attention/setup/data' created


In [2]:
DATASET = 'cora'
K = 5

# prep_data reads the notebook-level `path` as the dataset root, so it has to
# be assigned (and exist on disk) before the call below.
path = osp.join(folder_path, DATASET)
if not osp.exists(path):
    os.makedirs(path)

data = prep_data(DATASET, K)

# Count i down from K to 0. Each pass first saves the current object under
# the k{i} filename and then removes the `x{i}` attribute, so every later
# file is written without the attributes deleted so far.
for i in reversed(range(K + 1)):
    filename = osp.join(path, f'{DATASET}_sign_k{i}.pth')
    torch.save(data, filename)
    del data[f'x{i}']

# release the in-memory copy before the next dataset is processed
del data


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [3]:
DATASET = 'pubmed'
K = 5

# prep_data reads the notebook-level `path` as the dataset root, so it has to
# be assigned (and exist on disk) before the call below.
path = osp.join(folder_path, DATASET)
if not osp.exists(path):
    os.makedirs(path)

data = prep_data(DATASET, K)

# Count i down from K to 0. Each pass first saves the current object under
# the k{i} filename and then removes the `x{i}` attribute, so every later
# file is written without the attributes deleted so far.
for i in reversed(range(K + 1)):
    filename = osp.join(path, f'{DATASET}_sign_k{i}.pth')
    torch.save(data, filename)
    del data[f'x{i}']

# release the in-memory copy before the next dataset is processed
del data


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!


In [4]:
DATASET = 'arxiv'
K = 5

# prep_data reads the notebook-level `path` as the dataset root, so it has to
# be assigned (and exist on disk) before the call below.
path = osp.join(folder_path, DATASET)
if not osp.exists(path):
    os.makedirs(path)

data = prep_data(DATASET, K)

# Count i down from K to 0. Each pass first saves the current object under
# the k{i} filename and then removes the `x{i}` attribute, so every later
# file is written without the attributes deleted so far.
for i in reversed(range(K + 1)):
    filename = osp.join(path, f'{DATASET}_sign_k{i}.pth')
    torch.save(data, filename)
    del data[f'x{i}']

# release the in-memory copy before the next dataset is processed
del data


Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:07<00:00, 11.52it/s]


Extracting /home/jharris/Desktop/approx_attention/setup/data/arxiv/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 13486.51it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 4056.39it/s]

Saving...



Done!


In [None]:
DATASET = 'products'
K = 5

# prep_data reads the notebook-level `path` as the dataset root, so it has to
# be assigned (and exist on disk) before the call below.
path = osp.join(folder_path, DATASET)
if not osp.exists(path):
    os.makedirs(path)

data = prep_data(DATASET, K)

# Count i down from K to 0. Each pass first saves the current object under
# the k{i} filename, prints where it went, and then removes the `x{i}`
# attribute, so every later file is written without the attributes deleted
# so far.
for i in reversed(range(K + 1)):
    filename = osp.join(path, f'{DATASET}_sign_k{i}.pth')
    torch.save(data, filename)
    print(filename)
    del data[f'x{i}']

# release the in-memory copy once every snapshot is written
del data


Downloading http://snap.stanford.edu/ogb/data/nodeproppred/products.zip


Downloaded 1.38 GB: 100%|██████████| 1414/1414 [01:21<00:00, 17.27it/s]


Extracting /home/jharris/Desktop/approx_attention/setup/data/products/products.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:02<00:00,  2.05s/it]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 3851.52it/s]


Saving...


Done!


/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k5.pth
/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k4.pth
/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k3.pth
/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k2.pth
/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k1.pth
/home/jharris/Desktop/approx_attention/setup/data/products/products_sign_k0.pth


: 

In [None]:
# Sanity check: reload one of the saved snapshots and display its summary.
# The imports are repeated here so the cell also runs on a fresh kernel.
import os
import os.path as osp

import torch

# Build the location the same way the save cells do (<cwd>/data/<dataset>/...)
# instead of hard-coding one user's home directory, so the check works on any
# machine when run from the notebook's folder.
# NOTE: torch.load unpickles the file; only load snapshots produced by this notebook.
torch.load(osp.join(os.getcwd(), 'data', 'products', 'products_sign_k5.pth'))

  from .autonotebook import tqdm as notebook_tqdm


Data(num_nodes=2449029, edge_index=[2, 123718280], y=[2449029], x1=[2449029, 100], x2=[2449029, 100], x3=[2449029, 100], x4=[2449029, 100], x5=[2449029, 100], x0=[2449029, 100], dataset_name='products', num_feats=100, num_classes=47, n_id=[2449029], train_mask=[196615], val_mask=[39323], test_mask=[2213091])