In [4]:
from ogb.nodeproppred import PygNodePropPredDataset
import torch 

# Load the ogbn-products node-property-prediction dataset; `root` is the
# directory handed to OGB for its on-disk copy of the data.
dataset = PygNodePropPredDataset(
    root='/Dataset/OGB/ogbn-products/Raw',
    name='ogbn-products',
)

# The dataset's own train/valid/test split, as three node-index collections.
split_idx = dataset.get_idx_split()
train_idx, val_idx, test_idx = (
    split_idx[part] for part in ("train", "valid", "test")
)

# Take the graph stored at index 0 and show its summary as the cell output.
g = dataset[0]
g

Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])

In [5]:
from collections import Counter 


def count_label(label, total):
    """Print the class distribution of ``label`` to stdout.

    Two lines are printed:

    1. ``"<num_classes>类，<N> (<p>%)"`` - the number of distinct labels, the
       number of entries ``N`` in ``label``, and ``N`` as a percentage of
       ``total``.
    2. One ``"<label>: <count> (<p>%)"`` entry per class, joined by ``", "``
       and ordered from most to least frequent (classes with equal counts
       keep first-seen order). Here ``p`` is the class's share of ``N``.

    Every percentage is truncated toward zero to a whole number, so rare
    classes are shown as ``0%``.

    Args:
        label: Flat collection of hashable class ids. Objects exposing
            ``.tolist()`` (e.g. a 1-D ``torch.Tensor``) are converted through
            it; any other iterable (list, tuple, ...) is consumed directly.
        total: Size of the reference population for the header percentage.
            A ``total`` of ``0`` is reported as ``0%`` instead of raising
            ``ZeroDivisionError``.

    Returns:
        None. The summary is only printed.
    """

    def _whole_percent(part, whole):
        # Truncated integer percentage; an empty population counts as 0%.
        if not whole:
            return 0
        return int(part * 100 / whole)

    # Tensors/arrays yield plain Python scalars via tolist(), which hash by
    # value; other iterables are materialised so they can be measured too.
    values = label.tolist() if hasattr(label, "tolist") else list(label)
    counter = Counter(values)

    # The per-class counts always add up to the number of entries, so this
    # one length serves as both the header count and the per-class denominator.
    n = len(values)

    print(f"{len(counter)}类，{n} ({_whole_percent(n, total)}%)")

    # most_common() sorts by descending count and is stable for ties.
    print(', '.join(
        f"{lb}: {cnt} ({_whole_percent(cnt, n)}%)"
        for lb, cnt in counter.most_common()
    ))


# Pull the pieces needed later out of the PyG graph object.
num_nodes = g.num_nodes
feat = g.x
label = g.y.squeeze()  # y is displayed above as [num_nodes, 1]; drop the size-1 dim
edge_index = tuple(g.edge_index)  # split the [2, E] tensor into its two rows

# One all-False boolean mask per split (three separate tensors), then mark
# the nodes listed in each split's index collection.
train_mask, val_mask, test_mask = (
    torch.zeros(num_nodes, dtype=torch.bool) for _ in range(3)
)
train_mask[train_idx.squeeze()] = True
val_mask[val_idx.squeeze()] = True
test_mask[test_idx.squeeze()] = True

# Label distribution over the whole graph first, then per split; each header
# percentage is relative to the full node count.
count_label(label, num_nodes)
for split_mask in (train_mask, val_mask, test_mask):
    count_label(label[split_mask], num_nodes)

47类，2449029 (100%)
4: 668950 (27%), 7: 172199 (7%), 6: 158771 (6%), 3: 151061 (6%), 12: 131886 (5%), 2: 116043 (4%), 0: 114294 (4%), 8: 110796 (4%), 1: 109832 (4%), 13: 101541 (4%), 16: 83594 (3%), 21: 80795 (3%), 9: 67358 (2%), 10: 52345 (2%), 18: 49019 (2%), 24: 45406 (1%), 17: 42337 (1%), 5: 40715 (1%), 11: 32937 (1%), 42: 32500 (1%), 15: 26911 (1%), 20: 22575 (0%), 19: 17438 (0%), 23: 3653 (0%), 14: 3079 (0%), 25: 3024 (0%), 28: 1969 (0%), 29: 1561 (0%), 43: 1399 (0%), 22: 879 (0%), 36: 630 (0%), 44: 566 (0%), 26: 553 (0%), 37: 514 (0%), 32: 513 (0%), 31: 418 (0%), 30: 277 (0%), 27: 259 (0%), 34: 154 (0%), 38: 91 (0%), 41: 61 (0%), 35: 44 (0%), 39: 37 (0%), 33: 29 (0%), 45: 9 (0%), 40: 6 (0%), 46: 1 (0%)
42类，196615 (8%)
4: 60630 (30%), 7: 16615 (8%), 6: 15305 (7%), 3: 13153 (6%), 1: 12290 (6%), 0: 11332 (5%), 2: 10941 (5%), 8: 10926 (5%), 13: 10757 (5%), 9: 6266 (3%), 10: 4311 (2%), 17: 4245 (2%), 5: 3932 (1%), 11: 3348 (1%), 12: 2755 (1%), 15: 2515 (1%), 20: 2047 (1%), 21: 1313 (0

In [6]:
import dgl 

# Rebuild the graph as a DGL graph from the (source row, destination row)
# pair, keeping the node count explicit. Note that this rebinds `g`, which
# until now referred to the PyG graph object.
g = dgl.graph(edge_index, num_nodes=num_nodes)

# Attach the per-node tensors under the keys shown in the cell output.
for field_name, field_tensor in {
    'feat': feat,
    'label': label,
    'train_mask': train_mask,
    'val_mask': val_mask,
    'test_mask': test_mask,
}.items():
    g.ndata[field_name] = field_tensor

g

Graph(num_nodes=2449029, num_edges=123718280,
      ndata_schemes={'feat': Scheme(shape=(100,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [7]:
import os
import pickle

# Make the graph bidirected, carrying the node data over to the new graph.
g = dgl.to_bidirected(g, copy_ndata=True)

# Remove existing self-loops before adding them back, so a node that already
# had one does not end up with a duplicate.
g = dgl.remove_self_loop(g)
g = dgl.add_self_loop(g)

processed_path = '/Dataset/OGB/ogbn-products/Processed/ogbn-products.dglg.pkl'

# Create the output directory up front: without it, open() raises
# FileNotFoundError and the graph transformation above has to be redone.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# NOTE(review): the graph is stored with pickle, so the file must only be
# loaded from a trusted location (unpickling can execute arbitrary code).
with open(processed_path, 'wb') as fp:
    pickle.dump(g, fp)

g

Graph(num_nodes=2449029, num_edges=126167053,
      ndata_schemes={'feat': Scheme(shape=(100,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})