In [1]:
from torch_geometric.datasets import Planetoid

# Load the PubMed citation dataset through PyG's Planetoid loader, using
# the given directory as the dataset root.
dataset = Planetoid(name='PubMed', root='/Dataset/PyG/PubMed/Raw')

# Planetoid datasets contain a single graph. Index the dataset instead of
# reading `dataset.data`: the latter exposes the internal storage object and
# triggers a warning on recent PyG releases.
g = dataset[0]
g

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])

In [2]:
from collections import Counter 


def count_label(label, total):
    """Print the class distribution of ``label``.

    Two lines are written to stdout:

    1. the number of distinct classes, the number of samples, and that
       number as an integer percentage of ``total``;
    2. every class as ``"<class>: <count> (<percent>%)"``, most frequent
       first (classes with equal counts keep first-seen order), where the
       percentage is relative to the samples in ``label``.

    Percentages are truncated towards zero, not rounded, so they may not
    add up to 100.

    Args:
        label: 1-D collection of hashable class ids. Objects exposing
            ``tolist()`` (tensors, arrays) are converted through it; any
            other sized iterable is used directly.
        total: Denominator for the first-line percentage (e.g. the number of
            nodes in the whole graph). ``0`` is reported as 0% instead of
            raising ``ZeroDivisionError``.
    """
    values = label.tolist() if hasattr(label, "tolist") else list(label)
    n_samples = len(label)
    counts = Counter(values)

    # Guard the empty-graph case: there is nothing to take a share of.
    share_of_total = int(n_samples * 100 / total) if total else 0
    print(f"{len(counts)}类，{n_samples} ({share_of_total}%)")

    n_counted = sum(counts.values())
    # most_common() sorts by descending count with a stable sort, i.e. the
    # same order as sorting the items by negated count.
    print(', '.join(
        f"{lb}: {cnt} ({int(cnt * 100 / n_counted)}%)"
        for lb, cnt in counts.most_common()
    ))


# Pull the pieces needed by the following cells out of the PyG graph.
num_nodes = g.num_nodes
feat, label = g.x, g.y
# Iterating the edge index yields its rows; keep them together as a tuple.
edge_index = tuple(g.edge_index)
train_mask, val_mask, test_mask = g.train_mask, g.val_mask, g.test_mask

# Class distribution of the whole graph ...
count_label(label, num_nodes)
# ... and of the train / validation / test splits, each reported as a share
# of all nodes.
count_label(label[train_mask], num_nodes)
count_label(label[val_mask], num_nodes)
count_label(label[test_mask], num_nodes)

3类，19717 (100%)
2: 7875 (39%), 1: 7739 (39%), 0: 4103 (20%)
3类，60 (0%)
1: 20 (33%), 0: 20 (33%), 2: 20 (33%)
3类，500 (2%)
2: 208 (41%), 1: 194 (38%), 0: 98 (19%)
3类，1000 (5%)
1: 413 (41%), 2: 407 (40%), 0: 180 (18%)


In [3]:
import dgl 

# Rebuild the graph as a DGL graph from the edge tensors extracted earlier.
# NOTE: this rebinds `g`, which previously referred to the PyG data object.
g = dgl.graph(edge_index, num_nodes=num_nodes)

# Attach features, labels and split masks as node data in one go; the dict
# order determines the order of the node-data entries.
g.ndata.update({
    'feat': feat,
    'label': label,
    'train_mask': train_mask,
    'val_mask': val_mask,
    'test_mask': test_mask,
})

g

Graph(num_nodes=19717, num_edges=88648,
      ndata_schemes={'feat': Scheme(shape=(500,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [4]:
import pickle
from pathlib import Path

# Make the graph bidirected (node data is carried over), then drop any
# existing self-loops before adding them back so that re-running this cell
# does not accumulate duplicate self-loops.
g = dgl.to_bidirected(g, copy_ndata=True)
g = dgl.add_self_loop(dgl.remove_self_loop(g))

out_path = Path('/Dataset/PyG/PubMed/Processed/PubMed.dglg.pkl')
# The output directory is not created by anything else in this notebook;
# create it so the cell also works on a fresh machine.
out_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a trusted location.
with open(out_path, 'wb') as fp:
    pickle.dump(g, fp)

g

Graph(num_nodes=19717, num_edges=108365,
      ndata_schemes={'feat': Scheme(shape=(500,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})