In [6]:
import csv 
import os 
import json 
import numpy as np 
import traceback 
from tqdm import tqdm 

# Parse the ACM node table (tab-separated columns: nid, name, node_type, feat)
# and split the nodes by type. Every global node id is recorded in `nid_map` as
# (type name, index of the node inside its own type), which is what the later
# cells use to translate the link/label files into per-type indices.
with open(os.path.expanduser('~/dataset/HGB/raw/ACM/node.dat'), 'r', encoding='utf-8') as fp:
    reader = csv.DictReader(fp, fieldnames=['nid', 'name', 'node_type', 'feat'], delimiter='\t')

    author_feat_list = []
    paper_feat_list = []
    subject_feat_list = []
    term_feat_list = []

    nid_map: dict[int, tuple[str, int]] = dict()

    # node_type code -> (type name, feature list that nodes of this type append to)
    feat_list_by_ntype = {
        0: ('paper', paper_feat_list),
        1: ('author', author_feat_list),
        2: ('subject', subject_feat_list),
        3: ('term', term_feat_list),
    }

    for row in tqdm(reader):
        nid = int(row['nid'])
        ntype = int(row['node_type'])

        # csv.DictReader fills a column that is absent from the row with None;
        # such a node simply has no feature vector. Anything that IS present
        # must parse, otherwise a corrupt row would silently look "featureless".
        feat_str = row['feat']
        if feat_str is None:
            feat = None
        else:
            try:
                feat = np.array(json.loads(f"[{feat_str.strip()}]"), dtype=np.float32)
            except ValueError as exc:
                raise ValueError(
                    f"malformed feature vector for node {nid} (node_type={ntype})"
                ) from exc

        if ntype not in feat_list_by_ntype:
            raise AssertionError(f"unexpected node_type {ntype} for node {nid}")

        type_name, type_feat_list = feat_list_by_ntype[ntype]
        # The index must be taken before the append so that it points at this node.
        nid_map[nid] = (type_name, len(type_feat_list))
        type_feat_list.append(feat)
            
# Collapse each per-type list of feature vectors into one array (one row per node).
author_feat, paper_feat, subject_feat = (
    np.stack(feat_list)
    for feat_list in (author_feat_list, paper_feat_list, subject_feat_list)
)

# Term nodes are expected to carry no features at all.
assert not any(term_feat is not None for term_feat in term_feat_list)

# Cell output: feature-matrix shapes plus the number of term nodes.
(
    paper_feat.shape,
    author_feat.shape,
    subject_feat.shape,
    len(term_feat_list),
)

10942it [00:02, 5226.95it/s]


((3025, 1902), (5959, 1902), (56, 1902), 1902)

In [7]:
# Read the ACM link table (tab-separated columns: src_nid, dest_nid, etype, score)
# and bucket the edges by relation, translating global node ids into per-type
# indices through `nid_map`.
with open(os.path.expanduser('~/dataset/HGB/raw/ACM/link.dat'), 'r', encoding='utf-8') as fp:
    reader = csv.DictReader(fp, fieldnames=['src_nid', 'dest_nid', 'etype', 'score'], delimiter='\t')

    paper_paper_edge_list = []
    rev_paper_paper_edge_list = []
    paper_author_edge_list = []
    author_paper_edge_list = []
    paper_subject_edge_list = []
    subject_paper_edge_list = []
    paper_term_edge_list = []
    term_paper_edge_list = []

    # etype code -> (required source type, required destination type, edge list to fill)
    edge_spec_by_etype = {
        0: ('paper', 'paper', paper_paper_edge_list),
        1: ('paper', 'paper', rev_paper_paper_edge_list),
        2: ('paper', 'author', paper_author_edge_list),
        3: ('author', 'paper', author_paper_edge_list),
        4: ('paper', 'subject', paper_subject_edge_list),
        5: ('subject', 'paper', subject_paper_edge_list),
        6: ('paper', 'term', paper_term_edge_list),
        7: ('term', 'paper', term_paper_edge_list),
    }

    for row in tqdm(reader):
        src_nid, dest_nid, etype = int(row['src_nid']), int(row['dest_nid']), int(row['etype'])
        score = float(row['score'])
        assert score == 1.

        # An unknown relation code is rejected before any node lookup happens.
        edge_spec = edge_spec_by_etype.get(etype)
        if edge_spec is None:
            raise AssertionError

        want_src_type, want_dest_type, edge_list = edge_spec
        assert nid_map[src_nid][0] == want_src_type and nid_map[dest_nid][0] == want_dest_type
        edge_list.append((nid_map[src_nid][1], nid_map[dest_nid][1]))

# Convert every list of (src, dest) pairs into an int64 array and transpose it,
# so that row 0 holds the sources and row 1 the destinations.
(
    paper_paper_edge_index,
    rev_paper_paper_edge_index,
    paper_author_edge_index,
    author_paper_edge_index,
    paper_subject_edge_index,
    subject_paper_edge_index,
    paper_term_edge_index,
    term_paper_edge_index,
) = (
    np.array(edge_list, dtype=np.int64).T
    for edge_list in (
        paper_paper_edge_list,
        rev_paper_paper_edge_list,
        paper_author_edge_list,
        author_paper_edge_list,
        paper_subject_edge_list,
        subject_paper_edge_list,
        paper_term_edge_list,
        term_paper_edge_list,
    )
)

# Cell output: the shape of each edge-index array.
(
    paper_paper_edge_index.shape,
    rev_paper_paper_edge_index.shape,
    paper_author_edge_index.shape,
    author_paper_edge_index.shape,
    paper_subject_edge_index.shape,
    subject_paper_edge_index.shape,
    paper_term_edge_index.shape,
    term_paper_edge_index.shape,
)

547872it [00:01, 336784.17it/s]


((2, 5343),
 (2, 5343),
 (2, 9949),
 (2, 9949),
 (2, 3025),
 (2, 3025),
 (2, 255619),
 (2, 255619))

In [8]:
# Read the training labels (tab-separated columns: paper_nid, paper_name,
# node_type, paper_label). Every paper starts out unlabeled (-1); each row
# fills in one label and marks that paper as belonging to the training split.
with open(os.path.expanduser('~/dataset/HGB/raw/ACM/label.dat'), 'r', encoding='utf-8') as fp:
    reader = csv.DictReader(fp, fieldnames=['paper_nid', 'paper_name', 'node_type', 'paper_label'], delimiter='\t')

    paper_label_arr = np.full(len(paper_feat), -1, dtype=np.int64)
    paper_train_mask = np.zeros_like(paper_label_arr, dtype=bool)

    for row in tqdm(reader):
        paper_nid = int(row['paper_nid'])
        paper_name = row['paper_name'].strip()
        ntype = int(row['node_type'])
        assert ntype == 0
        paper_label = int(row['paper_label'])
        assert 0 <= paper_label <= 3

        node_kind, paper_idx = nid_map[paper_nid]
        assert node_kind == 'paper'
        # A paper must not receive a label twice.
        assert paper_label_arr[paper_idx] == -1
        paper_label_arr[paper_idx] = paper_label
        paper_train_mask[paper_idx] = True

# Cell output: label array shape, number of labeled papers, size of the training split.
(
    paper_label_arr.shape,
    (paper_label_arr > -1).sum(),
    paper_train_mask.sum(),
)

907it [00:00, 156032.72it/s]


((3025,), 907, 907)

In [9]:
# Read the test labels (same column layout as the training label file) and
# write them into the still-unlabeled (-1) slots of `paper_label_arr`.
# `paper_train_mask` is not touched here.
with open(os.path.expanduser('~/dataset/HGB/raw/ACM/label.dat.test'), 'r', encoding='utf-8') as fp:
    reader = csv.DictReader(fp, fieldnames=['paper_nid', 'paper_name', 'node_type', 'paper_label'], delimiter='\t')

    for row in tqdm(reader):
        paper_nid = int(row['paper_nid'])
        paper_name = row['paper_name'].strip()
        ntype = int(row['node_type'])
        assert ntype == 0
        paper_label = int(row['paper_label'])
        assert 0 <= paper_label <= 3

        node_kind, paper_idx = nid_map[paper_nid]
        assert node_kind == 'paper'
        # The slot must still be empty, i.e. the paper has no label yet.
        assert paper_label_arr[paper_idx] == -1
        paper_label_arr[paper_idx] = paper_label

# Cell output: label array shape and the number of papers that now have a label.
(
    paper_label_arr.shape,
    (paper_label_arr > -1).sum(),
)

2118it [00:00, 230615.40it/s]


((3025,), 3025)

In [10]:
import pickle 
import torch 

# Make the paper-paper relation symmetric: append every edge with its two rows
# (source / destination) swapped, then drop duplicate columns.
pp_forward = torch.tensor(paper_paper_edge_index, dtype=torch.int64)
pp_backward = pp_forward.flip([0])
PP_edge_index = torch.unique(torch.cat((pp_forward, pp_backward), dim=-1), dim=-1)

# Bundle node features, node counts, edge indices, paper labels and the
# train/test masks into one dict and pickle it.
hg_pkl_path = os.path.expanduser('~/dataset/HGB/processed/ACM_hg.dict.pkl')

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(hg_pkl_path), exist_ok=True)

with open(hg_pkl_path, 'wb') as fp:
    pickle.dump(
        dict(
            # No 'term' entry: only the number of term nodes is stored below.
            node_feat_dict = dict(
                author = torch.tensor(author_feat, dtype=torch.float32),
                paper = torch.tensor(paper_feat, dtype=torch.float32),
                subject = torch.tensor(subject_feat, dtype=torch.float32),
            ),
            num_nodes_dict = dict(
                author = len(author_feat),
                paper = len(paper_feat),
                subject = len(subject_feat),
                term = len(term_feat_list),
            ),
            # NOTE(review): rev_paper_paper_edge_index is not written; 'PP' is the
            # symmetrized paper_paper edge set -- confirm that is intended.
            edge_index_dict = {
                ('paper', 'PP', 'paper'): PP_edge_index,
                ('paper', 'PA', 'author'): torch.tensor(paper_author_edge_index, dtype=torch.int64),
                ('author', 'AP', 'paper'): torch.tensor(author_paper_edge_index, dtype=torch.int64),
                ('paper', 'PT', 'term'): torch.tensor(paper_term_edge_index, dtype=torch.int64),
                ('term', 'TP', 'paper'): torch.tensor(term_paper_edge_index, dtype=torch.int64),
                ('paper', 'PS', 'subject'): torch.tensor(paper_subject_edge_index, dtype=torch.int64),
                ('subject', 'SP', 'paper'): torch.tensor(subject_paper_edge_index, dtype=torch.int64),
            },
            paper_label = torch.tensor(paper_label_arr, dtype=torch.int64),
            paper_train_mask = torch.tensor(paper_train_mask, dtype=torch.bool),
            # The test split is the complement of the training split.
            paper_test_mask = torch.tensor(~paper_train_mask, dtype=torch.bool),
        ),
        fp,
    )