# 1 论文附加领域信息

## 1.1 读取论文-领域映射表

In [2]:
from tqdm import tqdm 
from typing import Iterator, Any, Optional 
import json 
import pickle 

In [3]:
# In-memory lookup from a paper id to the list of field ids stored for it.
paper_field_map: dict[int, list[int]] = {}

with open('./output/paper_field_mapping.tsv', 'r') as fp:
    # Every row must hold exactly two tab-separated cells: a paper id and a
    # comma-separated list of field ids. int() ignores surrounding
    # whitespace, so the trailing newline on the last id needs no stripping.
    for row in tqdm(fp, total=2_0956_8870):
        raw_paper_id, raw_field_ids = row.split('\t')
        paper_key = int(raw_paper_id)
        # A paper id that appears on several rows keeps the last row's list.
        paper_field_map[paper_key] = list(map(int, raw_field_ids.split(',')))

100%|██████████| 209568870/209568870 [14:50<00:00, 235321.05it/s] 


## 1.2 关联论文和领域，并落盘

In [4]:
def parse_int(obj) -> Optional[int]:
    """Convert ``obj`` to an ``int``, falling back to ``None`` on failure.

    Args:
        obj: Any value to hand to ``int()`` (the callers in this notebook
            pass raw text cells).

    Returns:
        ``int(obj)`` when the conversion succeeds; ``None`` when ``int()``
        raises any ``Exception`` (for example on empty or non-numeric text).
    """
    try:
        parsed = int(obj)
    except Exception:
        parsed = None
    return parsed
    

# Output key and converter for each tab-separated cell of a Papers.txt row,
# listed in cell order (cell 0 first). `int` is strict and raises on a bad
# id; `parse_int` yields None for cells that do not parse; `str.strip` keeps
# the text with surrounding whitespace (including the row's newline) removed.
_PAPER_COLUMNS = (
    ('id', int),
    ('rank', parse_int),
    ('doi', str.strip),
    ('doc_type', str.strip),
    ('title', str.strip),
    ('original_title', str.strip),
    ('book_title', str.strip),
    ('year', parse_int),
    ('date', str.strip),
    ('online_date', str.strip),
    ('publisher', str.strip),
    ('journal_id', parse_int),
    ('conference_id', parse_int),
    ('conference_instance_id', parse_int),
    ('volume', str.strip),
    ('issue', str.strip),
    ('first_page', str.strip),
    ('last_page', str.strip),
    ('reference_count', parse_int),
    ('citation_count', parse_int),
    ('estimated_citation', parse_int),
    ('original_venue', str.strip),
    ('family_id', parse_int),
    ('family_rank', parse_int),
    ('doc_sub_types', str.strip),
    ('created_date', str.strip),
)

# The output file is opened (and truncated) first, then the source table.
with open('./output/paper_with_field.json', 'w', encoding='utf-8') as w, \
        open('/home/Dataset/MAG/mag_20211108/mag/Papers.txt', 'r', encoding='utf-8') as r:
    for row in tqdm(r, total=2_6945_1039):
        cells = row.split('\t')

        # Index explicitly (rather than zip) so that a row with too few
        # cells raises IndexError instead of producing a partial record.
        paper = {
            key: convert(cells[position])
            for position, (key, convert) in enumerate(_PAPER_COLUMNS)
        }

        # Papers missing from the mapping get an empty field list.
        paper['field_ids'] = paper_field_map.get(paper['id'], [])

        # One JSON document per output line; non-ASCII text is kept verbatim.
        w.write(json.dumps(paper, ensure_ascii=False) + '\n')

100%|██████████| 269451039/269451039 [2:24:03<00:00, 31173.20it/s]  


## 1.3 释放内存

In [5]:
# Unbind the paper -> field mapping; once no other reference to the dict
# remains, its memory can be reclaimed.
del paper_field_map

# 2 论文附加引用关系

## 2.1 读取论文引用关系映射表

In [3]:
def _fill_id_list_map(target: dict[int, list[int]], path: str, total: int) -> None:
    """Populate ``target`` from a TSV file of ``<id>\\t<id>,<id>,...`` rows.

    Args:
        target: Dict filled in place, so rows read before a failure stay
            visible to the caller.
        path: Location of the two-column, tab-separated file.
        total: Expected row count, used only for the progress bar.

    Raises:
        ValueError: If a row does not have exactly two tab-separated cells,
            or if any id is not an integer.
    """
    with open(path, 'r') as fp:
        for row in tqdm(fp, total=total):
            raw_key, raw_ids = row.split('\t')
            key = int(raw_key)
            # A key that appears on several rows keeps the last row's list.
            target[key] = [int(token) for token in raw_ids.split(',')]


# NOTE(review): presumably maps a paper to the papers it cites, and the map
# below to the papers citing it -- confirm against whatever produced the files.
paper_cite_map: dict[int, list[int]] = {}
_fill_id_list_map(paper_cite_map, './output/paper_cite_mapping.tsv', 8988_7095)

paper_cited_map: dict[int, list[int]] = {}
_fill_id_list_map(paper_cited_map, './output/paper_cited_mapping.tsv', 1_0365_6193)

100%|██████████| 89887095/89887095 [13:03<00:00, 114729.81it/s] 
100%|██████████| 103656193/103656193 [17:41<00:00, 97641.03it/s] 


## 2.2 关联论文和引用关系，并落盘

In [4]:
# Stream the JSON-lines file, attach both citation lists to every record and
# write the enriched record out. The output file is opened first.
with open('./output/paper_with_field_citation.json', 'w', encoding='utf-8') as w, \
        open('./output/paper_with_field.json', 'r', encoding='utf-8') as r:
    for record_line in tqdm(r, total=2_6945_1039):
        paper = json.loads(record_line)
        key = paper['id']

        # Keyword order fixes the order in which the two keys are added;
        # papers missing from a map get an empty list.
        paper.update(
            cite_paper_ids=paper_cite_map.get(key, []),
            cited_paper_ids=paper_cited_map.get(key, []),
        )

        # One JSON document per output line; non-ASCII text is kept verbatim.
        w.write(json.dumps(paper, ensure_ascii=False) + '\n')

100%|██████████| 269451039/269451039 [3:08:59<00:00, 23761.58it/s]  


## 2.3 释放内存

In [None]:
# Unbind both citation mappings; once no other reference to the dicts
# remains, their memory can be reclaimed.
del paper_cite_map
del paper_cited_map

# 3 论文附加作者信息

## 3.1 读取论文-作者映射表

In [3]:
# In-memory lookup from a paper id to the decoded JSON author entries.
paper_author_map: dict[int, list[list]] = {}

with open('./output/paper_author_mapping.tsv', 'r') as fp:
    # Row layout: "<paper_id>\t<JSON document>". tqdm gets no total here, so
    # the bar only shows a running count.
    for row in tqdm(fp):
        raw_paper_id, authors_json = row.split('\t')
        paper_key = int(raw_paper_id)
        # A paper id that appears on several rows keeps the last row's value.
        paper_author_map[paper_key] = json.loads(authors_json)

269412163it [27:09, 165334.28it/s]


## 3.2 关联论文和作者，并落盘

In [4]:
# Stream the JSON-lines file, attach the author list to every record and
# write the enriched record out. The output file is opened first.
with open('./output/paper_with_field_citation_author.json', 'w', encoding='utf-8') as w, \
        open('./output/paper_with_field_citation.json', 'r', encoding='utf-8') as r:
    for record_line in tqdm(r, total=2_6945_1039):
        paper = json.loads(record_line)

        # Turn each positional entry into a keyed record. Positions are
        # indexed explicitly so that an entry with fewer than three items
        # raises instead of being silently shortened. Papers missing from
        # the mapping get an empty list.
        paper['author_list'] = [
            {'author_id': entry[0], 'affiliation_id': entry[1], 'author_seq': entry[2]}
            for entry in paper_author_map.get(paper['id'], [])
        ]

        # One JSON document per output line; non-ASCII text is kept verbatim.
        w.write(json.dumps(paper, ensure_ascii=False) + '\n')

100%|██████████| 269451039/269451039 [3:43:38<00:00, 20080.19it/s]  


## 3.3 释放内存

In [None]:
# Unbind the paper -> author mapping; once no other reference to the dict
# remains, its memory can be reclaimed.
del paper_author_map