# 1 定义实体

In [99]:
from dataclasses import dataclass 
from typing import Any, Optional

# Shorthand aliases for nullable scalar types; the entity dataclasses in this
# notebook use them as field annotations so every field can default to None.
Str = Optional[str]
Int = Optional[int]
# NOTE(review): Float is not referenced in the visible part of the notebook.
Float = Optional[float]

In [100]:
from dataclasses import dataclass 


@dataclass
class Affiliation:
    """An institution record; every field is optional and defaults to None.

    The affiliation loading cell fills mag_id, rank, name, paper_count and
    citation_count from one tab-separated row of Affiliations.txt.
    """

    # Identifiers. zhitu_id is not set by the visible loading code.
    zhitu_id: Int = None
    mag_id: Int = None  # integer id from column 0 of the row; also the key in affiliation_index

    rank: Int = None  # column 1 of the row, parsed as int
    name: Str = None  # column 3 of the row, whitespace-stripped
    paper_count: Int = None  # column 7 of the row, parsed as int
    citation_count: Int = None  # column 9 of the row, parsed as int


@dataclass
class Author:
    """An author record; every field is optional and defaults to None.

    No code in the visible part of the notebook constructs an Author, so the
    source of each field's value cannot be stated here.
    """

    # Identifiers.
    zhitu_id: Int = None
    mag_id: Int = None

    rank: Int = None
    name: Str = None
    affiliation: Optional[Affiliation] = None  # nested Affiliation object rather than a bare id
    paper_count: Int = None
    citation_count: Int = None
    
    
@dataclass
class Field:
    """A field-of-study record; every field is optional and defaults to None.

    The field loading cell fills mag_id, rank, name, level, paper_count and
    citation_count from one tab-separated row of FieldsOfStudy.txt.
    """

    # Identifiers. zhitu_id is not set by the visible loading code.
    zhitu_id: Int = None
    mag_id: Int = None  # integer id from column 0 of the row; also the key in field_index

    rank: Int = None  # column 1 of the row, parsed as int
    name: Str = None  # column 3 of the row, whitespace-stripped
    level: Int = None  # column 5 of the row, parsed as int
    paper_count: Int = None  # column 6 of the row, parsed as int
    citation_count: Int = None  # column 8 of the row, parsed as int
    
    
@dataclass
class Journal:
    """A journal record; every field is optional and defaults to None.

    The journal loading cell fills all fields from one tab-separated row of
    Journals.txt. A later cell overwrites rank with the journal's 1-based
    position after sorting all journals by their loaded rank.
    """

    mag_id: Int = None  # integer id from column 0 of the row; also the key in journal_index

    rank: Int = None  # column 1 of the row at load time; replaced by the re-ranking cell
    name: Str = None  # column 3 of the row, whitespace-stripped
    paper_count: Int = None  # column 7 of the row, parsed as int
    citation_count: Int = None  # column 9 of the row, parsed as int
    
    
@dataclass
class Conference:
    """A conference-series record; every field is optional and defaults to None.

    The conference loading cell fills all fields from one tab-separated row of
    ConferenceSeries.txt. A later cell overwrites rank with the conference's
    1-based position after sorting all conferences by their loaded rank.
    """

    mag_id: Int = None  # integer id from column 0 of the row; also the key in conference_index

    rank: Int = None  # column 1 of the row at load time; replaced by the re-ranking cell
    name: Str = None  # column 3 of the row, whitespace-stripped
    abbr: Str = None  # column 2 of the row, whitespace-stripped (e.g. 'APBC' in the notebook output)
    paper_count: Int = None  # column 4 of the row, parsed as int
    citation_count: Int = None  # column 6 of the row, parsed as int


@dataclass
class Paper:
    """A publication record; every field is optional and defaults to None.

    No code in the visible part of the notebook constructs a Paper, so the
    source of each field's value cannot be stated here.
    """

    # Identifiers and links to other entities.
    zhitu_id: Int = None
    mag_id: Int = None
    author_list: Optional[list[Author]] = None
    # NOTE(review): Author is declared with a plain @dataclass (eq=True, not
    # frozen), which sets __hash__ to None, so Author instances cannot actually
    # be used as dict keys as this annotation suggests -- confirm how
    # author_order is populated.
    author_order: Optional[dict[Author, int]] = None
    field_list: Optional[list[Field]] = None

    # Bibliographic attributes.
    rank: Int = None
    doi: Str = None
    doc_type: Str = None
    title: Str = None
    year: Int = None
    date: Str = None  # kept as a string, not a date object
    publisher: Str = None
    journal: Optional[Journal] = None  # nested Journal object rather than a bare id
    conference: Optional[Conference] = None  # nested Conference object rather than a bare id
    volume: Str = None
    issue: Str = None
    first_page: Str = None
    last_page: Str = None
    reference_count: Int = None
    citation_count: Int = None
    venue: Str = None

# 2 读入学者成果等数据

## 2.1 读入机构

In [101]:
from tqdm import tqdm 

# Every affiliation in the dump, keyed by the integer id in column 0.
affiliation_index: dict[int, Affiliation] = {}

with open('/home/Dataset/MAG/mag_20211108/mag/Affiliations.txt', 'r', encoding='utf-8') as fp:
    for line in tqdm(fp):
        # One tab-separated record per line; only the columns below are used.
        cols = line.split('\t')
        affiliation_id = int(cols[0])

        affiliation = Affiliation(
            mag_id=affiliation_id,
            rank=int(cols[1]),
            name=cols[3].strip(),
            paper_count=int(cols[7]),
            citation_count=int(cols[9]),
        )
        affiliation_index[affiliation_id] = affiliation

27063it [00:00, 297688.60it/s]


## 2.2 读入领域

In [102]:
# Every field of study in the dump, keyed by the integer id in column 0.
field_index: dict[int, Field] = {}

with open('/home/Dataset/MAG/mag_20211108/advanced/FieldsOfStudy.txt', 'r', encoding='utf-8') as fp:
    for line in tqdm(fp):
        # One tab-separated record per line; only the columns below are used.
        cols = line.split('\t')
        field_id = int(cols[0])

        field = Field(
            mag_id=field_id,
            rank=int(cols[1]),
            name=cols[3].strip(),
            level=int(cols[5]),
            paper_count=int(cols[6]),
            citation_count=int(cols[8]),
        )
        field_index[field_id] = field

714856it [00:02, 298850.64it/s]


## 2.3 读入期刊&会议

In [103]:
# Journals and conference series, each keyed by the integer id in column 0.
journal_index: dict[int, Journal] = {}
conference_index: dict[int, Conference] = {}

# Journals: one tab-separated record per line.
with open('/home/Dataset/MAG/mag_20211108/mag/Journals.txt', 'r', encoding='utf-8') as fp:
    for line in tqdm(fp):
        cols = line.split('\t')
        journal_id = int(cols[0])

        journal = Journal(
            mag_id=journal_id,
            rank=int(cols[1]),
            name=cols[3].strip(),
            paper_count=int(cols[7]),
            citation_count=int(cols[9]),
        )
        journal_index[journal_id] = journal

# Conference series: same layout idea, but the counts sit in columns 4 and 6
# and column 2 supplies the abbreviation.
with open('/home/Dataset/MAG/mag_20211108/mag/ConferenceSeries.txt', 'r', encoding='utf-8') as fp:
    for line in tqdm(fp):
        cols = line.split('\t')
        conference_id = int(cols[0])

        conference = Conference(
            mag_id=conference_id,
            rank=int(cols[1]),
            name=cols[3].strip(),
            abbr=cols[2].strip(),
            paper_count=int(cols[4]),
            citation_count=int(cols[6]),
        )
        conference_index[conference_id] = conference

49063it [00:00, 361422.22it/s]
4550it [00:00, 400631.54it/s]


## 2.4 期刊&会议重新排名

In [104]:
# Order conferences by their loaded rank (ties keep their current index order,
# since the sort is stable), then rebuild the index in that order.
conference_list = sorted(conference_index.values(), key=lambda x: x.rank)
conference_index.clear()

for rank, conference in enumerate(conference_list):
    # Replace the loaded rank with the dense 1-based position in the sorted list.
    conference.rank = rank + 1
    conference_index[int(conference.mag_id)] = conference

In [105]:
# Spot check: print every conference whose abbreviation is exactly 'APBC'.
for conference in conference_index.values():
    if conference.abbr != 'APBC':
        continue
    print(conference)

Conference(mag_id=1200494941, rank=2538, name='Asia-Pacific Bioinformatics Conference', abbr='APBC', paper_count=291, citation_count=2945)


重新排序后的部分会议排名：

* KDD(A): 24
* AAAI(A): 10
* IJCAI(A): 15 
* ICML(A): 7
* CVPR(A): 1
* CIKM(B): 62
* EMNLP(B): 35
* COLING(B): 70
* WSDM(B): 234
* SDM(B): 149
* SSTD(C): 2063
* ECIR(C): 552
* ACCV(C): 297
* ICTAI(C): 277
* APBC(C): 2538

In [106]:
# Order journals by their loaded rank (ties keep their current index order,
# since the sort is stable), then rebuild the index in that order.
journal_list = sorted(journal_index.values(), key=lambda x: x.rank)
journal_index.clear()

for rank, journal in enumerate(journal_list):
    # Replace the loaded rank with the dense 1-based position in the sorted list.
    journal.rank = rank + 1
    journal_index[int(journal.mag_id)] = journal

In [119]:
def normalize_str(s: str) -> str:
    """Return a lower-cased, punctuation-free, single-spaced form of ``s``.

    Every character for which ``str.isalnum()`` is true is kept and lower-cased;
    every other character is treated as a separator. Runs of separators
    collapse to one space and leading/trailing separators are dropped, so the
    result is suitable for loose substring matching of titles.

    Args:
        s: The text to normalize.

    Returns:
        The normalized text; an empty string when ``s`` contains no
        alphanumeric characters.
    """
    # Build the intermediate text with one join instead of repeated `+=`,
    # which can degrade to quadratic time on long inputs.
    spaced = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in s)

    # split() with no argument discards empty pieces, which both collapses
    # separator runs and trims the ends.
    return ' '.join(spaced.split())

# Spot check: print every journal whose normalized name contains the
# normalized query as a substring.
target = normalize_str('Knowledge-Based Systems')

for journal in journal_index.values():
    if target not in normalize_str(journal.name):
        continue
    print(journal)

Journal(mag_id=10169007, rank=2741, name='Knowledge Based Systems', paper_count=6269, citation_count=151681)
Journal(mag_id=69518169, rank=6461, name='International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems', paper_count=1447, citation_count=33603)


重新排序后的部分期刊排名：

* TKDE(A): 1125
* TODS(A): 2135
* TPAMI(A): 97 
* JMLR(A): 607 
* AI(A): 584
* TKDD(B): 11535
* DKE(B): 4790
* DMKD(B): 3870
* Neurocomputing(C): 1017
* Expert Systems(C): 10354
* KBS(C): 2741 