In [1]:
# coding: utf-8
import pandas as pd
import json
import traceback

In [2]:
# Load the keyword -> weight tables used to score each paper's fields of study.
# Decoding is done by open(): json.load() no longer accepts an `encoding`
# argument (it was removed in Python 3.9 and raises TypeError there).
with open('./data/computer_science_keywords.json', 'r', encoding='utf-8') as f:
    cs_keywords = json.load(f)
    keywords_list = list(cs_keywords.keys())
    # Classification threshold: mean weight, taken BEFORE the boost below.
    cs_avg = sum(cs_keywords.values()) / float(len(keywords_list))
    # Square the weight of the discipline's own name.
    cs_keywords['computer science'] **= 2
with open('./data/physics_keywords.json', 'r', encoding='utf-8') as f:
    physics_keywords = json.load(f)
    # Average over the physics table's own size; the previous code divided by
    # the length of the computer-science keyword list.
    physics_avg = sum(physics_keywords.values()) / float(len(physics_keywords))
    physics_keywords['physics'] **= 2

In [3]:
# Sanity check of the CS table: the average-weight threshold and the largest weight.
print(cs_avg, max(cs_keywords[word] for word in cs_keywords))

-755.7698440314393 389453379844.0


In [51]:
# Maps a paper id to its reference list; filled by the parsing loop for papers
# classified as Computer Science or Physics.
cp_ref = dict()

In [52]:
# Convert each raw MAG dump (one JSON object per line) into a per-file CSV and
# record in cp_ref the reference lists of papers classified as Computer Science
# or Physics.
for file in range(140, 160):
    print('Read file', file)

    new_data = {}
    # Stream the dump line by line instead of readlines(), so the whole file is
    # never held in memory. Decode explicitly as UTF-8: the exported columns
    # keep non-ASCII text verbatim (ensure_ascii=False).
    with open('./data/mag_papers_%d.txt' % file, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            obj = json.loads(line)

            # Records missing a mandatory attribute are skipped.
            if 'authors' not in obj or 'title' not in obj or 'year' not in obj:
                continue
            # Rows are only stored for records that list fields of study, so
            # bail out early instead of building values that are thrown away.
            if 'fos' not in obj:
                continue

            # Authors list, serialised as a JSON array of names.
            authors = json.dumps([author['name'] for author in obj['authors']],
                                 ensure_ascii=False)

            # Citation count defaults to 0 when absent.
            n_citation = obj.get('n_citation', 0)

            # References list, serialised as JSON ('[]' when absent).
            if 'references' in obj:
                references = json.dumps(obj['references'], ensure_ascii=False)
            else:
                references = '[]'

            # Venue: prefer 'venue', fall back to 'publisher', else empty.
            venue = obj.get('venue') or obj.get('publisher') or ''

            # Fields of study, stored lower-cased as a JSON array.
            fos = json.dumps(obj['fos'], ensure_ascii=False).lower()

            # Score the paper against both keyword tables; unknown keywords
            # contribute 0. Accumulated with an explicit loop to keep the
            # original left-to-right addition order.
            cs_score = 0
            physics_score = 0
            for word in obj['fos']:
                lword = word.lower()
                cs_score += cs_keywords.get(lword, 0)
                physics_score += physics_keywords.get(lword, 0)

            # NOTE(review): the cs_avg printed earlier in this notebook is
            # negative, so a paper whose keywords match nothing (score 0)
            # passes the Computer Science test - confirm this is intended.
            if cs_score > cs_avg:
                field = 'Computer Science'
            elif physics_score > physics_avg:
                field = 'Physics'
            else:
                field = 'N/A'
            is_found = field != 'N/A'

            # Key order fixes the column order of the CSV.
            new_data[obj['id']] = {'authors': authors, 'n_citation': n_citation,
                                   'references': references, 'title': title if False else obj['title'],
                                   'venue': venue, 'year': obj['year'], 'fos': fos,
                                   'field': field}
            if is_found and 'references' in obj:
                cp_ref[obj['id']] = obj['references']

    print(len(new_data))

    # Name the index 'id' directly instead of reset_index/rename/set_index.
    df = pd.DataFrame.from_dict(new_data, orient='index').rename_axis('id')
    df.to_csv('./data/mag_%d.csv' % file, encoding='utf-8')

Read file 140
662311
Read file 141
661314
Read file 142
661729
Read file 143
661509
Read file 144
661398
Read file 145
659741
Read file 146
661275
Read file 147
661673
Read file 148
660863
Read file 149
661513
Read file 150
661609
Read file 151
661286
Read file 152
660812
Read file 153
661830
Read file 154
662077
Read file 155
661524
Read file 156
661035
Read file 157
661390
Read file 158
660997
Read file 159
661308


In [41]:
# Persist the id -> references map collected by the parsing loop.
# The file is written as UTF-8 explicitly: ensure_ascii=False emits raw
# characters, and the locale default encoding is platform dependent.
with open('./data/cp_ref_7.json', 'w', encoding='utf-8') as f:
    json.dump(cp_ref, f, ensure_ascii=False)

In [53]:
# Start the combined frame from the first per-file CSV, indexed by id.
df = (
    pd.read_csv('./data/mag_140.csv')
    .set_index('id')
)

In [54]:
# Append the remaining per-file CSVs to df. The frames are collected first and
# concatenated once: calling pd.concat inside the loop copies the ever-growing
# frame on every iteration.
frames = [df]
for file in range(141, 160):
    print(file)
    frames.append(pd.read_csv('./data/mag_%d.csv' % file).set_index('id'))
df = pd.concat(frames)
# Drop the list so the individual chunks can be garbage collected.
del frames

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159


In [55]:
# Persist the concatenated frame as one refined CSV.
df.to_csv(path_or_buf='./data/mag_refined7.csv', encoding='utf-8')

#### Remove unnecessary ids

In [3]:
# Collect every id that must be kept: each key of the cp_ref_*.json maps plus
# every id listed in its reference list. A dict with True values is used
# (rather than a set) so the result can be dumped to JSON as-is.
nece_id = {}
for file in range(0, 9):
    print('file', file)
    # Decode via open(): json.load() rejects the `encoding` keyword on
    # Python 3.9+.
    with open('./data/cp_ref_%d.json' % file, 'r', encoding='utf-8') as f:
        obj = json.load(f)
    for iid, ref_ids in obj.items():
        nece_id[iid] = True
        for rid in ref_ids:
            nece_id[rid] = True

file 0
file 1
file 2
file 3
file 4
file 5
file 6
file 7
file 8


# Cache the collected ids on disk. The file is written as UTF-8 explicitly,
# because ensure_ascii=False emits raw characters and the locale default
# encoding is platform dependent.
with open('./data/all_id.json', 'w', encoding='utf-8') as f:
    json.dump(nece_id, f, ensure_ascii=False)

In [3]:
# Reload the cached id map. Decoding is done by open(): json.load() rejects the
# `encoding` keyword on Python 3.9+.
with open('./data/all_id.json', 'r', encoding='utf-8') as f:
    nece_id = json.load(f)

In [4]:
# For each refined CSV keep (a) every Computer Science / Physics row and
# (b) rows of any other field whose id is in nece_id; the reference lists of
# (b) are trimmed to ids that are also in nece_id.
for file in range(6, 9):
    # Release the previous iteration's large objects before loading the next file.
    df = cp_df = na_df = na_dict = cp_ref_dict = cp_ref_df = new_df = None
    print('Read', file)
    df = pd.read_csv('./data/mag_refined%d.csv' % file, encoding='utf-8')
    df = df.set_index('id').fillna('N/A')
    print('Read',file,'finished')

    # Split on the classified field (no NaN remains after fillna above).
    is_cp = df['field'].isin(['Computer Science', 'Physics'])
    cp_df = df.loc[is_cp]
    na_df = df.loc[~is_cp]
    na_dict = na_df.to_dict('index')
    print('Convert to dict')

    cp_ref_dict = {}
    for idx, (iid, row) in enumerate(na_dict.items()):
        # Progress marker every million rows.
        if idx % 1000000 == 0:
            print(idx)
        if iid not in nece_id:
            continue
        kept_refs = [ref for ref in json.loads(row['references']) if ref in nece_id]
        row['references'] = json.dumps(kept_refs, ensure_ascii=False)
        cp_ref_dict[iid] = row

    print('Convert to dataframe')
    cp_ref_df = pd.DataFrame.from_dict(cp_ref_dict, orient='index').rename_axis('id')
    print('Concat')
    new_df = pd.concat([cp_df, cp_ref_df], sort=True)
    print('length:',len(new_df))
    new_df.to_csv('./data/mag_reduced_%d.csv' % file, encoding='utf-8')
    print('To csv finished')

Read 6
Read 6 finished
Convert to dict
0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
Convert to dataframe
Concat
length: 3782267
To csv finished
Read 7
Read 7 finished
Convert to dict
0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
Convert to dataframe
Concat
length: 3782513
To csv finished
Read 8
Read 8 finished
Convert to dict
0
1000000
2000000
3000000
Convert to dataframe
Concat
length: 1171350
To csv finished


In [2]:
# Reload the cached id map. Decoding is done by open(): json.load() rejects the
# `encoding` keyword on Python 3.9+.
with open('./data/all_id.json', 'r', encoding='utf-8') as f:
    nece_id = json.load(f)

In [3]:
# Reference reduction: keep only rows whose id is in nece_id and trim each
# kept row's reference list to ids that are also in nece_id.
for file in range(6, 9):
    print('Read', file)
    df = pd.read_csv(
        './data/mag_reduced_%d.csv' % file,
        encoding='utf-8',
        dtype={'field': str},
    ).set_index('id')
    print('To dict')
    dic = df.to_dict('index')
    df = None  # release the frame; only the dict is used from here on
    print('Check references')
    new_dic = {}
    for idx, (iid, row) in enumerate(dic.items()):
        # Progress marker every million rows.
        if idx % 1000000 == 0:
            print(idx)
        # NOTE(review): this membership test drops ANY row whose id is absent
        # from nece_id, whatever its field - confirm that is intended.
        if iid in nece_id:
            kept = [ref for ref in json.loads(row['references']) if ref in nece_id]
            row['references'] = json.dumps(kept)
            new_dic[iid] = row
    print('Convert to dataframe')
    cp_ref_df = pd.DataFrame.from_dict(new_dic, orient='index').rename_axis('id')
    print('length:',len(cp_ref_df), 'Prev length', len(dic))
    cp_ref_df.to_csv('./data/mag_new_reduced_%d.csv' % file, encoding='utf-8')
    print('To csv finished')

Read 6
To dict
Check references
0
1000000
2000000
3000000
Convert to dataframe
length: 2852357 Prev length 3782267
To csv finished
Read 7
To dict
Check references
0
1000000
2000000
3000000
Convert to dataframe
length: 2854833 Prev length 3782513
To csv finished
Read 8
To dict
Check references
0
1000000
Convert to dataframe
length: 884396 Prev length 1171350
To csv finished


In [4]:
# Preview the first five rows of the last reduced frame.
cp_ref_df.head(n=5)

Unnamed: 0_level_0,authors,field,fos,n_citation,references,title,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f6559ed0-c806-4662-856e-b29232a911bb,"[""Claudia Menghi"", ""Claudia Gatta"", ""Alberto V...",Computer Science,"[""operating system"", ""rev""]",50,"[""8c0c3545-c16e-497e-87ad-570d6e97d43f""]",Parasitosis adquirida por consumo de sushi,Revista Argentina De Microbiologia,2007
f6559f41-9737-4ee0-9f1e-ace9e11c2153,"[""E. M. Egorova"", ""A. A. Revina""]",,"[""stereochemistry"", ""chemistry"", ""analytical c...",50,"[""af4e3a2a-0f38-4ae7-87e4-0ab4347cf39e""]",Optical Properties and Sizes of Silver Nanopar...,Colloid Journal,2002
f6559faa-b8f4-41dc-8982-576ffd2a7388,"[""Marco Chacin"", ""Kazuya Yoshida""]",,"[""control engineering"", ""mobile robot"", ""compu...",50,"[""0570b47a-a732-425b-901d-00e2571660dc"", ""0790...",MULTI-LIMBED ROVER FOR ASTEROID SURFACE EXPLOR...,,2005
f655a070-0da7-46d6-b95b-6b6c9d20f265,"[""Alejandro Cáceres"", ""Suzanne S. Sindi"", ""Ben...",,"[""genome-wide association study"", ""linkage dis...",50,"[""0219395f-e9af-404b-80d3-fcb9f5b43f46"", ""08ab...",Identification of polymorphic inversions from ...,BMC Bioinformatics,2012
f655a0bb-ebcf-4d7c-8020-ab1d7b96ab75,"[""Sorin Tunaru"", ""Jukka Kero"", ""Annette Schaub...",,"[""biology"", ""endocrinology"", ""biochemistry"", ""...",622,"[""06d26ce5-40a0-42c8-8257-8367c86c4c1b"", ""0933...",PUMA-G and HM74 are receptors for nicotinic ac...,Nature Medicine,2003


In [5]:
# Consistency check: ids kept in new_dic that are not present in nece_id.
nonexisted_id = [iid for iid in new_dic if iid not in nece_id]

In [6]:
# Number of ids collected in nonexisted_id.
len(nonexisted_id)

0