In [4]:
import os
import csv
import json
import random
import pickle
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict

In [5]:
# data_root: directory for this project's inputs and derived files (see fpath).
# mag_dir: directory holding the raw MAG files that the cells below scan.
data_root = '/shared/0/datasets/mag/Promotion/'
mag_dir = '/shared/0/datasets/mag/raw_data/'

def fpath(filename):
    '''Return `filename` joined onto `data_root`.'''
    return os.path.join(data_root, filename)

def yield_one_line(filename, delimiter=',', quoting = csv.QUOTE_ALL, report_every=10000000):
    '''Generator that yields the rows of a delimited text file one at a time.

    Parameters:
        filename: path of the file to read.
        delimiter: field separator handed to csv.reader.
        quoting: csv quoting mode handed to csv.reader.
        report_every: print a progress message each time this many rows have
            been read; pass 0 or None to silence the messages.

    Yields:
        Each row as parsed by csv.reader (a list of fields), in file order.
    '''
    # newline='' lets the csv module do its own line-ending handling; without
    # it, universal-newline translation rewrites '\r\n' inside quoted fields.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=delimiter, quoting=quoting)
        for count, row in enumerate(reader, 1):
            if report_every and count % report_every == 0:
                print('processed %d lines...' % (count))
            yield row

In [8]:
# Load the DOI collection that MAG papers are matched against below.
# Presumably a set or dict (it is used for membership tests) -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(fpath('alt_dois.pickle'), 'rb') as ifile:
    dois = pickle.load(ifile)

In [9]:
# Number of DOIs loaded from alt_dois.pickle.
len(dois)

1218710

In [10]:
# Two-way lookup between MAG paper IDs and DOIs; both start empty and are
# filled while scanning Papers.csv.
pid_to_doi, doi_to_pid = {}, {}

In [11]:
# Scan every row of Papers.csv and keep the papers whose DOI is in `dois`.
# Column 0 is read as the paper ID and column 2 as the DOI.
for line in yield_one_line(mag_dir+'Papers.csv', delimiter=',', quoting=csv.QUOTE_ALL):
    pid = line[0]
    # DOIs are compared in lower case
    doi = line[2].lower()
    if doi not in dois:
        continue
    pid_to_doi[pid] = doi
    # when several papers carry the same DOI, the last one scanned is kept here
    doi_to_pid[doi] = pid

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...


MAG `pid` and `doi` are not uniquely matched: the same DOI can be attached to several MAG paper IDs.

In [12]:
# Number of MAG paper IDs whose DOI is in `dois`.
len(pid_to_doi)

1150268

In [13]:
# Number of distinct DOIs among the matched papers (smaller than the number of
# paper IDs whenever several papers share a DOI).
len(doi_to_pid)

1134395

In [14]:
# The ten DOIs attached to the largest number of MAG paper IDs, as (doi, count) pairs.
Counter(pid_to_doi.values()).most_common(10)

[('10.13039/501100003329', 1238),
 ('10.13039/501100007273', 190),
 ('10.4000/terrain.16638', 67),
 ('10.1126/science.361.6406.988-c', 62),
 ('10.13039/501100003176', 48),
 ('10.1002/ajpa.23489', 35),
 ('10.1002/pds.4629', 32),
 ('10.4172/0976-4860-c1-002', 30),
 ('10.21767/2573-4482-c1-002', 29),
 ('10.4172/2573-0347-c9-041', 28)]

In [16]:
# DOIs attached to exactly one MAG paper ID; DOIs shared by several papers are left out.
dois_valid = {
    doi
    for doi, cn in Counter(pid_to_doi.values()).items()
    if cn == 1
}

In [17]:
# Number of DOIs attached to exactly one MAG paper ID.
len(dois_valid)

1123240

In [18]:
# Drop, in place, every paper whose DOI is not uniquely matched ...
for pid, doi in list(pid_to_doi.items()):
    if doi not in dois_valid:
        pid_to_doi.pop(pid)

# ... and the corresponding DOIs from the reverse map.
for doi in [d for d in doi_to_pid if d not in dois_valid]:
    doi_to_pid.pop(doi)

In [19]:
# Paper IDs left after removing non-unique DOIs.
len(pid_to_doi)

1123240

In [20]:
# DOIs left after the same filtering; should equal the number of paper IDs.
len(doi_to_pid)

1123240

In [21]:
# Fraction of the input DOIs that ended up with exactly one MAG paper ID.
len(pid_to_doi) / len(dois)

0.9216630699674245

In [22]:
# Release the helper set; no later cell in this notebook reads it.
del dois_valid

Authors

In [24]:
# DOI -> list of (author ID, affiliation ID, sequence, name) tuples, filled
# from PaperAuthorAffiliations.txt.
dois_authors_mag = defaultdict(list)

In [25]:
# Attach to each matched DOI the author rows of its MAG paper.
for line in yield_one_line(mag_dir+'PaperAuthorAffiliations.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    # only the first five columns of each row are used
    pid, aid, aff_id, seq, name = line[:5]
    if pid not in pid_to_doi:
        continue
    doi = pid_to_doi[pid]
    dois_authors_mag[doi].append((aid, aff_id, seq, name))

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [26]:
# Number of matched DOIs with at least one author row.
len(dois_authors_mag)

1123240

In [27]:
# Share of matched DOIs for which author rows were found.
len(dois_authors_mag) / len(doi_to_pid)

1.0

In [28]:
# Write one JSON object per line (JSON Lines, despite the .json suffix):
# the DOI, its MAG paper ID and its author records.
with open(fpath('dois_authors_mag.json'), 'w') as ofile:
    for doi, authors in dois_authors_mag.items():
        record = {'doi': doi, 'mag_pid': doi_to_pid[doi], 'authors': authors}
        ofile.write(json.dumps(record) + '\n')

In [6]:
# Reload the DOI -> author records mapping from disk (one JSON object per line).
# After the JSON round trip each author record is a list, not a tuple.
dois_authors_mag = {}

with open(fpath('dois_authors_mag.json'), 'r') as ifile:
    for raw_line in ifile:
        record = json.loads(raw_line)
        dois_authors_mag[record['doi']] = record['authors']

In [7]:
# IDs of every author appearing on a matched paper. No filtering on `seq`
# (presumably the author position) is applied, so all co-authors are included.
all_aids = set()
for authors in dois_authors_mag.values():
    all_aids.update(aid for aid, aff_id, seq, name in authors)

In [8]:
# Number of distinct author IDs across the matched papers.
len(all_aids)

3335676

In [31]:
# Author ID -> (rank, paper count, citation count) as stored in Authors.txt
# (columns 1, 5 and 6; meanings taken from the original variable names).
aids_metric = {}

for line in yield_one_line(mag_dir+'Authors.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    aid = line[0]
    metric = (line[1], line[5], line[6])
    if aid in all_aids:
        aids_metric[aid] = metric

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...


In [32]:
# Number of tracked authors found in Authors.txt.
len(aids_metric)

3335676

In [33]:
# Write the author metrics as one JSON object per line.
with open(fpath('aids_metric_mag.json'), 'w') as ofile:
    ofile.writelines(
        json.dumps({'aid': aid, 'metric': info}) + '\n'
        for aid, info in aids_metric.items()
    )

Author paper counts

In [9]:
# Paper ID -> publication year (kept as a string) for every MAG paper that has
# a year and was published in 2018 or earlier. Column 7 is read as the year.
pid_year = {}

for line in yield_one_line(mag_dir+'Papers.csv', delimiter=',', quoting=csv.QUOTE_ALL):
    pid, year = line[0], line[7]
    # skip papers without a year and papers published after 2018
    if year == '' or int(year) > 2018:
        continue
    pid_year[pid] = year

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...


In [10]:
# Number of MAG papers with a publication year of 2018 or earlier.
len(pid_year)

219618489

In [12]:
# aid_year_paper_count: author ID -> {year -> number of author rows in that year}
# aid_pids_upto_2017:   author ID -> set of that author's paper IDs dated <= 2017
# NOTE(review): the yearly count is incremented once per row, so a paper listed
# more than once for the same author would be counted repeatedly -- confirm
# whether PaperAuthorAffiliations.txt can contain such rows.
aid_year_paper_count = defaultdict(lambda: defaultdict(int))
aid_pids_upto_2017 = defaultdict(set)

for line in yield_one_line(mag_dir+'PaperAuthorAffiliations.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, aid = line[:2]
    # ignore papers without a usable year and authors we are not tracking
    if pid not in pid_year or aid not in all_aids:
        continue
    year = pid_year[pid]
    aid_year_paper_count[aid][year] += 1
    # only count citations to these authors' papers published up to 2017.
    if int(year) <= 2017:
        aid_pids_upto_2017[aid].add(pid)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lines...
processed 360000000 lines...
processed 370000000 lines...
processed 380000000 lin

In [18]:
# Number of tracked authors with at least one paper dated 2018 or earlier.
len(aid_year_paper_count)

3334118

In [19]:
# Yearly publication counts per author, one JSON object per line. All years
# present in pid_year are included (i.e. everything up to and including 2018).
with open(fpath('aid_year_paper_count.json'), 'w') as ofile:
    ofile.writelines(
        json.dumps({'aid': aid, 'paper_cn': info}) + '\n'
        for aid, info in aid_year_paper_count.items()
    )

In [20]:
# Number of tracked authors with at least one paper dated 2017 or earlier.
# Many authors have 0 pubs before 2018 (their first pub was in 2018), so this
# is smaller than the number of tracked authors.
len(aid_pids_upto_2017)

2312690

In [13]:
# Union of all papers (dated <= 2017) written by the tracked authors; these are
# the papers whose incoming citations get counted.
pids_track = set()
for pids in aid_pids_upto_2017.values():
    pids_track |= pids

In [14]:
# Number of distinct papers whose citations are tracked.
len(pids_track)

37436167

In [15]:
# Cited paper ID -> number of citations received from papers dated <= 2017.
# Column 0 is read as the citing paper and column 1 as the cited paper.
pid_cites_upto_2017 = defaultdict(int)

for line in yield_one_line(mag_dir+'PaperReferences.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, rid = line[0], line[1]
    # citing papers without a usable year are ignored
    if pid not in pid_year:
        continue
    year = pid_year[pid]
    # citing paper has to be published before 2018.
    if rid in pids_track and int(year) <= 2017:
        pid_cites_upto_2017[rid] += 1

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [16]:
# Number of tracked papers that received at least one citation by 2017.
# Some papers may have 0 citations, so they won't be in this dictionary.
len(pid_cites_upto_2017)

24255365

In [None]:
# Author ID -> total citations (from papers dated <= 2017) received by the
# author's papers dated <= 2017. Every tracked author gets an entry; authors
# without such papers get 0.
aid_cites_upto_2017 = {}

for aid in all_aids:
    # Use .get() instead of [] on both lookups: if they are defaultdicts,
    # indexing would insert an entry for every missing key (e.g. a 0 for each
    # uncited paper), silently growing the dictionaries and changing their len().
    aid_cites_upto_2017[aid] = sum(
        pid_cites_upto_2017.get(pid, 0)
        for pid in aid_pids_upto_2017.get(aid, ())
    )

In [22]:
# NOTE(review): the saved output of this cell (2312690) matches
# len(aid_pids_upto_2017) rather than len(all_aids); it presumably comes from
# an earlier version of the aggregation cell -- confirm by re-running.
len(aid_cites_upto_2017)

2312690

In [26]:
# One entry per tracked author, including authors whose citation total is 0.
len(aid_cites_upto_2017)

3335676

In [None]:
# Write the per-author citation totals as one JSON object per line.
with open(fpath('aid_cites_upto_2017.json'), 'w') as ofile:
    for aid, cn in aid_cites_upto_2017.items():
        print(json.dumps({'aid': aid, 'cites': cn}), file=ofile)

### Code not used below

Abstract

In [22]:
# DOI -> abstract text, rebuilt from MAG's inverted index
# ({"IndexLength": n, "InvertedIndex": {word: [positions]}}).
dois_abs = {}

for line in yield_one_line(mag_dir+'PaperAbstractsInvertedIndex.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, abs_dict = line
    # Check membership first so JSON is only parsed for matched papers.
    if pid not in pid_to_doi:
        continue
    try:
        abs_dict = json.loads(abs_dict)
    except ValueError:
        # Malformed JSON (json.JSONDecodeError is a ValueError): skip the row.
        # A bare `except:` would also swallow KeyboardInterrupt and hide bugs.
        continue
    doi = pid_to_doi[pid]
    length = abs_dict["IndexLength"]
    # place every word at each of its recorded positions
    text = [''] * length
    for word, ixs in abs_dict['InvertedIndex'].items():
        for ix in ixs:
            text[ix] = word
    dois_abs[doi] = ' '.join(text)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...


In [23]:
# Number of matched DOIs for which an abstract was reconstructed.
len(dois_abs)

156679

In [24]:
# Write the reconstructed abstracts as one JSON object per line.
with open(fpath('dois_abstract.json'), 'w') as ofile:
    ofile.writelines(
        json.dumps({'doi': doi, 'abs': text}) + '\n'
        for doi, text in dois_abs.items()
    )

MAG keywords (fields of study)

In [None]:
# DOI -> list of (field ID, score) pairs read from PaperFieldsOfStudy.txt.
# Each row must have exactly three columns; the third is parsed as a float.
dois_disc = defaultdict(list)

for line in yield_one_line(mag_dir+'PaperFieldsOfStudy.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, fid, frac = line
    frac = float(frac)
    if pid not in pid_to_doi:
        continue
    dois_disc[pid_to_doi[pid]].append((fid, frac))

In [46]:
# Number of matched DOIs with at least one field-of-study row.
len(dois_disc)

268867

In [47]:
# Spot-check one DOI. .get() is used because dois_disc is a defaultdict:
# indexing a missing DOI would insert an empty list that the export cell
# would then write to disk.
dois_disc.get('10.1001/2012.jama.11132', [])

[('2780320433', 0.526666343),
 ('2779668308', 0.6762392),
 ('2780221984', 0.5918762),
 ('511355011', 0.544381559),
 ('2777391703', 0.6243656),
 ('555293320', 0.591008365),
 ('2910068830', 0.600785553),
 ('141071460', 0.387458026),
 ('2777180221', 0.6201425),
 ('71924100', 0.410550684)]

In [48]:
# Write the fields of study per DOI as one JSON object per line.
with open(fpath('dois_fields_mag.json'), 'w') as ofile:
    for doi, fields in dois_disc.items():
        print(json.dumps({'doi': doi, 'fields': fields}), file=ofile)