In [1]:
import os
import csv
import json
import random
import pickle
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict

In [2]:
# Root directory for this project's input and output files (see fpath below).
media_data_root = '/shared/0/projects/news-quotes/'
# Directory holding the raw MAG dump files scanned by the cells below.
# The trailing slash matters: those cells build paths as mag_dir + '<file name>'.
mag_dir = '/shared/0/datasets/mag/raw_data/'

def fpath(filename):
    '''Return `filename` joined onto the project data root `media_data_root`.'''
    full_path = os.path.join(media_data_root, filename)
    return full_path

def yield_one_line(filename, delimiter=',', quoting=csv.QUOTE_ALL,
                   report_every=10000000, encoding=None):
    '''Lazily yield the rows of a delimited text file, one parsed row at a time.

    Parameters
    ----------
    filename : path of the file to read.
    delimiter : field separator handed to `csv.reader`.
    quoting : one of the `csv.QUOTE_*` constants handed to `csv.reader`.
    report_every : print a progress message after every this many rows;
        pass 0 or None to stay silent.
    encoding : text encoding handed to `open`; None keeps the platform default.

    Yields
    ------
    list of str : the fields of each row, in file order.
    '''
    # newline='' is what the csv module asks for: the reader then does its own
    # line-ending handling, so line breaks inside quoted fields survive intact.
    with open(filename, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file, delimiter=delimiter, quoting=quoting)
        for count, row in enumerate(reader, start=1):
            if report_every and count % report_every == 0:
                print('processed %d lines...' % (count))
            yield row

In [3]:
# Load the collection of DOIs that the later cells match against MAG.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# this file if you trust whoever produced it.
with open(fpath('dois_mentioned.pickle'), 'rb') as pickle_file:
    dois_mentioned = pickle.load(pickle_file)

In [4]:
# Membership test: is the empty string among the mentioned DOIs?
# (presumably guards against blank DOI fields matching in the Papers.csv scan — confirm)
'' in dois_mentioned

False

In [5]:
# Size of dois_mentioned; used further down as the denominator of the match rate.
len(dois_mentioned)

292418

In [6]:
# Two independent, initially empty lookup tables:
# paper id -> DOI and DOI -> paper id.
pid_to_doi, doi_to_pid = dict(), dict()

In [9]:
# Scan every row of Papers.csv and keep only the papers whose DOI is mentioned,
# recording the mapping in both directions.
for row in yield_one_line(mag_dir+'Papers.csv', delimiter=',', quoting=csv.QUOTE_ALL):
    paper_id = row[0]
    # lowercase the DOI before the lookup
    # (presumably dois_mentioned holds lowercase DOIs — confirm)
    doi_lower = row[2].lower()
    if doi_lower not in dois_mentioned:
        continue
    pid_to_doi[paper_id] = doi_lower
    # a DOI shared by several papers keeps only the last paper id seen here
    doi_to_pid[doi_lower] = paper_id

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...


MAG `pid` and `doi` are not uniquely matched: a single DOI can be attached to several MAG paper ids.

In [10]:
# Number of MAG paper ids whose DOI is in dois_mentioned.
len(pid_to_doi)

279603

In [11]:
# Number of distinct DOIs matched; smaller than len(pid_to_doi) whenever
# several paper ids share one DOI.
len(doi_to_pid)

273558

In [12]:
# The ten DOIs attached to the most paper ids, as (doi, count) pairs,
# highest count first.
Counter(pid_to_doi.values()).most_common(10)

[('10.1111/j.0042-7092.2007.00700.x', 325),
 ('10.4172/2327-5162.s1.006', 40),
 ('10.1002/ajpa.22718', 32),
 ('10.1096/fj.1530-6860', 18),
 ('10.4172/2155-9864-c1-028', 18),
 ('10.1017/cbo9780511606632', 16),
 ('10.1007/s10339-014-0632-2', 15),
 ('10.1001/jama.283.20.2653', 12),
 ('10.1017/cbo9780511977244', 12),
 ('10.1140/epjc/s10052-015-3451-4', 12)]

In [13]:
# Print every paper id recorded for one of the heavily duplicated DOIs.
target_doi = '10.1001/jama.283.20.2653'
for paper_id in (p for p, d in pid_to_doi.items() if d == target_doi):
    print(paper_id)

148199778
187660587
1504222470
1515456818
1547332218
1580475749
1590605802
1592369239
1971009629
2033690336
2062676870
2141124189


In [14]:
# DOIs that belong to exactly one paper id; ambiguous DOIs are left out.
dois_valid = {
    doi
    for doi, n_papers in Counter(pid_to_doi.values()).items()
    if n_papers == 1
}

In [15]:
# Number of DOIs that map to exactly one paper id.
len(dois_valid)

269866

In [16]:
# Remove, from both lookup tables, every entry whose DOI is not in dois_valid.
# The keys to drop are collected first so that neither dict changes size
# while it is being iterated.
ambiguous_pids = [pid for pid, doi in pid_to_doi.items() if doi not in dois_valid]
for pid in ambiguous_pids:
    del pid_to_doi[pid]

ambiguous_dois = [doi for doi in doi_to_pid if doi not in dois_valid]
for doi in ambiguous_dois:
    del doi_to_pid[doi]

In [21]:
# Size of pid_to_doi once only entries with a DOI in dois_valid remain.
len(pid_to_doi)

269866

In [18]:
# Size of doi_to_pid once only DOIs in dois_valid remain; the two tables
# should now have the same size.
len(doi_to_pid)

269866

In [20]:
# Share of the mentioned DOIs that are still present in pid_to_doi.
len(pid_to_doi) / len(dois_mentioned)

0.9228775246393861

In [19]:
# dois_valid is not referenced by any later cell shown here; unbind the name.
del dois_valid

Abstract

In [22]:
# Rebuild the plain-text abstract of every tracked paper from the inverted
# index stored in PaperAbstractsInvertedIndex.txt.
dois_abs = {}  # DOI -> abstract text

for line in yield_one_line(mag_dir+'PaperAbstractsInvertedIndex.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, abs_dict = line
    # Test membership first: rows for papers we do not track are discarded
    # anyway, so there is no point JSON-decoding them.
    if pid not in pid_to_doi:
        continue
    try:
        abs_dict = json.loads(abs_dict)
    except json.JSONDecodeError:
        # best effort: a row whose JSON payload is malformed is skipped
        continue
    doi = pid_to_doi[pid]
    length = abs_dict["IndexLength"]
    # one slot per token position; positions absent from the index stay ''
    text = [''] * length
    for word, ixs in abs_dict['InvertedIndex'].items():
        for ix in ixs:
            text[ix] = word
    dois_abs[doi] = ' '.join(text)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...


In [23]:
# Number of DOIs for which an abstract was reconstructed.
len(dois_abs)

156679

In [24]:
# Dump the abstracts as JSON Lines: one {"doi": ..., "abs": ...} object per line.
with open(fpath('dois_abstract.json'), 'w') as out:
    out.writelines(
        json.dumps({'doi': doi, 'abs': abstract}) + '\n'
        for doi, abstract in dois_abs.items()
    )

Authors

In [25]:
# DOI -> list of author records. defaultdict(list) lets the scan of
# PaperAuthorAffiliations.txt append without first initialising each key.
dois_authors_mag = defaultdict(list)

In [26]:
# For every tracked paper, collect the author rows of PaperAuthorAffiliations.txt.
for fields in yield_one_line(mag_dir+'PaperAuthorAffiliations.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    # only the first five columns are used; any further columns are ignored
    paper_id, author_id, affiliation_id, seq_no, author_name = fields[:5]
    if paper_id not in pid_to_doi:
        continue
    author_record = (author_id, affiliation_id, seq_no, author_name)
    dois_authors_mag[pid_to_doi[paper_id]].append(author_record)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [27]:
# Number of DOIs that have at least one entry in dois_authors_mag.
len(dois_authors_mag)

269866

In [52]:
# Ratio of DOIs present in dois_authors_mag to DOIs present in doi_to_pid.
len(dois_authors_mag) / len(doi_to_pid)

1.0

In [28]:
# Spot-check the author records of one DOI.
# Gotcha: indexing a defaultdict inserts the key when it is missing, so looking
# up an unknown DOI this way would add an empty entry to dois_authors_mag.
dois_authors_mag['10.1001/2012.jama.11132']

[('164305248', '', '7', 'Scott M Grundy'),
 ('171232130', '867280407', '6', 'Ramin Farzaneh-Far'),
 ('1691429694', '1299303238', '4', 'Tiffany M. Powell-Wiley'),
 ('1893993938', '', '1', 'Ian J. Neeland'),
 ('2008178114', '', '2', 'Aslan T. Turer'),
 ('2091543069', '', '9', 'Darren K McGuire'),
 ('2096306022', '', '5', 'Gloria Lena Vega'),
 ('2109761996', '', '8', 'Amit Khera'),
 ('2129691995', '', '3', 'Colby R. Ayers'),
 ('2560969913', '', '10', 'James A. de Lemos')]

In [29]:
# Number of author records collected for that same DOI.
len(dois_authors_mag['10.1001/2012.jama.11132'])

10

In [53]:
# Write one JSON object per line: the DOI, its paper id and its author records.
with open(fpath('dois_authors_mag.json'), 'w') as out:
    for doi, authors in dois_authors_mag.items():
        record = {'doi': doi, 'mag_pid': doi_to_pid[doi], 'authors': authors}
        print(json.dumps(record), file=out)

In [30]:
# Every distinct author id that appears on at least one tracked paper.
all_aids = {
    aid
    for authors in dois_authors_mag.values()
    for aid, _aff_id, _seq, _name in authors
}

In [31]:
# Number of distinct author ids collected.
len(all_aids)

1041143

In [32]:
# Per-author figures pulled from Authors.txt for the authors in all_aids.
aids_metric = {}

for cols in yield_one_line(mag_dir+'Authors.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    author_id = cols[0]
    # columns 1, 5 and 6 are stored as (rank, paper count, citation count)
    figures = (cols[1], cols[5], cols[6])
    if author_id in all_aids:
        aids_metric[author_id] = figures

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...


In [33]:
# Number of author ids for which a row was found in Authors.txt.
len(aids_metric)

1041143

In [34]:
# Write the author figures as JSON Lines: one {"aid": ..., "metric": ...} per line.
with open(fpath('aids_metric_mag.json'), 'w') as out:
    for author_id, figures in aids_metric.items():
        serialised = json.dumps({'aid': author_id, 'metric': figures})
        out.write('%s\n' % serialised)

In [None]:
# Fields of study attached to each tracked paper, read from PaperFieldsOfStudy.txt.
dois_disc = defaultdict(list)  # DOI -> [(field id, float value), ...]

for line in yield_one_line(mag_dir+'PaperFieldsOfStudy.txt', delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, fid, frac = line
    if pid not in pid_to_doi:
        # untracked paper: skip it before paying for the float conversion
        continue
    # third column is parsed as a float (presumably a confidence score — confirm)
    dois_disc[pid_to_doi[pid]].append((fid, float(frac)))

In [46]:
# Number of DOIs that have at least one field-of-study entry.
len(dois_disc)

268867

In [47]:
# Spot-check the (field id, value) pairs of one DOI.
# Gotcha: indexing a defaultdict inserts the key when it is missing, so looking
# up an unknown DOI this way would add an empty entry to dois_disc.
dois_disc['10.1001/2012.jama.11132']

[('2780320433', 0.526666343),
 ('2779668308', 0.6762392),
 ('2780221984', 0.5918762),
 ('511355011', 0.544381559),
 ('2777391703', 0.6243656),
 ('555293320', 0.591008365),
 ('2910068830', 0.600785553),
 ('141071460', 0.387458026),
 ('2777180221', 0.6201425),
 ('71924100', 0.410550684)]

In [48]:
# Write the fields of study as JSON Lines: one {"doi": ..., "fields": ...} per line.
with open(fpath('dois_fields_mag.json'), 'w') as out:
    for doi, fields in dois_disc.items():
        print(json.dumps({'doi': doi, 'fields': fields}), file=out)