In [1]:
import os
import csv
import json
import random
import pickle
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict

In [2]:
# Directory that holds the inputs and outputs of this analysis.
data_root = '/shared/0/datasets/mag/Retraction/'
# Directory that holds the raw MAG dump files read below.
mag_dir = '/shared/0/datasets/mag/raw_data/'


def fpath(filename):
    """Return the path of `filename` located inside `data_root`."""
    full_path = os.path.join(data_root, filename)
    return full_path

def yield_one_line(filename, delimiter=',', quoting = csv.QUOTE_ALL, report_every=10000000):
    '''Lazily yield the parsed rows of a delimited text file, one row at a time.

    Args:
        filename: path of the file to read.
        delimiter: field separator handed to ``csv.reader``.
        quoting: ``csv.QUOTE_*`` constant handed to ``csv.reader``.
        report_every: print a progress message each time this many rows have
            been read; ``None`` or ``0`` switches the messages off.

    Yields:
        list of str: the fields of the next row.
    '''
    # newline='' is required by the csv module: without it the text layer
    # rewrites '\r\n' to '\n' before the reader sees it, which silently alters
    # line breaks embedded in quoted fields.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=delimiter, quoting=quoting)
        for count, row in enumerate(reader, start=1):
            if report_every and count % report_every == 0:
                print('processed %d lines...' % (count))
            yield row

In [3]:
# Load the previously pickled DOI collection.
# NOTE(review): pickle.load can execute arbitrary code; only read pickles from a trusted source.
dois_pickle_path = data_root + 'alt_dois.pickle'
with open(dois_pickle_path, 'rb') as pickle_file:
    dois = pickle.load(pickle_file)

In [4]:
# Number of entries loaded from alt_dois.pickle.
len(dois)

9471473

In [6]:
# Two independent lookup tables, filled while scanning Papers.csv:
# paper id -> DOI and DOI -> paper id.
pid_to_doi, doi_to_pid = {}, {}

In [8]:
# paper id -> year, kept only for years up to and including 2018
pid_year = {}

papers_csv = mag_dir + 'Papers.csv'
for fields in yield_one_line(papers_csv, delimiter=',', quoting=csv.QUOTE_ALL):
    # columns 0, 2 and 7 are used as paper id, DOI and year respectively
    pid, raw_doi, raw_year = fields[0], fields[2], fields[7]

    if raw_year != '':
        pub_year = int(raw_year)
        # citations only need to be counted up to 2018
        if pub_year <= 2018:
            pid_year[pid] = pub_year

    # DOIs are compared in lowercase
    doi_lower = raw_doi.lower()
    if doi_lower not in dois:
        continue
    pid_to_doi[pid] = doi_lower
    doi_to_pid[doi_lower] = pid

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...


In [36]:
# Number of papers that have a year of 2018 or earlier.
len(pid_year)

219618489

MAG `pid` and `doi` are not uniquely matched: the same DOI can be attached to several paper IDs.

In [9]:
# Number of MAG papers whose lowercased DOI is in `dois`.
len(pid_to_doi)

9028035

In [10]:
# Number of distinct matched DOIs; lower than len(pid_to_doi) when papers share a DOI.
len(doi_to_pid)

8839413

In [11]:
# The ten DOIs attached to the largest number of paper ids, as (doi, count) pairs.
Counter(pid_to_doi.values()).most_common(10)

[('10.13039/501100003329', 1238),
 ('10.3968/%x', 991),
 ('10.13039/501100000780', 939),
 ('10.3968/n', 931),
 ('10.21615/ces', 671),
 ('10.13039/501100004837', 642),
 ('10.22264/clio', 551),
 ('10.12795/pixelbit', 510),
 ('10.1590/%x', 464),
 ('10.1241/johokanri.58.389', 292)]

In [12]:
# A DOI is valid only when exactly one paper id carries it.
doi_frequency = Counter(pid_to_doi.values())
dois_valid = {doi for doi, n_papers in doi_frequency.items() if n_papers == 1}

In [13]:
# Number of DOIs that map to exactly one paper id.
len(dois_valid)

8725412

In [14]:
# Drop, in place, every mapping whose DOI is not in `dois_valid`.
# The keys to remove are collected first so the dicts are not resized while iterated.
ambiguous_pids = [pid for pid, doi in pid_to_doi.items() if doi not in dois_valid]
for pid in ambiguous_pids:
    del pid_to_doi[pid]

ambiguous_dois = [doi for doi in doi_to_pid if doi not in dois_valid]
for doi in ambiguous_dois:
    del doi_to_pid[doi]

In [15]:
# Size of the paper id -> DOI table after removing DOIs not in `dois_valid`.
len(pid_to_doi)

8725412

In [16]:
# Size of the DOI -> paper id table after the same filtering; expected to equal len(pid_to_doi).
len(doi_to_pid)

8725412

In [17]:
# Share of the loaded DOIs that remain matched to a MAG paper after filtering.
len(pid_to_doi) / len(dois)

0.9212307314817875

In [18]:
# Unbind the helper set; it is not referenced again in the cells shown below.
del dois_valid

Authors

In [19]:
# DOI -> list of (author id, affiliation id, sequence, name) tuples, filled by the next cell.
dois_authors_mag = defaultdict(list)

In [20]:
# Gather the author rows of every paper that has a uniquely matched DOI.
paa_path = mag_dir + 'PaperAuthorAffiliations.txt'
for record in yield_one_line(paa_path, delimiter='\t', quoting=csv.QUOTE_NONE):
    # the first five columns are used as paper id, author id, affiliation id, sequence and name
    pid, aid, aff_id, seq, name = record[:5]
    if pid not in pid_to_doi:
        continue
    author_entry = (aid, aff_id, seq, name)
    dois_authors_mag[pid_to_doi[pid]].append(author_entry)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [21]:
# Number of DOIs for which at least one author row was found.
len(dois_authors_mag)

8725412

In [22]:
# Fraction of matched DOIs that have at least one author row.
len(dois_authors_mag) / len(doi_to_pid)

1.0

In [23]:
# Written as one JSON object per line, even though the file suffix is .json.
with open(fpath('dois_authors_mag.json'), 'w') as out_file:
    for doi, authors in dois_authors_mag.items():
        record = {'doi': doi, 'mag_pid': doi_to_pid[doi], 'authors': authors}
        out_file.write(json.dumps(record) + '\n')

In [24]:
# Rebuild the DOI -> authors mapping from the line-per-record file written above.
dois_authors_mag = {}

with open(fpath('dois_authors_mag.json'), 'r') as in_file:
    for raw_line in in_file:
        record = json.loads(raw_line)
        dois_authors_mag[record['doi']] = record['authors']

In [25]:
# Every author id that appears on any matched paper, regardless of author position.
all_aids = {
    aid
    for authors in dois_authors_mag.values()
    for aid, aff_id, seq, name in authors
}

In [26]:
# Number of distinct author ids collected from the matched papers.
len(all_aids)

12580393

Papers by these authors (up to 2018)

In [31]:
# paper id -> set of the tracked author ids that appear on that paper
pids_by_these_authors = defaultdict(set)

paa_path = mag_dir + 'PaperAuthorAffiliations.txt'
for record in yield_one_line(paa_path, delimiter='\t', quoting=csv.QUOTE_NONE):
    pid, aid, aff_id, seq, name = record[:5]
    if aid not in all_aids:
        continue
    pids_by_these_authors[pid].add(aid)

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [32]:
# Number of papers that have at least one of the tracked authors.
len(pids_by_these_authors)

60369425

In [33]:
# author id -> {year -> number of citations received up to and including that year}
aid_year_citation_count = defaultdict(lambda: defaultdict(int))

# Publication years in the retracted and control sets span [2011, 2019],
# so citation counts are needed for the years [2010, 2018].
references_path = mag_dir + 'PaperReferences.txt'
for record in yield_one_line(references_path, delimiter='\t', quoting=csv.QUOTE_NONE):
    citing_pid, cited_pid = record[0], record[1]
    if cited_pid not in pids_by_these_authors or citing_pid not in pid_year:
        continue
    # a citation made before 2010 is already part of the 2010 total
    first_counted_year = max(pid_year[citing_pid], 2010)
    counted_years = range(first_counted_year, 2018 + 1)
    for aid in pids_by_these_authors[cited_pid]:
        # the counts are cumulative: the citation is added to every later year too
        for counted_year in counted_years:
            aid_year_citation_count[aid][counted_year] += 1

processed 10000000 lines...
processed 20000000 lines...
processed 30000000 lines...
processed 40000000 lines...
processed 50000000 lines...
processed 60000000 lines...
processed 70000000 lines...
processed 80000000 lines...
processed 90000000 lines...
processed 100000000 lines...
processed 110000000 lines...
processed 120000000 lines...
processed 130000000 lines...
processed 140000000 lines...
processed 150000000 lines...
processed 160000000 lines...
processed 170000000 lines...
processed 180000000 lines...
processed 190000000 lines...
processed 200000000 lines...
processed 210000000 lines...
processed 220000000 lines...
processed 230000000 lines...
processed 240000000 lines...
processed 250000000 lines...
processed 260000000 lines...
processed 270000000 lines...
processed 280000000 lines...
processed 290000000 lines...
processed 300000000 lines...
processed 310000000 lines...
processed 320000000 lines...
processed 330000000 lines...
processed 340000000 lines...
processed 350000000 lin

In [34]:
# Number of tracked authors that received at least one counted citation.
len(aid_year_citation_count)

10170753

In [35]:
# One JSON object per line; json.dumps writes the integer year keys as strings.
with open(fpath('aid_year_citation_count.json'), 'w') as out_file:
    for aid in aid_year_citation_count:
        record = {'aid': aid, 'citations': aid_year_citation_count[aid]}
        out_file.write(json.dumps(record) + '\n')