In [1]:
import re
import csv
import json
import pickle
import random
import requests
import pprint
import numpy as np
import pandas as pd
from scipy import stats
from langdetect import detect
from datetime import datetime
import matplotlib.pyplot as plt
from scipy.stats import chisquare
from collections import Counter, defaultdict

In [2]:
# Root directory that every input/output path in this notebook is built from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_root = '/home/haopeng/Data/Retraction/'
# Two date bounds parsed from 'YYYY-MM-DD' strings.
# NOTE(review): presumably the first and last dates covered by the Altmetric dump — confirm.
altmetric_start_date = datetime.strptime('2011-06-10', '%Y-%m-%d')
altmetric_end_date = datetime.strptime('2019-10-08', '%Y-%m-%d')
# NOTE(review): these look like two-sided standard-normal critical values keyed
# by confidence level in percent — confirm before reuse.
CIs = {'90': 1.645, '95': 1.96, '99': 2.576}
# 24*3600 = 86400; presumably the number of seconds in one day.
day = 24*3600

In [3]:
blog_df = pd.read_csv(data_root + 'blog_mentions_correction.csv', header = 0)
blog_set = set(blog_df.loc[~blog_df.maybe_news.isnull(), 'blog_name'])

def correct_blogs(data, error_blogs):
    """Re-file blog posts written by the outlets in `error_blogs` as news posts.

    For every paper that has a 'blogs' entry under 'posts', each blog item
    whose item['author']['name'] is in `error_blogs` is appended to the
    paper's 'news' list (created if absent); all other items stay in 'blogs'.
    Papers without a 'blogs' entry are left untouched.

    Parameters
    ----------
    data : dict
        Maps an identifier to a paper dict; each paper must have a 'posts' dict.
    error_blogs : container of str
        Author names whose blog items should be moved to 'news'.

    Returns
    -------
    dict
        The same `data` object, modified in place.

    Notes
    -----
    Blog items without a usable author name (missing 'author' / 'name' key,
    or an 'author' value that cannot be indexed by 'name') are dropped from
    both lists, exactly as before. Only these lookup failures are tolerated;
    any other exception now propagates instead of being silently swallowed.
    """
    for paper in data.values():
        posts = paper['posts']
        if 'blogs' not in posts:
            continue
        # Reuse the existing news list when there is one, so moved items are
        # appended after the posts that were already classified as news.
        news_li = posts.get('news', [])
        blogs_li_new = []
        for item in posts['blogs']:
            try:
                is_news = item['author']['name'] in error_blogs
            except (KeyError, TypeError):
                # No usable author name: the item cannot be classified and is
                # left out of both lists (unchanged best-effort behaviour).
                continue
            if is_news:
                news_li.append(item)
            else:
                blogs_li_new.append(item)
        posts['news'] = news_li
        posts['blogs'] = blogs_li_new
    return data

### All candidate papers in Altmetric

In [4]:
article_types = {'JOURNAL ARTICLE', 'Journal Article', 'article'}

In [126]:
paper_df = pd.read_csv(data_root+'cand_papers.csv', header=0)

In [127]:
len(paper_df)

16307030

In [128]:
paper_df.head()

Unnamed: 0,altmetric_id,doi,pub_year,journal,num_authors
0,54241359,10.1158/0008-5472.can-04-0760,2004,Cancer Research,5
1,14693950,10.4202/app.00261.2016,2016,Acta Palaeontologica Polonica,2
2,3267266,10.1098/rsbm.2004.0017,2004,Biographical Memoirs of Fellows of the Royal S...,2
3,1424904,10.1176/appi.ajp.2013.13020235,2013,American Journal of Psychiatry,2
4,781925,10.1007/s10461-012-0156-7,2012,AIDS & Behavior,4


In [129]:
alt_journals = set(paper_df.journal.to_list())

In [130]:
len(alt_journals)

107933

In [131]:
# missing doi
np.sum(paper_df.doi.isnull())

1807004

In [132]:
# Keep only the candidate papers that have a DOI, then renumber the rows 0..n-1.
paper_df = paper_df[paper_df.doi.notnull()].reset_index(drop=True)

In [133]:
len(paper_df)

14500026

In [134]:
aid_doi = dict(zip(paper_df.altmetric_id, paper_df.doi))

### Linked retracted papers

(1) retracted papers found in Altmetric

In [139]:
# One JSON record per line; index each record by its lower-cased DOI
# (a later line with the same DOI replaces an earlier one).
with open(data_root+"retraction_altmetric.json", 'r') as ffile:
    doi_alt_data = {
        record['citation']['doi'].lower(): record
        for record in (json.loads(row) for row in ffile)
    }

# Move the blog posts of outlets listed in blog_set into each paper's news list.
doi_alt_data = correct_blogs(doi_alt_data, blog_set)

In [140]:
# there are 6766 articles (papers) among these.
# among these articles:
# all have pub date (using three fields)
# 6738 have journal info
# 6572 have author info

len(doi_alt_data)

6838

(2) final set of retracted papers (cleaned pub date and retraction date)

In [141]:
# 158 records are removed since their corrected pub date is no earlier than retraction date.
# may contain non research articles.
# also may have missing info for authors and journal etc.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it was produced by a trusted step of this pipeline.
with open(data_root+'df_retract_clean.pickle', 'rb') as file:
    df_match = pickle.load(file)

In [142]:
len(df_match)

6680

In [143]:
doi_ret_date = dict(zip(df_match['OriginalPaperDOI'], df_match['RetractionDate']))

In [144]:
df_match.head(1)

Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,...,usable_days,tweet_ids,tw_date_sort,fcn_date_sort,uncertainty,news_media,social_media,blog,knowledge_repo,top_news
0,23542,Hospital Admission and Readmission Among Homel...,(HSC) Medicine - Neurology;(SOC) Psychology;,Department of Neurology and Weill Institute fo...,Neurology,American Academy of Neurology,United States,Nicole Rosendale;Elan L Guterman;John P Betjem...,,Research Article;,...,137,"[1133693569737187329, 1133763571790127106, 113...","[2019-05-29T11:16:24, 2019-05-29T15:54:34, 201...","[506, 273, 4836, 1845, 48163, 181, 935, 1322, ...",[],"[2019-05-28T15:33:34, 2019-05-28T18:38:39]","[2019-05-29T11:16:24, 2019-05-29T15:54:34, 201...",[],[],[]


In [145]:
np.sum(df_match[['pub_date_corrected', 'Journal', 'Author']].isnull(), axis=0)

pub_date_corrected    0
Journal               0
Author                2
dtype: int64

In [146]:
good_mat_df = []

# Start from the RW data (df_match) and keep the retracted papers that
# Altmetric types as 'article' and for which both a journal name and a
# positive author count can be determined.
#
# The three df_match columns are walked in lock-step with the DOI column.
# This replaces the former per-DOI boolean-mask lookup, which rescanned the
# whole frame for every paper (quadratic in the number of rows) and raised an
# unpacking error whenever a DOI occurred on more than one row.
for doi, pubdate, aut, watch_j in zip(df_match['OriginalPaperDOI'],
                                      df_match['pub_date_corrected'],
                                      df_match['Author'],
                                      df_match['Journal']):
    citation = doi_alt_data[doi]['citation']
    # exclude non articles.
    if citation['type'] != 'article':
        continue
    # The publication year is the first four characters of the corrected date.
    pub_year = int(str(pubdate)[:4])
    # Prefer Altmetric's author list; fall back to counting the
    # ';'-separated names in the Watch record when Altmetric has none.
    num_authors = len(citation['authors']) if 'authors' in citation else 0
    if num_authors == 0 and isinstance(aut, str):
        num_authors = len(aut.split(';'))
    journal = citation['journal'] if 'journal' in citation else ''
    # use info in Watch if missing in Alt.
    if journal == '' and watch_j != '':
        journal = watch_j
    if journal != '' and num_authors > 0:
        good_mat_df.append([doi, pub_year, journal, num_authors])

In [147]:
good_mat_df = pd.DataFrame(good_mat_df, columns=['doi', 'pub_year', 'journal', 'num_authors'])

In [148]:
len(good_mat_df)

6618

In [149]:
good_mat_df.loc[~good_mat_df.journal.isin(alt_journals)]

Unnamed: 0,doi,pub_year,journal,num_authors
1123,10.31219/osf.io/q9xzj,2018,OSF Preprints,1
5873,10.1109/iceit.2010.5607537,2010,2010 International Conference on Educational a...,2
5890,10.1109/iccsit.2010.5565151,2010,2010 3rd Intl Conference on Computer Science a...,6
5892,10.1109/iccsit.2010.5564997,2010,2010 3rd Intl Conference on Computer Science a...,4
5900,10.1109/icams.2010.5553104,2010,2010 IEEE International Conference on Advanced...,4
5901,10.1109/icams.2010.5552829,2010,2010 IEEE International Conference on Advanced...,3
5902,10.1109/icams.2010.5553105,2010,2010 IEEE International Conference on Advanced...,4
5904,10.1109/icams.2010.5552962,2010,2010 IEEE International Conference on Advanced...,4
6263,10.1109/wicom.2007.1605,2007,2007 International Conference on Wireless Comm...,2
6264,10.1109/wicom.2007.1611,2007,2007 International Conference on Wireless Comm...,2


In [150]:
good_mat_df = good_mat_df.loc[good_mat_df.journal.isin(alt_journals)]
good_mat_df.index = range(len(good_mat_df))

In [151]:
# no missing data in this df!
len(good_mat_df)

6608

In [152]:
good_mat_df.head()

Unnamed: 0,doi,pub_year,journal,num_authors
0,10.1212/wnl.0000000000007645,2019,Neurology,5
1,10.1371/journal.pone.0039426,2012,PLoS ONE,5
2,10.1038/nature13375,2014,Nature,13
3,10.4103/1658-354x.168797,2015,Saudi Journal of Anaesthesia,8
4,10.4103/sja.sja_81_18,2018,Saudi Journal of Anaesthesia,8


In [153]:
# later on in the analysis (`6_retract_vs_control_revision.ipynb`), we further filter based on Alt database start month, day, year.
good_mat_df = good_mat_df.loc[good_mat_df.pub_year >= 2011]
good_mat_df.index = range(len(good_mat_df))

In [154]:
len(good_mat_df)

4343

In [155]:
paper_df = paper_df.loc[paper_df.pub_year >= 2011]
paper_df.index = range(len(paper_df))

In [156]:
len(paper_df)

8571057

In [157]:
np.max(good_mat_df.pub_year)

2019

In [158]:
np.max(paper_df.pub_year)

2019

### Combine dois (to get info in MAG)

In [37]:
# Union of the lower-cased DOIs of all candidate papers and all retracted papers.
dois = {doi.lower() for doi in paper_df.doi} | {doi.lower() for doi in good_mat_df.doi}

In [38]:
len(dois)

8569168

In [268]:
# Run `mag.ipynb` to obtain paper metadata for these papers from MAG
with open(data_root+'revision/alt_dois.pickle', 'wb') as ofile:
    pickle.dump(dois, ofile)

### Add more controls before matching

In [94]:
def yield_one_line(filename, delimiter = '\t', quote = csv.QUOTE_NONE):
    '''Lazily yield the parsed rows of a delimited text file, one at a time.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    delimiter : str
        Field separator handed to csv.reader (tab by default).
    quote : int
        One of the csv.QUOTE_* constants; the default csv.QUOTE_NONE treats
        quote characters as ordinary text.

    Yields
    ------
    list of str
        The fields of each row, in file order.

    Being a generator, nothing happens (not even the progress message or the
    opening of the file) until the first row is requested.
    '''
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are altered when a quoting mode other than QUOTE_NONE is used.
    with open(filename, 'r', newline='') as file:
        print('processing %s...' %(filename))
        reader = csv.reader(file, delimiter=delimiter, quoting=quote)
        for row in reader:
            yield row

In [205]:
# Lookup table: first column of Affiliations.txt -> second column as an int.
# NOTE(review): judging by the variable names these are the affiliation id and
# its rank; get_doi_max_affi_rank keeps the minimum, so a smaller value
# presumably means a better-ranked institution — confirm against the MAG schema.
affi_rank = {}

# Affiliations.txt comes from MAG
for line in yield_one_line(data_root+'revision/Affiliations.txt'):
    # dname, lat and lon (columns 3, 9 and 10) are read but not used in this cell.
    affi_id, rank, dname, lat, lon = line[0], line[1], line[3], line[9], line[10]
    affi_rank[affi_id] = int(rank)

processing /home/haopeng/Data/Retraction/revision/Affiliations.txt...


In [206]:
affi_rank['2913133470']

19238

In [42]:
doi_author_list = {}

with open(data_root+'revision/dois_authors_mag.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        doi, authors = row['doi'], row['authors']
        doi_author_list[doi] = authors

In [123]:
len(doi_author_list)

8725412

In [45]:
# Nested lookup built from one JSON record per line: {aid: {year (int): value}}.
# Here `aid` is the id stored in the first slot of each doi_author_list entry
# (not an Altmetric id); NOTE(review): the values are presumably that author's
# citation counts for the given year — confirm against the file's producer.
aid_year_citation_count = {}

with open(data_root+'revision/aid_year_citation_count.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        aid, metric = row['aid'], row['citations']
        # JSON object keys are strings, so the years are converted to int.
        # Years before 2010 are discarded: get_doi_max_author_cite looks up
        # (pub_year - 1) and the paper tables are restricted to
        # pub_year >= 2011, so 2010 is the earliest year ever requested.
        metric = {int(year): cn for year, cn in metric.items() if int(year) >= 2010}
        aid_year_citation_count[aid] = metric

In [122]:
len(aid_year_citation_count)

10170753

In [202]:
def get_doi_max_author_cite(row):
    """Return the largest prior-year citation value among a paper's authors.

    Parameters
    ----------
    row : mapping
        Must provide 'doi' and 'pub_year' (anything int() accepts), e.g. a
        DataFrame row passed by ``df.apply(..., axis=1)``.

    Returns
    -------
    int or float
        * np.nan when the DOI is not in ``doi_author_list``;
        * otherwise the maximum, over the paper's authors that appear in
          ``aid_year_citation_count``, of the value stored for the year
          before publication (an author with no entry for that year counts
          as 0);
        * 0 when none of the authors appears in ``aid_year_citation_count``.
    """
    doi, year = row['doi'], int(row['pub_year'])
    if doi not in doi_author_list:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    prev_year = year - 1
    cites = [
        aid_year_citation_count[aid].get(prev_year, 0)
        for aid, affi_id, seq, name in doi_author_list[doi]
        if aid in aid_year_citation_count
    ]
    return max(cites) if cites else 0
    
def get_doi_max_affi_rank(doi):
    """Return the smallest affiliation rank found among a paper's authors.

    Parameters
    ----------
    doi : str
        Key into ``doi_author_list``.

    Returns
    -------
    int or float
        The minimum of ``affi_rank[affi_id]`` over the paper's author entries
        whose affiliation id is present in ``affi_rank``; np.nan when the DOI
        is unknown or none of its entries has a ranked affiliation.
    """
    if doi not in doi_author_list:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    # an author can have multiple affi_ids
    ranks = [
        affi_rank[affi_id]
        for aid, affi_id, seq, name in doi_author_list[doi]
        if affi_id in affi_rank
    ]
    return min(ranks) if ranks else np.nan

In [207]:
good_mat_df['top_author_cite'] = good_mat_df.apply(get_doi_max_author_cite, axis=1)
good_mat_df['top_affiliation_rank'] = good_mat_df['doi'].apply(get_doi_max_affi_rank)

In [208]:
len(good_mat_df.loc[~good_mat_df.doi.isin(doi_author_list)])

364

In [209]:
len(good_mat_df.loc[good_mat_df.top_author_cite.isna()])

364

In [210]:
# pd.set_option('display.max_rows', 2000)
good_mat_df.loc[good_mat_df.top_affiliation_rank.isna()]

Unnamed: 0,doi,pub_year,journal,num_authors,top_author_cite,top_affiliation_rank
10,10.1007/s11274-015-1821-6,2015,World Journal of Microbiology & Biotechnology,4,27.0,
12,10.1038/cgt.2015.6,2015,Cancer Gene Therapy,5,0.0,
15,10.1016/j.scr.2018.07.008,2018,Stem Cell Research,10,26890.0,
16,10.1002/jcp.29146,2019,Journal of Cellular Physiology,3,,
22,10.1186/s40425-019-0637-6,2019,Journal for Immunotherapy of Cancer,5,,
...,...,...,...,...,...,...
4326,10.1109/icbmei.2011.5917937,2011,2011 International Conference on Business Mana...,2,0.0,
4330,10.1016/j.proeng.2011.04.229,2011,Procedia Engineering,3,0.0,
4334,10.1016/j.ces.2011.02.014,2011,Chemical Engineering Science,5,,
4335,10.3109/09537104.2010.547958,2011,Platelets,3,84.0,


In [126]:
doi_author_list['10.1038/ncomms8956']

[['1923040749', '', '2', 'E. Tiourin'],
 ['2047678383', '', '4', 'D. Y. Paik'],
 ['2118413852', '', '6', 'M. Pellegrini'],
 ['2311268613', '', '1', 'D. M. Janzen'],
 ['2342135843', '', '5', 'J. Lu'],
 ['2475107784', '', '7', 'S. Memarzadeh'],
 ['2639235885', '', '3', 'J. A. Salehi']]

In [211]:
good_mat_df = good_mat_df[['doi', 'pub_year', 'journal', 'num_authors', 'top_author_cite']]
good_mat_df = good_mat_df.dropna(axis=0)
good_mat_df.index = range(len(good_mat_df))

In [212]:
len(good_mat_df)

3979

In [213]:
good_mat_df.head()

Unnamed: 0,doi,pub_year,journal,num_authors,top_author_cite
0,10.1212/wnl.0000000000007645,2019,Neurology,5,2106.0
1,10.1371/journal.pone.0039426,2012,PLoS ONE,5,393.0
2,10.1038/nature13375,2014,Nature,13,12339.0
3,10.4103/1658-354x.168797,2015,Saudi Journal of Anaesthesia,8,20.0
4,10.4103/sja.sja_81_18,2018,Saudi Journal of Anaesthesia,8,126.0


In [221]:
paper_df['top_author_cite'] = paper_df.apply(get_doi_max_author_cite, axis=1)

In [219]:
len(paper_df)

8571057

In [220]:
paper_df.head()

Unnamed: 0,altmetric_id,doi,pub_year,journal,num_authors,top_author_cite
0,14693950,10.4202/app.00261.2016,2016,Acta Palaeontologica Polonica,2,592.0
1,1424904,10.1176/appi.ajp.2013.13020235,2013,American Journal of Psychiatry,2,5586.0
2,781925,10.1007/s10461-012-0156-7,2012,AIDS & Behavior,4,1815.0
3,3721487,10.1056/nejm-jw.na35186,2014,Journal Watch,1,678.0
4,15338148,10.1021/acs.jctc.6b01085,2017,Journal of Chemical Theory and Computation,2,4274.0


In [222]:
paper_df = paper_df.dropna(axis=0)
paper_df.index = range(len(paper_df))

In [223]:
len(paper_df)

7864302

In [227]:
paper_df['top_author_cite_perc'] = paper_df['top_author_cite'].rank(pct=True)

In [225]:
paper_df.head()

Unnamed: 0,altmetric_id,doi,pub_year,journal,num_authors,top_author_cite,top_author_cite_perc
0,14693950,10.4202/app.00261.2016,2016,Acta Palaeontologica Polonica,2,592.0,0.352438
1,1424904,10.1176/appi.ajp.2013.13020235,2013,American Journal of Psychiatry,2,5586.0,0.767297
2,781925,10.1007/s10461-012-0156-7,2012,AIDS & Behavior,4,1815.0,0.539324
3,3721487,10.1056/nejm-jw.na35186,2014,Journal Watch,1,678.0,0.370974
4,15338148,10.1021/acs.jctc.6b01085,2017,Journal of Chemical Theory and Computation,2,4274.0,0.714708


In [226]:
top_author_cite_perc_dict = dict(zip(paper_df['top_author_cite'], paper_df['top_author_cite_perc']))

## (v6) Find matches (journal, year, num of authors, author citation)

In [108]:
# first get all candidate papers
doi_control_aids_all = dict()

# The v6 match uses exactly these five columns. Selecting them by name keeps
# the unpacking valid whatever other columns good_mat_df carries, and avoids a
# loop variable called `affi_rank`, which would overwrite the affiliation-rank
# dict that get_doi_max_affi_rank reads.
match_cols = ['doi', 'pub_year', 'journal', 'num_authors', 'top_author_cite']

for ix in range(len(good_mat_df)):
    doi, pub_year, journal, num_authors, author_cite = good_mat_df.loc[ix, match_cols]
    # Percentile of this citation count within the candidate pool.
    # NOTE(review): raises KeyError if this exact value never occurs in paper_df.
    author_cite_p = top_author_cite_perc_dict[author_cite]
    # Candidates: same journal, publication year within +/-2, author count
    # within +/-2, top-author citation percentile within +/-0.2.
    # `@journal` passes the journal name to query() as a variable, so names
    # containing double quotes or backslashes no longer break the expression.
    query_str = 'journal==@journal & pub_year>=%d & pub_year<=%d & num_authors>=%d & num_authors<=%d & top_author_cite_perc>=%f & top_author_cite_perc<=%f'%(pub_year-2, pub_year+2, num_authors-2, num_authors+2, author_cite_p-0.2, author_cite_p+0.2)
    sub_df = paper_df.query(query_str)
    sub_aids = sub_df.altmetric_id.tolist()
    if len(sub_aids) > 0:
        doi_control_aids_all[doi] = sub_aids
    if (ix+1)%100 == 0:
        print('processed %d papers...'%(ix+1))

processed 100 papers...
processed 200 papers...
processed 300 papers...
processed 400 papers...
processed 500 papers...
processed 600 papers...
processed 700 papers...
processed 800 papers...
processed 900 papers...
processed 1000 papers...
processed 1100 papers...
processed 1200 papers...
processed 1300 papers...
processed 1400 papers...
processed 1500 papers...
processed 1600 papers...
processed 1700 papers...
processed 1800 papers...
processed 1900 papers...
processed 2000 papers...
processed 2100 papers...
processed 2200 papers...
processed 2300 papers...
processed 2400 papers...
processed 2500 papers...
processed 2600 papers...
processed 2700 papers...
processed 2800 papers...


## (v7) Find matches (additionally considering affiliation rank)

In [108]:
# first get all candidate papers
doi_control_aids_all = dict()

# NOTE(review): this cell relies on state that is not created anywhere in the
# visible notebook: the dict `top_affi_rank_perc_dict`, the column
# `top_affiliation_rank_perc` of paper_df, and the column
# `top_affiliation_rank` of good_mat_df (an earlier cell keeps only the first
# five columns). Confirm how they were built before re-running.
#
# Columns are selected by name, and the affiliation value is unpacked into
# `top_affi_rank` rather than `affi_rank`, so the affiliation-rank dict read
# by get_doi_max_affi_rank is no longer overwritten by the loop.
match_cols = ['doi', 'pub_year', 'journal', 'num_authors', 'top_author_cite', 'top_affiliation_rank']

for ix in range(len(good_mat_df)):
    doi, pub_year, journal, num_authors, author_cite, top_affi_rank = good_mat_df.loc[ix, match_cols]
    author_cite_p = top_author_cite_perc_dict[author_cite]
    affi_rank_p = top_affi_rank_perc_dict[top_affi_rank]
    # Same criteria as the v6 match, plus affiliation-rank percentile within +/-0.2.
    # `@journal` passes the journal name to query() as a variable, so names
    # containing double quotes or backslashes no longer break the expression.
    query_str = 'journal==@journal & pub_year>=%d & pub_year<=%d & num_authors>=%d & num_authors<=%d & top_author_cite_perc>=%f & top_author_cite_perc<=%f & top_affiliation_rank_perc>=%f & top_affiliation_rank_perc<=%f'%(pub_year-2, pub_year+2, num_authors-2, num_authors+2, author_cite_p-0.2, author_cite_p+0.2, affi_rank_p-0.2, affi_rank_p+0.2)
    sub_df = paper_df.query(query_str)
    sub_aids = sub_df.altmetric_id.tolist()
    if len(sub_aids) > 0:
        doi_control_aids_all[doi] = sub_aids
    if (ix+1)%100 == 0:
        print('processed %d papers...'%(ix+1))

processed 100 papers...
processed 200 papers...
processed 300 papers...
processed 400 papers...
processed 500 papers...
processed 600 papers...
processed 700 papers...
processed 800 papers...
processed 900 papers...
processed 1000 papers...
processed 1100 papers...
processed 1200 papers...
processed 1300 papers...
processed 1400 papers...
processed 1500 papers...
processed 1600 papers...
processed 1700 papers...
processed 1800 papers...
processed 1900 papers...
processed 2000 papers...
processed 2100 papers...
processed 2200 papers...
processed 2300 papers...
processed 2400 papers...
processed 2500 papers...
processed 2600 papers...
processed 2700 papers...
processed 2800 papers...


In [109]:
# exclude retracted papers from control set.
# doi_alt_data is keyed by lower-cased DOIs, whereas aid_doi stores the DOIs
# as they appear in cand_papers.csv; the candidate DOI is therefore
# lower-cased before the membership test, otherwise a retracted paper whose
# DOI contains upper-case characters would slip into the control pool.
doi_control_aids_all = {
    doi: [aid for aid in aids if aid_doi[aid].lower() not in doi_alt_data]
    for doi, aids in doi_control_aids_all.items()
}

# Drop the retracted papers that are left without any candidate control.
doi_control_aids_all = {doi: aids for doi, aids in doi_control_aids_all.items() if len(aids) > 0}

In [110]:
len(doi_control_aids_all)

2828

In [111]:
len(doi_control_aids_all)/len(good_mat_df)

0.9901960784313726

In [112]:
doi_control_aids = {}
aids_seen = set()
num_c = 5
# Dedicated, seeded generator: the draw below was previously unseeded, so the
# control set differed on every run. NOTE(review): the seed value is arbitrary;
# control sets saved by earlier (unseeded) runs cannot be regenerated from it.
control_rng = random.Random(0)

# Greedy assignment in dict order: each retracted paper receives num_c
# controls drawn from its candidates that no earlier paper has taken, so a
# control is never shared. Papers with fewer than num_c unused candidates
# get no entry at all.
for doi, aids in doi_control_aids_all.items():
    unused = [aid for aid in aids if aid not in aids_seen]
    if len(unused) >= num_c:
        res = control_rng.sample(unused, num_c)
        doi_control_aids[doi] = res
        aids_seen.update(res)

In [113]:
len(doi_control_aids)

2630

In [114]:
len(doi_control_aids)/len(good_mat_df)

0.9208683473389355

In [115]:
with open(data_root+'revision/doi_control_alt_ids_revision_v7.json', 'w') as ofile:
    for doi, aids in doi_control_aids.items():
        line = {'doi': doi, 'alt_ids': aids}
        ofile.write(json.dumps(line) + '\n')

Get control papers from Altmetric

In [116]:
# Flatten the per-paper control lists into one set of Altmetric ids.
alt_ids = {aid for aids in doi_control_aids.values() for aid in aids}

In [117]:
len(alt_ids)

13150

In [118]:
control_li = []
cn = 0

# Scan the full Altmetric dump and keep the records of the selected controls.
with open(data_root + 'merged.txt', 'r') as ofile:
    for row in ofile:
        # Some physical lines hold several JSON objects glued together; a line
        # break is re-inserted between them before splitting, which handles
        # well-formed and glued lines alike.
        for record in row.replace('}{"altmetric_id"', '}\n{"altmetric_id"').split('\n'):
            if record == '':
                continue
            paper = json.loads(record)
            cn += 1
            if cn % 5000000 == 0:
                print('processed %d records...'%cn)
            if paper['altmetric_id'] in alt_ids:
                control_li.append(paper)

processed 5000000 records...
processed 10000000 records...
processed 15000000 records...
processed 20000000 records...
processed 25000000 records...


In [119]:
len(control_li)

13150

In [None]:
with open(data_root+"revision/control_alt_papers_revision_v7.json", 'w') as ffile:
    for paper in control_li:
        ffile.write(json.dumps(paper) + '\n')

## (v6) Get the top author's citation count for each paper (at pub year)

These data are used in the regression analysis that controls for the log of the top author's citation count.

In [124]:
doi_control_aids ={}

with open(data_root+"revision/doi_control_alt_ids_revision_v6.json", 'r') as ffile:
    for line in ffile:
        line = json.loads(line)
        doi_control_aids[line['doi']] = [int(x) for x in line['alt_ids']]

In [125]:
len(doi_control_aids)

3851

In [175]:
tem = good_mat_df.loc[good_mat_df.doi.isin(doi_control_aids)].copy()
tem.index = range(len(tem))
tem['top_author_cite'] = tem.apply(get_doi_max_author_cite, axis=1)

In [176]:
tem

Unnamed: 0,doi,pub_year,journal,num_authors,top_author_cite
0,10.1212/wnl.0000000000007645,2019,Neurology,5,2106
1,10.1371/journal.pone.0039426,2012,PLoS ONE,5,393
2,10.1038/nature13375,2014,Nature,13,12339
3,10.4103/1658-354x.168797,2015,Saudi Journal of Anaesthesia,8,20
4,10.4103/sja.sja_81_18,2018,Saudi Journal of Anaesthesia,8,126
...,...,...,...,...,...
3846,10.1016/j.aml.2011.01.019,2011,Applied Mathematics Letters,1,303
3847,10.1016/j.procs.2010.12.074,2011,Procedia Computer Science,2,0
3848,10.1016/j.amc.2011.02.038,2011,Applied Mathematics & Computation,4,17
3849,10.1371/journal.pone.0016011,2011,PLoS ONE,3,1069


In [166]:
tem_aids = set()
for doi, aids in doi_control_aids.items():
    for aid in aids:
        tem_aids.add(aid)

In [167]:
len(tem_aids)

19255

In [169]:
tem_1 = paper_df.loc[paper_df.altmetric_id.isin(tem_aids)].copy()
tem_1.index = range(len(tem_1))
tem_1['top_author_cite'] = tem_1.apply(get_doi_max_author_cite, axis=1)

In [173]:
tem_1

Unnamed: 0,altmetric_id,doi,pub_year,journal,num_authors,top_author_cite
0,21408520,10.1111/1440-1681.12806,2017,Clinical & Experimental Pharmacology & Physiology,3,0
1,4471406,10.1186/s40478-015-0232-0,2015,Acta Neuropathologica Communications,15,31182
2,4658129,10.1186/s40001-014-0068-2,2014,European Journal of Medical Research,6,22
3,33378280,10.1155/2013/353270,2013,BioMed Research International,5,2626
4,10860533,10.1111/phor.12153,2016,Photogrammetric Record,3,170
...,...,...,...,...,...,...
19250,2577967,10.1139/bcb-2013-0127,2014,Biochemistry and Cell Biology,2,0
19251,6122978,10.1007/s12035-016-9754-0,2016,Molecular Neurobiology,7,88
19252,6705886,10.1021/acs.oprd.5b00139,2015,Organic Process Research & Development,6,784
19253,52445630,10.1002/mbo3.764,2018,MicrobiologyOpen,6,13367


In [179]:
tem_comb = pd.concat([tem[['doi', 'top_author_cite']], tem_1[['doi', 'top_author_cite']]])

In [180]:
len(tem_comb)

23106

In [181]:
doi_top_author_cite_upon_pub = dict(zip(tem_comb.doi, tem_comb.top_author_cite))

In [182]:
len(doi_top_author_cite_upon_pub)

23105

In [183]:
with open(data_root+'revision/doi_top_author_cite_upon_pub.json', 'w') as ofile:
    for doi, cite in doi_top_author_cite_upon_pub.items():
        row = {'doi': doi, 'top_author_cite': cite}
        ofile.write(json.dumps(row) + '\n')