In [2]:
import re
import csv
import json
import pickle
import random
import requests
import pprint
import numpy as np
import pandas as pd
from scipy import stats
from langdetect import detect
from datetime import datetime
import matplotlib.pyplot as plt
from scipy.stats import chisquare
from collections import Counter, defaultdict

In [3]:
# Directory that holds every input and output file used in this notebook.
data_root = '/home/haopeng/Data/Retraction/'

# Date window bounds; the end date matches the `oct-8th-dump` archive name used below
# (presumably the coverage window of the Altmetric data -- confirm).
altmetric_start_date, altmetric_end_date = (
    datetime.strptime(stamp, '%Y-%m-%d') for stamp in ('2011-06-10', '2019-10-08')
)

# Multipliers keyed by confidence level in percent
# (presumably two-sided normal critical values -- confirm against their use).
CIs = dict(zip(('90', '95', '99'), (1.645, 1.96, 2.576)))

# 24 hours x 3600 (presumably the number of seconds in one day).
day = 3600 * 24

In [4]:
# Table of blog names; rows whose `maybe_news` column is filled in identify the
# blogs that `correct_blogs` later re-files under news.
blog_df = pd.read_csv(data_root + 'blog_mentions_correction.csv', header = 0)
flagged_as_news = ~blog_df['maybe_news'].isnull()
blog_set = set(blog_df.loc[flagged_as_news, 'blog_name'])

def correct_blogs(data, error_blogs):
    """Re-file blog posts written by the given authors under each paper's news list.

    Parameters
    ----------
    data : dict
        Maps an identifier to a paper record. Every record needs a 'posts'
        entry; only records whose 'posts' contains a 'blogs' key are touched.
    error_blogs : container
        Author names (compared against ``item['author']['name']``) whose blog
        posts should be treated as news.

    Returns
    -------
    dict
        The same ``data`` object; records are modified in place.

    Notes
    -----
    A blog post without a usable ``author.name`` cannot be classified and is
    dropped from both lists. Only the lookup errors that signal a missing or
    malformed author (KeyError / TypeError) are tolerated; any other exception
    propagates instead of being silently swallowed.
    """
    for paper in data.values():
        posts = paper['posts']
        if 'blogs' not in posts:
            continue
        # Extend the existing news list in place when there is one.
        news_li = posts['news'] if 'news' in posts else []
        blogs_li_new = []
        for item in posts['blogs']:
            try:
                is_news = item['author']['name'] in error_blogs
            except (KeyError, TypeError):
                # Missing/malformed author info: skip the post entirely.
                continue
            if is_news:
                news_li.append(item)
            else:
                blogs_li_new.append(item)
        posts['news'] = news_li
        posts['blogs'] = blogs_li_new
    return data

### All candidate papers in Altmetric

In [None]:
# Number of records per `citation.type` value. The parsing cell further down
# rebuilds `type_cn` from `merged.txt` starting from an empty defaultdict, so this
# literal is presumably the saved result of that tally. The values sum to
# 26,222,754 records, of which 18,266,114 carry one of the three article labels
# ('article', 'Journal Article', 'JOURNAL ARTICLE').
type_cn = defaultdict(int,
            {'article': 18266091,
             'chapter': 6255269,
             'book': 1490369,
             'clinical_trial_study_record': 49758,
             'news': 130360,
             'dataset': 30883,
             'Journal Article': 15,
             'JOURNAL ARTICLE': 8,
             'Editorial': 1})

# example of a chapter
{'altmetric_id': 40328044, 'counts': {'readers': {'mendeley': 0, 'citeulike': 0, 'connotea': 0}, 'total': {'posts_count': 0}}, 'citation': {'aggregated_in': [36052229], 'attribution': 'springer_nature', 'authors': ['David Spring'], 'book': {'title': 'Convex Integration Theory', 'isbns': ['9783034800594', '9783034800600'], 'attribution': 'springer_nature'}, 'doi': '10.1007/978-3-0348-0060-0_8', 'first_seen_on': '2018-05-01T07:53:03+00:00', 'issns': [], 'links': ['https://doi.org/10.1007/978-3-0348-0060-0_8'], 'ordinal_number': 8, 'pdf_url': 'https://link.springer.com/content/pdf/10.1007%2F978-3-0348-0060-0_8.pdf', 'pubdate': '1998-01-01T00:00:00+00:00', 'publisher': 'Springer, Basel', 'title': 'Ample Relations', 'type': 'chapter'}, 'altmetric_score': {'score': 0, 'score_history': {'1y': 0, '6m': 0, '3m': 0, '1m': 0, '1w': 0, '6d': 0, '5d': 0, '4d': 0, '3d': 0, '2d': 0, '1d': 0, 'at': 0}, 'context_for_score': None}, 'demographics': [], 'posts': []}

In [5]:
# Spellings of `citation.type` that are kept as research articles.
article_types = set(['article', 'Journal Article', 'JOURNAL ARTICLE'])

In [6]:
def get_pub_year_alt(paper):
    """Return the earliest year found among a record's citation date fields.

    The first four characters of `pubdate`, `epubdate` and `first_seen_on`
    (whichever are present in ``paper['citation']``) are read as a year and
    the smallest one is returned. Returns 0 when none of the fields exists.
    """
    citation = paper['citation']
    years = [
        int(citation[field][:4])
        for field in ('pubdate', 'epubdate', 'first_seen_on')
        if field in citation
    ]
    return min(years) if years else 0

In [225]:
# Prerequisite: concatenate every file extracted from `oct-8th-dump.tar.gz` into `merged.txt`.

paper_li = []
cn = 0
type_cn = defaultdict(int)

with open(data_root + 'merged.txt', 'r') as ofile:
    for row in ofile:
        # Insert a line break at every `}{"altmetric_id"` boundary so that a line
        # holding several objects (presumably glued together during merging) and
        # an ordinary one-object line are parsed the same way.
        for record in row.replace('}{"altmetric_id"', '}\n{"altmetric_id"').split('\n'):
            if record == '':
                continue
            paper = json.loads(record)
            citation = paper['citation']
            cn += 1
            if cn % 5000000 == 0:
                print('processed %d records...'%cn)
            type_cn[citation['type']] += 1
            # Only research articles are kept as candidates.
            if citation['type'] not in article_types:
                continue
            alt_id = paper['altmetric_id']
            pubyear = get_pub_year_alt(paper)
            # Missing fields fall back to '' (doi, journal) or 0 (author count).
            doi = citation['doi'].lower() if 'doi' in citation else ''
            journal = citation['journal'] if 'journal' in citation else ''
            num_authors = len(citation['authors']) if 'authors' in citation else 0
            paper_li.append([alt_id, doi, pubyear, journal, num_authors])

processed 5000000 records...
processed 10000000 records...
processed 15000000 records...
processed 20000000 records...
processed 25000000 records...


In [226]:
len(paper_li)

18266114

In [227]:
print('num. of non articles: %d'%(cn-len(paper_li)))

num. of non articles: 7956640


In [230]:
paper_df = pd.DataFrame(paper_li, columns=['altmetric_id', 'doi', 'pub_year', 'journal', 'num_authors'])
del paper_li

In [231]:
len(paper_df)

18266114

In [237]:
# missing pub year
len(paper_df.loc[paper_df.pub_year <= 0])

140

In [239]:
# missing journal 
len(paper_df.loc[paper_df.journal == ''])

1958971

In [238]:
# missing author
len(paper_df.loc[paper_df.num_authors <= 0])

730914

In [249]:
# Keep only rows that have both a journal name and a publication year,
# then renumber the rows from 0.
has_journal_and_year = (paper_df['journal'] != '') & (paper_df['pub_year'] > 0)
paper_df = paper_df.loc[has_journal_and_year]
paper_df.index = range(len(paper_df))

In [18]:
# paper_df.to_csv(data_root+'cand_papers.csv', index=False, header=True)
paper_df = pd.read_csv(data_root+'cand_papers.csv', header=0)

In [19]:
len(paper_df)

16307030

In [20]:
paper_df.dtypes

altmetric_id     int64
doi             object
pub_year         int64
journal         object
num_authors      int64
dtype: object

In [21]:
paper_df.head()

Unnamed: 0,altmetric_id,doi,pub_year,journal,num_authors
0,54241359,10.1158/0008-5472.can-04-0760,2004,Cancer Research,5
1,14693950,10.4202/app.00261.2016,2016,Acta Palaeontologica Polonica,2
2,3267266,10.1098/rsbm.2004.0017,2004,Biographical Memoirs of Fellows of the Royal S...,2
3,1424904,10.1176/appi.ajp.2013.13020235,2013,American Journal of Psychiatry,2
4,781925,10.1007/s10461-012-0156-7,2012,AIDS & Behavior,4


In [22]:
aid_doi = dict(zip(paper_df.altmetric_id, paper_df.doi))

In [23]:
# missing alt id
np.sum(paper_df.altmetric_id.isnull())

0

In [24]:
# missing author
len(paper_df.loc[paper_df.num_authors <= 0])

344158

In [25]:
# missing doi
np.sum(paper_df.doi.isnull())

1807004

In [26]:
alt_journals = set(paper_df.journal.to_list())

In [27]:
len(alt_journals)

107933

### Linked retracted papers

(1) load retracted papers found in Altmetric

In [7]:
# One JSON record per line; index every record by its lower-cased DOI
# (a later line with the same DOI replaces an earlier one).
with open(data_root+"retraction_altmetric.json", 'r') as ffile:
    parsed_records = (json.loads(raw_line) for raw_line in ffile)
    doi_alt_data = {
        rec['citation']['doi'].lower(): rec
        for rec in parsed_records
    }

# Re-file blog posts from the outlets in `blog_set` under news.
doi_alt_data = correct_blogs(doi_alt_data, blog_set)

In [8]:
# there are 6766 articles (papers) among these.
# among these articles:
# all have pub date (using three fields)
# 6738 have journal info
# 6572 have author info

len(doi_alt_data)

6838

(2) final set of retracted papers (cleaned pub date and retraction date; see `3_create_retract_dataframe.ipynb`)

In [9]:
# 158 records are removed since their corrected pub date is no earlier than retraction date.
with open(data_root+'df_retract_clean.pickle', 'rb') as file:
    df_match = pickle.load(file)

In [10]:
len(df_match)

6680

In [11]:
doi_ret_date = dict(zip(df_match['OriginalPaperDOI'], df_match['RetractionDate']))

In [12]:
df_match.head(1)

Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,...,altmetric_pub_date,pub_date_corrected,usable_days,news_media,social_media,blog,knowledge_repo,top_news,uncertainty,tweet_cn
0,23542,Hospital Admission and Readmission Among Homel...,(HSC) Medicine - Neurology;(SOC) Psychology;,Department of Neurology and Weill Institute fo...,Neurology,American Academy of Neurology,United States,Nicole Rosendale;Elan L Guterman;John P Betjem...,,Research Article;,...,2019-05-24,2019-05-24 00:00:00,137,"[2019-05-28T15:33:34, 2019-05-28T18:38:39]","[2019-05-29T11:16:24, 2019-05-29T15:54:34, 201...",[],[],[],[],"[2019-05-29T11:16:24, 2019-05-29T15:54:34, 201..."


In [13]:
np.sum(df_match[['pub_date_corrected', 'Journal', 'Author']].isnull(), axis=0)

pub_date_corrected    0
Journal               0
Author                2
dtype: int64

In [14]:
good_mat_df = []

# start from RetractionWatch data
# Walk the needed columns once, row by row. Re-filtering `df_match` with a boolean
# mask for every DOI is quadratic and fails to unpack if a DOI occurs twice.
retract_rows = zip(df_match['OriginalPaperDOI'], df_match['pub_date_corrected'],
                   df_match['Author'], df_match['Journal'])

for doi, pubdate, aut, watch_j in retract_rows:
    citation = doi_alt_data[doi]['citation']
    # exclude non articles.
    if citation['type'] != 'article':
        continue
    # The year is the first four characters of the corrected publication date.
    pub_year = int(str(pubdate)[:4])

    num_authors = len(citation['authors']) if 'authors' in citation else 0
    # Fall back to the ';'-separated Author field from Watch; a missing value is
    # not a str and is skipped, and empty fragments are not counted as authors.
    if num_authors == 0 and isinstance(aut, str):
        num_authors = sum(1 for name in aut.split(';') if name.strip())

    journal = citation['journal'] if 'journal' in citation else ''
    # use info in Watch if missing in Alt (ignoring a missing/NaN Watch value).
    if journal == '' and isinstance(watch_j, str) and watch_j != '':
        journal = watch_j

    # Keep only rows for which both journal and author count are known.
    if journal != '' and num_authors > 0:
        good_mat_df.append([doi, pub_year, journal, num_authors])

In [15]:
good_mat_df = pd.DataFrame(good_mat_df, columns=['doi', 'pub_year', 'journal', 'num_authors'])

In [16]:
len(good_mat_df)

6618

In [28]:
good_mat_df.loc[~good_mat_df.journal.isin(alt_journals)]

Unnamed: 0,doi,pub_year,journal,num_authors
1123,10.31219/osf.io/q9xzj,2018,OSF Preprints,1
5873,10.1109/iceit.2010.5607537,2010,2010 International Conference on Educational a...,2
5890,10.1109/iccsit.2010.5565151,2010,2010 3rd Intl Conference on Computer Science a...,6
5892,10.1109/iccsit.2010.5564997,2010,2010 3rd Intl Conference on Computer Science a...,4
5900,10.1109/icams.2010.5553104,2010,2010 IEEE International Conference on Advanced...,4
5901,10.1109/icams.2010.5552829,2010,2010 IEEE International Conference on Advanced...,3
5902,10.1109/icams.2010.5553105,2010,2010 IEEE International Conference on Advanced...,4
5904,10.1109/icams.2010.5552962,2010,2010 IEEE International Conference on Advanced...,4
6263,10.1109/wicom.2007.1605,2007,2007 International Conference on Wireless Comm...,2
6264,10.1109/wicom.2007.1611,2007,2007 International Conference on Wireless Comm...,2


In [29]:
good_mat_df = good_mat_df.loc[good_mat_df.journal.isin(alt_journals)]
good_mat_df.index = range(len(good_mat_df))

In [30]:
# no missing data in this df!
len(good_mat_df)

6608

In [31]:
good_mat_df.head()

Unnamed: 0,doi,pub_year,journal,num_authors
0,10.1212/wnl.0000000000007645,2019,Neurology,5
1,10.1371/journal.pone.0039426,2012,PLoS ONE,5
2,10.1038/nature13375,2014,Nature,13
3,10.4103/1658-354x.168797,2015,Saudi Journal of Anaesthesia,8
4,10.4103/sja.sja_81_18,2018,Saudi Journal of Anaesthesia,8


In [32]:
good_mat_df.dtypes

doi            object
pub_year        int64
journal        object
num_authors     int64
dtype: object

## Matching control papers

In [34]:
# first get all candidate papers in the same journal and year as each retracted paper.
doi_control_aids_all = dict()

# Index the candidates once by (pub_year, journal). This replaces one
# `paper_df.query(...)` scan per retracted paper, whose expression was built by
# string interpolation and therefore broke on journal titles containing `"` or
# backslashes. Only journals that occur in `good_mat_df` are indexed.
wanted_journals = set(good_mat_df.journal)
cand_df = paper_df.loc[paper_df.journal.isin(wanted_journals)]
aids_by_year_journal = defaultdict(list)
# `.tolist()` yields plain Python values, so the ids stay JSON-serialisable;
# ids are appended in `paper_df` row order.
for cand_year, cand_journal, cand_aid in zip(cand_df.pub_year.tolist(),
                                             cand_df.journal.tolist(),
                                             cand_df.altmetric_id.tolist()):
    aids_by_year_journal[(cand_year, cand_journal)].append(cand_aid)

for ix in range(len(good_mat_df)):
    doi, pub_year, journal, num_authors = good_mat_df.loc[ix]
    # `.get` avoids creating empty entries in the defaultdict.
    sub_aids = aids_by_year_journal.get((pub_year, journal), [])
    if len(sub_aids) > 0:
        # Copy so that papers sharing a journal/year do not share one list object.
        doi_control_aids_all[doi] = list(sub_aids)
    if (ix+1)%100 == 0:
        print('processed %d papers...'%(ix+1))

processed 100 papers...
processed 200 papers...
processed 300 papers...
processed 400 papers...
processed 500 papers...
processed 600 papers...
processed 700 papers...
processed 800 papers...
processed 900 papers...
processed 1000 papers...
processed 1100 papers...
processed 1200 papers...
processed 1300 papers...
processed 1400 papers...
processed 1500 papers...
processed 1600 papers...
processed 1700 papers...
processed 1800 papers...
processed 1900 papers...
processed 2000 papers...
processed 2100 papers...
processed 2200 papers...
processed 2300 papers...
processed 2400 papers...
processed 2500 papers...
processed 2600 papers...
processed 2700 papers...
processed 2800 papers...
processed 2900 papers...
processed 3000 papers...
processed 3100 papers...
processed 3200 papers...
processed 3300 papers...
processed 3400 papers...
processed 3500 papers...
processed 3600 papers...
processed 3700 papers...
processed 3800 papers...
processed 3900 papers...
processed 4000 papers...
processed

In [275]:
# e.g., if a retracted paper has only a DOI but no pub date in the Altmetric db, it won't be in paper_df, but it will be in good_mat_df with the pub date taken from the Retraction Watch db.
good_mat_df.loc[~good_mat_df.doi.isin(paper_df.doi)]


Unnamed: 0,doi,pub_year,journal,num_authors
2558,10.1097/00005537-199904000-00028,1999,The Laryngoscope,3
2560,10.1097/00005537-199912000-00017,1999,The Laryngoscope,3
2625,10.1002/14651858.cd004142.pub2,2010,Cochrane Database of Systematic Reviews,2
3712,10.1002/14651858.cd007025.pub2,2014,Cochrane Database of Systematic Reviews,5
3821,10.1002/14651858.cd009008.pub2,2013,Cochrane Database of Systematic Reviews,9
3978,10.1002/14651858.cd001364.pub4,2013,Cochrane Database of Systematic Reviews,2
4825,10.1136/bcr-2012-007666,2012,BMJ Case Reports,4
5807,10.1109/icise.2010.5688639,2010,The 2nd International Conference on Informatio...,4
5859,10.1109/iccasm.2010.5620213,2010,2010 International Conference on Computer Appl...,4
5871,10.1109/icife.2010.5609505,2010,2010 2nd IEEE International Conference on Info...,1


In [271]:
len(doi_control_aids_all)

6599

In [301]:
len(doi_control_aids_all)/len(good_mat_df)

0.9986380145278451

In [299]:
[doi for doi in good_mat_df.doi if doi not in doi_control_aids_all]

['10.11532/structcivil.58a.250',
 '10.1299/mer.17-00374',
 '10.1002/14651858.cd004142.pub2',
 '10.1002/14651858.cd007025.pub2',
 '10.1002/14651858.cd009008.pub2',
 '10.1002/14651858.cd001364.pub4',
 '10.1016/j.bbagrm.2013.07.001',
 '10.5430/jnep.v2n4p145',
 '10.1016/s2211-9477(11)70001-2']

In [215]:
# Drop every candidate whose DOI belongs to a retracted paper in `doi_alt_data`.
non_retracted = {}
for doi, aids in doi_control_aids_all.items():
    kept_aids = []
    for aid in aids:
        if aid_doi[aid] not in doi_alt_data:
            kept_aids.append(aid)
    non_retracted[doi] = kept_aids
doi_control_aids_all = non_retracted

In [234]:
doi_control_aids = {}
aids_seen = set()
num_c = 5

# Dedicated, seeded generator so that re-running this cell draws the same control
# papers (the draw was previously unseeded and changed on every run).
# NOTE(review): a previously saved `doi_control_alt_ids.json` came from an unseeded
# draw and will not match; both the selected ids and the number of matched papers
# can differ because the assignment below is greedy.
CONTROL_SAMPLE_SEED = 42
control_rng = random.Random(CONTROL_SAMPLE_SEED)

for doi, aids in doi_control_aids_all.items():
    # Candidates not already assigned to an earlier retracted paper.
    dod = [aid for aid in aids if aid not in aids_seen]
    # Papers with fewer than `num_c` unused candidates get no controls at all.
    if len(dod) >= num_c:
        res = control_rng.sample(dod, num_c)
        doi_control_aids[doi] = res
        aids_seen.update(res)

In [241]:
# Persist the matching as JSON lines: one {'doi', 'alt_ids'} object per retracted paper.
with open(data_root+'doi_control_alt_ids.json', 'w') as ofile:
    ofile.writelines(
        json.dumps({'doi': doi, 'alt_ids': aids}) + '\n'
        for doi, aids in doi_control_aids.items()
    )

In [242]:
# Reload the saved matching: DOI -> list of integer Altmetric ids.
doi_control_aids = {}

with open(data_root+"doi_control_alt_ids.json", 'r') as ffile:
    for entry in map(json.loads, ffile):
        doi_control_aids[entry['doi']] = list(map(int, entry['alt_ids']))

In [243]:
len(doi_control_aids)

6367

In [300]:
len(doi_control_aids)/len(good_mat_df)

0.9635290556900726

Get control papers from Altmetric

In [236]:
# Every Altmetric id that was selected as a control for some retracted paper.
alt_ids = {aid for aids in doi_control_aids.values() for aid in aids}

In [237]:
len(alt_ids)

31835

In [244]:
control_li = []
cn = 0

with open(data_root + 'merged.txt', 'r') as ofile:
    for row in ofile:
        # Break the line at every `}{"altmetric_id"` boundary so that lines holding
        # one object and lines holding several are handled identically.
        pieces = row.replace('}{"altmetric_id"', '}\n{"altmetric_id"').split('\n')
        for record in pieces:
            if record == '':
                continue
            paper = json.loads(record)
            cn += 1
            if cn % 5000000 == 0:
                print('processed %d records...'%cn)
            # Keep the full record of every selected control paper.
            if paper['altmetric_id'] in alt_ids:
                control_li.append(paper)

processed 5000000 records...
processed 10000000 records...
processed 15000000 records...
processed 20000000 records...
processed 25000000 records...


In [245]:
len(control_li)

31835

In [246]:
# Save the control papers' full records as JSON lines (one record per line).
with open(data_root+"control_alt_papers.json", 'w') as ffile:
    ffile.writelines(json.dumps(paper) + '\n' for paper in control_li)

Run `2_crawl_tweets.ipynb` to retrieve tweets for these control papers