### STEP 4: Data Collection of Retraction Year
This file collects retraction years for retracted items for the following sources:
1. Retraction Watch
2. PubMed
3. Web of Science platform: i. BIOABS ii. BCI iii. CCC and iv. Web of Science Core
4. Crossref 

Input Files:
- unionlist/unionlist_with_nodoi_{date}.csv
- pubmed/pubmed_retractedpublication_{date}.csv
- webofscience/bci_retractedpublication_{date}.csv
- webofscience/bioabs_retractedpublication_{date}.csv
- webofscience/ccc_retractedpublication_{date}.csv
- webofscience/webofsciencecore_retractedpublication_{date}.csv
- crossref/crossref_recordswithdoi_{date}.csv
- retractionwatch/retractionwatch_{date}.csv

Output Files:
- pubmed/pubmed_retractionyear_{date}.csv
- webofscience/bci_retractionyear_{date}.csv
- webofscience/bioabs_retractionyear_{date}.csv
- webofscience/ccc_retractionyear_{date}.csv
- webofscience/webofsciencecore_retractionyear_{date}.csv
- crossref/crossref_retractionyear_{date}.csv
- unionlist/unionlist_with_retractionyear_{date}.csv


In [None]:
import requests
import time,datetime
from bs4 import BeautifulSoup as bs

import os
import re
import csv
import numpy as np
import unicodedata
import json

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

### Directory Setup

In [None]:
# Resolve the project root (the current working directory) and derive the
# data and result folders from it.
retraction_index_path = os.path.abspath('./.')

data_dir = f"{retraction_index_path}/data/"  # data directory
result_dir = f"{retraction_index_path}/result/"

### Configuration File

In [None]:
# Load configuration (contact e-mail and API keys) from config.json.
# The context manager closes the handle even when the JSON cannot be parsed.
with open(retraction_index_path+"/config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)

# Initializing variables from the configuration file
my_email = config['my_email']
elsevier_api_key = config['Elsevier_APIKEY']
elsevier_insttoken = config['insttoken']
ieee_xplore_api_key = config['IEEEXplore_APIKEY']
wos_api_key = config['WoS_APIKEY']

In [None]:
# Global initialization
# NOTE: these names are already assigned at module (cell) level above, so the
# `global` statements below do not change how they are looked up; functions in
# this notebook read them (e.g. my_email) as ordinary module-level globals.
global my_email
global elsevier_api_key
global elsevier_insttoken
global ieee_xplore_api_key
global wos_api_key

In [None]:
def convert_unicode(string: str) -> str:
    """
    Drop characters that cannot be represented in single-byte Western encodings.

    The text is compatibility-decomposed (NFKD) and then round-tripped through
    iso-8859-1, latin1 and cp1252 in turn; anything a codec cannot encode is
    silently discarded.
    E.g. '10.\u200b1105/\u200btpc.\u200b010357' ->  '10.1105/tpc.010357'

    :param string: text to clean
    :return: the cleaned text
    """
    for codec in ('iso-8859-1', 'latin1', 'cp1252'):
        decomposed = unicodedata.normalize('NFKD', string)
        string = decomposed.encode(codec, 'ignore').decode(codec)
    return string

In [None]:
def batch_items(pmids:list, cut:int)-> list[list]:
    """
    Split ``pmids`` into consecutive batches of at most ``cut`` items.

    :param pmids: list of pmids
    :param cut: maximum number of records to assign to a batch (must be >= 1)

    :return: list of batches (each a list of pmids); empty list for empty input
    :raises ValueError: if ``cut`` is not a positive number (the previous
        implementation looped forever for ``cut == 0``)
    """
    if cut <= 0:
        raise ValueError(f'cut must be a positive integer, got {cut!r}')

    return [pmids[start:start + cut] for start in range(0, len(pmids), cut)]
    

In [None]:
"""
# Inputting last date when the retraction publication were collected each source as stated in STEP 1 notebook
"""
# Collection date (YYYY-MM-DD) of each source's retracted-publication file;
# used below to build the input and output file names.
getdate = dict(
    scopus='2024-07-05',
    crossref='2024-07-03',
    retractionwatch='2024-07-03',
    pubmed='2024-07-03',
    # Engineering Village databases
    geobase='2024-07-05',
    compendex='2024-07-09',
    # Web of Science platform databases
    bci='2024-07-03',
    bioabs='2024-07-03',
    ccc='2024-07-03',
    medline='2024-07-03',
    webofsciencecore='2024-07-03',
    # merged list of all sources
    unionlist='2024-07-09',
)



In [None]:
# Date of this run as an ISO 'YYYY-MM-DD' string.
today = datetime.date.today().isoformat()
today

# PubMed

In [None]:
def retrieve_pmids(term:str,mindate:int, maxdate:int)->tuple[int, list]:
    """
    It retrieves pmids for a given search term from the PubMed ESearch API.

    :param term: search term
    :param mindate: the year to start the search
    :param maxdate: the year to end the search

    :return: tuple ``(total_results, pmids)`` where ``total_results`` is the
        hit count reported by PubMed and ``pmids`` is the list of returned ids
        (at most ``retmax`` = 10000 of them)
    """

    api_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    params = {
        "db": "pubmed",
        "term": term,
        "retmode": "json",
        "retmax": 10000,  # Maximum number of results per request
        "mindate": mindate,
        "maxdate": maxdate,
        "email": my_email,  # contact address from config.json, identifies the caller to NCBI
    }

    # Send a GET request to the PubMed API and unpack the JSON search result
    response = requests.get(api_url, params=params)
    search_result = response.json()["esearchresult"]

    total_results = int(search_result["count"])
    pmids = search_result["idlist"]
    # The original returned the undefined name `pmid`, which raised NameError.
    return total_results, pmids

In [None]:
def fetch_all_pmids(term:str, start_year:int, end_year:int, interval_year:int)->tuple[int, list]:
    """
    It retrieves pmids for a given search term over a period of time using retrieve_pmids function.
    The period is queried in windows of ``interval_year`` years because at most 10,000 records
    can be retrieved from PubMed at a time. Check: https://www.ncbi.nlm.nih.gov/books/NBK25501/ for details

    :param term: search term
    :param start_year: the year to start the search
    :param end_year: the year to end the search (inclusive)
    :param interval_year: number of years covered by each request

    :return: tuple ``(total_pmids_count, all_pmids)``: the summed hit counts
        reported by PubMed and the concatenated list of retrieved pmids
    """
    all_pmids = []
    total_pmids_count = 0
    final_year = end_year

    for year in range(start_year, final_year + 1, interval_year):
        # The last window is clipped so it never runs past the requested end year.
        window_end = min(year + interval_year - 1, final_year)

        count, pmids_per_interval = retrieve_pmids(term, year, window_end)
        total_pmids_count += count
        all_pmids += pmids_per_interval

        print(f'{year} - {window_end}: {count} total number of retrieved pmids')

        # PubMed reports more hits than it returned: the window is too wide and
        # records would otherwise be lost without notice.
        if count > len(pmids_per_interval):
            print(f'WARNING: {year} - {window_end}: only {len(pmids_per_interval)} of {count} '
                  f'pmids were returned; use a smaller interval_year')

    return total_pmids_count, all_pmids

In [None]:
def retrieve_retracted_paper_data_from_metadata(pmid:list):
    """
    It retrieves the PubMed XML for one pmid or a batch of pmids (EFetch API).

    :param pmid: a single pmid, or a list/tuple/set of pmids; blank entries are skipped
    :return: XML text returned by EFetch for the requested pmid(s)
    """

    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    # EFetch documents `id` as a single UID or a comma-delimited UID list.
    if isinstance(pmid, (list, tuple, set)):
        cleaned_ids = (str(one_pmid).strip() for one_pmid in pmid)
        id_value = ",".join(one_id for one_id in cleaned_ids if one_id)
    else:
        id_value = str(pmid).strip()

    params = {
        "db": "pubmed",
        "id": id_value,
        "retmode": "xml",
        "email": my_email,  # contact address from config.json, identifies the caller to NCBI
    }

    # The XML is returned unparsed; callers extract title, dates, etc. themselves.
    response = requests.get(efetch_url, params=params)
    paper_xml = response.text

    return paper_xml



In [None]:
_DATE_FIELDS = ('Year', 'Month', 'Day')
_PUBMED_HISTORY_STATUSES = ('pubmed', 'medline', 'entrez')


def _date_from_tag(date_tag, fallback):
    """Return [year, month, day] read from an ArticleDate/PubDate tag; absent parts keep *fallback*."""
    parts = list(fallback)
    for position, field in enumerate(_DATE_FIELDS):
        field_tag = date_tag.find(field)
        if field_tag is not None:
            parts[position] = field_tag.string
    return parts


def _date_from_history(soup, fallback):
    """Return [year, month, day] from the first PubMedPubDate status present (pubmed, medline, entrez)."""
    parts = list(fallback)
    for status in _PUBMED_HISTORY_STATUSES:
        history_dates = soup.find_all('PubMedPubDate', {'PubStatus': status})
        if not history_dates:
            continue
        # When several entries share a status, the last non-empty value wins.
        for history_date in history_dates:
            for position, field in enumerate(_DATE_FIELDS):
                field_tag = history_date.find(field)
                if field_tag is not None and field_tag.text:
                    parts[position] = field_tag.text
        break
    return parts


def extract_retraction_notice_metadata(soup:bs)->list:
    """
    It extracts data from the XML metadata for retraction notice

    :param soup: one <PubmedArticle> element in BeautifulSoup XML format

    :return: list ``[rn_pmid, doi, pub_year, title, pub_type, jour_title,
        jour_abrv, retractionOf]``. ``pub_year`` is ``'year:month:day'`` with
        ``9999``/``99``/``99`` standing in for parts that were not found;
        ``retractionOf`` is the pmid named by a ``RetractionOf`` comment, or
        ``None`` when the record has none. Fields that cannot be read keep
        their empty defaults.
    """
    # Initialize all variables
    rn_pmid, doi = '', ''   # retraction notice paper's pmid and doi
    pub_year = ''           # publication date of the notice, 'year:month:day'
    jour_abrv, jour_title, title = '', '', ''  # journal abbreviation, journal title, article title
    pub_type = ''           # publication type(s) such as letter, article etc.
    retractionOf = None

    try:
        id_list = soup.ArticleIdList

        # extract pmid
        if soup.PMID:
            rn_pmid = soup.PMID.string
        elif id_list is not None and id_list.find(IdType="pubmed"):
            rn_pmid = id_list.find(IdType="pubmed").string

        # extract doi
        if id_list is not None:
            doi_tag = id_list.find(IdType="doi")
            if doi_tag is not None:
                doi = doi_tag.string

        # fetching publication date: ArticleDate first, otherwise PubDate
        date_parts = ['9999', '99', '99']
        if soup.ArticleDate:
            date_parts = _date_from_tag(soup.ArticleDate, date_parts)
        elif soup.PubDate:
            date_parts = _date_from_tag(soup.PubDate, date_parts)

        # no year found yet: fall back to the PubMedPubDate history entries
        if date_parts[0] == '9999':
            date_parts = _date_from_history(soup, date_parts)

        pub_year = ':'.join(str(part) for part in date_parts)

        # extract title
        if soup.ArticleTitle is not None:
            title = soup.ArticleTitle.string

        # extract journal title
        if soup.Title is not None:
            jour_title = soup.Title.string

        # extract journal title abbreviation
        if soup.ISOAbbreviation is not None:
            jour_abrv = soup.ISOAbbreviation.string

        # extract publication types
        if soup.PublicationTypeList is not None:
            type_tags = soup.PublicationTypeList.find_all()
            pub_type = ';'.join(tag.string for tag in type_tags if tag.string)
        elif soup.PublicationType is not None:
            # the original referenced the undefined name `check_soup` here
            pub_type = soup.PublicationType.string

        # Attribute 'RetractionOf' marks the record as a retraction notice; a
        # record without it is not an error, retractionOf simply stays None.
        retraction_of = soup.find('CommentsCorrections', attrs={'RefType': 'RetractionOf'})
        if retraction_of is not None:
            pmid = retraction_of.find('PMID')
            if pmid is not None:
                retractionOf = pmid.text

    except Exception as e:
        # Best effort: report the problem (with its cause) and return what was collected so far.
        print(f'Error at {rn_pmid} with {doi}: {e!r}')

    return [rn_pmid, doi, pub_year, title, pub_type,
            jour_title, jour_abrv, retractionOf]

In [None]:
"""
- Reading in PubMed file and renaming some columns
- Extract the publication year & retracted year and convert to 'int' type. 
"""
# Column names of the STEP 1 PubMed file mapped to the names used in this notebook.
pubmed_column_map = {
    'doi': 'DOI',
    'au_names': 'Author',
    'title': 'Title',
    'journal_title': 'Journal',
    'year': 'Year',
    'pmid': 'PubMedID',
    'retraction_notice_pmid': 'RetractionPubMedID',
    'rn_doi': 'RetractionDOI',
    'retracted_year': 'RetractionDate',
}
pubmed_csv = data_dir+"/pubmed/pubmed_retractedpublication_"+getdate['pubmed']+".csv"
pubmed = pd.read_csv(pubmed_csv).rename(columns=pubmed_column_map)

pubmed['source'] = 'PubMed'

pubmed['DOI'] = pubmed['DOI'].str.strip().astype(str).apply(convert_unicode)

# 'Year' is stored as 'year:month:day'; keep only the year as an integer.
pubmed['Year'] = pubmed['Year'].str.split(':').str[0].astype(int)

# Both PMID columns become text: missing values turn into '' and the rest into integer text.
for pmid_column in ('PubMedID', 'RetractionPubMedID'):
    pmid_as_int = pubmed[pmid_column].fillna(0).astype(int)
    pubmed[pmid_column] = pmid_as_int.replace(0, '').astype(str).str.strip()

pubmed

In [None]:
"""
Get retraction PubMedID, in which will be used to fetch retraction year
"""
# Rows whose RetractionPubMedID was missing carry '' here, and several retracted
# items can share one notice. Keep each non-empty id once, in first-seen order,
# so no API request is spent on blanks or repeats.
retraction_notice_pmids = [
    rn_pmid for rn_pmid in dict.fromkeys(pubmed['RetractionPubMedID'].to_list()) if rn_pmid
]
no_records = 300
retraction_notice_pmids_batches = batch_items(retraction_notice_pmids, no_records)

print(f'The retraction notice pmids are divided into {len(retraction_notice_pmids_batches)} batches')
# Report the number of ids actually batched (previously the row count of `pubmed`).
print(f'There are {len(retraction_notice_pmids)} pmids divided into lists in which each list contains {no_records} records maximum')

In [None]:
"""
Run this cell to get data from the API.

Running the pipeline for the 'retraction_notice_pmids' and saving the result as
'pubmed_retractiondetail.csv'
"""

rn_header = ['retraction_notice_pmid', 'rn_doi', 'retracted_year', 'title', 'pub_type',
       'journal_title', 'journal_abrv', 'retractionOf_pmid']

batch_total = len(retraction_notice_pmids_batches)

# The context manager closes (and flushes) the CSV even if a request or parse
# fails part-way, so the rows written so far are kept.
with open(data_dir+"pubmed/pubmed_retractiondetail.csv","w",encoding = "utf-8", newline = "") as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(rn_header)

    for count, selected_pmids in enumerate(tqdm(retraction_notice_pmids_batches), start=1):
        print(f'batch {count}/{batch_total}: {len(selected_pmids)} records')

        retraction_notice_papers_xml = retrieve_retracted_paper_data_from_metadata(selected_pmids)

        # One <PubmedArticle> element per returned record
        rn_soup = bs(retraction_notice_papers_xml,'xml')
        rn_papers_xml = rn_soup.find_all('PubmedArticle')

        for per_paper_xml in rn_papers_xml:
            csvout.writerow(extract_retraction_notice_metadata(per_paper_xml))

        # Pause between batches before issuing the next request
        time.sleep(10)

In [None]:
"""
Load already-gotten retraction data from PubMed in the above cell
"""

# Column names of the retraction-detail file mapped to the names used in this notebook.
retraction_detail_columns = {
    'title': 'Title',
    'journal_title': 'Journal',
    'year': 'Year',
    'pmid': 'PubMedID',
    'retraction_notice_pmid': 'RetractionPubMedID',
    'rn_doi': 'RetractionDOI',
    'retracted_year': 'RetractionYear',
}
pubmed_retraction = pd.read_csv(f"{data_dir}pubmed/pubmed_retractiondetail.csv", encoding='utf-8')
pubmed_retraction = pubmed_retraction.rename(columns=retraction_detail_columns)

# 'RetractionYear' arrives as 'year:month:day'; keep the year part as an integer.
retraction_year_text = pubmed_retraction['RetractionYear'].str.split(':').str[0]
pubmed_retraction['RetractionYear'] = retraction_year_text.astype(int)

pubmed_retraction['RetractionPubMedID'] = pubmed_retraction['RetractionPubMedID'].astype(str).str.strip()

pubmed_retraction

In [None]:
pubmed_retraction_pmids = [int(x) for x in pubmed_retraction['RetractionPubMedID'].to_list()]

# retraction_notice_pmids holds strings while pubmed_retraction_pmids holds ints;
# subtracting the raw sets never removed anything. Compare both as text and
# ignore the blank placeholder used for items without a notice pmid.
fetched_pmids = {str(x) for x in pubmed_retraction_pmids}
requested_pmids = {str(x).strip() for x in retraction_notice_pmids}
retraction_pmid_undone = requested_pmids - fetched_pmids - {''}
len(retraction_pmid_undone)

In [None]:
"""
Save RetractionYear for PubMed

ToDo:
Uncomment ".to_csv..." to save the file
"""
# Keep a single row per retraction notice (the first one seen).
pubmed_retraction.drop_duplicates(subset=['RetractionPubMedID'], keep='first', inplace=True)

# Attach the first three columns of the notice table to each retracted item,
# matching on the notice pmid (left join: every retracted item is kept).
notice_columns = pubmed_retraction.iloc[:, :3]
pubmed_updated = pubmed.merge(notice_columns, how='left', on='RetractionPubMedID')

# Items without a matching notice get RetractionYear 0.
pubmed_updated['RetractionYear'] = pubmed_updated['RetractionYear'].fillna(0).astype(int)

# Saving is opt-in; uncomment to write the file:
# pubmed_updated.to_csv(data_dir+'pubmed/pubmed_retractionyear_'+getdate['pubmed']+'.csv')

pubmed_updated[['PubMedID','DOI','Year','RetractionPubMedID','RetractionYear']]

In [None]:
# Print dtypes and non-null counts for the key columns of pubmed_updated.
pubmed_updated[['PubMedID','DOI','Year','RetractionPubMedID','RetractionYear']].info()

# Web of Science

In [None]:
"""
The following databases: i. BIOABS ii. BCI iii. CCC and iv. Web of Science Core of Web of Science platform 
have  patterns in item title that say:
".... (Retracted article. See vol. 122, 2021".  It will contain "Retracted article" and "See".
Hence we are extracting the retraction year from the title with pattern recognition.

"""
pass

In [None]:
def readin_wosDBs_files(df_dir: str):
    """
    Read a Web of Science platform export (BCI, BIOABS, CCC or Core) from CSV.

    Columns are renamed to the notebook's common names, 'Year' is coerced to an
    integer (0 when it cannot be parsed) and 'DOI' is stripped, cast to text and
    cleaned with convert_unicode.

    :param df_dir: path of the CSV file to read
    :return: the prepared DataFrame
    """
    wos_column_map = {
        'Authors': 'Author',
        'Article Title': 'Title',
        'Source Title': 'Journal',
        'Publication Year': 'Year',
        'Pubmed Id': 'PubMedID',
    }
    df = pd.read_csv(df_dir).rename(columns=wos_column_map)

    # Unparseable or missing years become 0.
    numeric_year = pd.to_numeric(df['Year'], errors='coerce').fillna(0)
    df['Year'] = numeric_year.astype(float).astype(int)

    stripped_doi = df['DOI'].str.strip().astype(str)
    df['DOI'] = stripped_doi.apply(convert_unicode)

    return df

In [None]:
def get_wos_retractionyear(title: str):
    """
    It gets retraction year from the title of the article.

    Everything from the first (case-insensitive) "Retracted" onwards is taken,
    the characters ``, ( ) .`` are removed, and the last space-separated token
    is returned when it is a four-digit number.
    E.g. '... (Retracted article. See vol. 122, 2021)' -> '2021'

    :param title: the title of the article (input data)
    :return: retracted year as a string, or '0' when the title is empty, is not
        text, has no "Retracted" marker, or does not end in a four-digit number
    """

    if title=='':
        return '0'

    pattern = r'(?i)Retracted.*'

    try:
        match = re.search(pattern, title)
    except TypeError:
        # non-text input such as NaN
        return '0'

    if match is None:
        return '0'

    new_text = re.sub(r'\s+', ' ', match[0])
    # strip() keeps trailing whitespace from hiding the final token
    new_text = re.sub(r'[,().]', '', new_text).strip()
    retractionyear = new_text.split(' ')[-1]

    # Require digits: a four-letter last word (e.g. "zinc") is not a year and
    # would break the integer conversion done by the caller.
    if len(retractionyear)==4 and retractionyear.isdecimal():
        return retractionyear
    return '0'




def wosDBs_get_retractionyear(df: pd.DataFrame):
    """
    Add an integer 'RetractionYear' column derived from each record's title.

    Titles follow the pattern ".... (Retracted article. See vol. 122, 2021";
    get_wos_retractionyear pulls the trailing four-character year out of it.
    Records without a year get 0.

    Note: the 'Title' column of the passed DataFrame itself is cleaned in place
    (missing -> '', runs of whitespace -> single space) before it is copied.

    :param df: DataFrame with a 'Title' column
    :return: copy of ``df`` with the extra 'RetractionYear' column
    """
    whitespace_run = re.compile(r'\s+')

    # Clean the titles on the caller's frame
    df['Title'] = df['Title'].fillna('')
    df['Title'] = df['Title'].map(lambda raw_title: whitespace_run.sub(' ', raw_title))

    df_updated = df.copy()

    # Titles without a retraction year end up as 0
    extracted_years = df_updated['Title'].apply(get_wos_retractionyear)
    df_updated['RetractionYear'] = extracted_years
    df_updated['RetractionYear'] = df_updated['RetractionYear'].fillna('0')
    df_updated['RetractionYear'] = df_updated['RetractionYear'].astype(int)

    df_withRetractionYear = int((df_updated['RetractionYear'] != 0).sum())

    print(f'The total number of records with retraction year is {df_withRetractionYear} items')

    return df_updated

    

In [None]:
"""
Load BCI retracted dataset and extract retraction year from the Title
"""

# Load the BCI export and derive RetractionYear from the titles.
bci_dir = f"{data_dir}webofscience/bci_retractedpublication_{getdate['bci']}.csv"
bci = readin_wosDBs_files(bci_dir)
bci_updated = wosDBs_get_retractionyear(bci)

# Saving is opt-in; uncomment to write the file:
# bci_updated.to_csv(data_dir+'webofscience/bci_retractionyear_'+getdate['bci']+'.csv')

bci_updated.head()

In [None]:
"""
Load BIOABS retracted dataset and extract retraction year from the Title
"""

# Load the BIOABS export and derive RetractionYear from the titles.
bioabs_dir = f"{data_dir}webofscience/bioabs_retractedpublication_{getdate['bioabs']}.csv"
bioabs = readin_wosDBs_files(bioabs_dir)
bioabs_updated = wosDBs_get_retractionyear(bioabs)

# This cell writes its result to disk.
bioabs_out = data_dir+'webofscience/bioabs_retractionyear_'+getdate['bioabs']+'.csv'
bioabs_updated.to_csv(bioabs_out)

bioabs_updated.head()


In [None]:
"""
Load CCC retracted dataset and extract retraction year from the Title
"""
# Load the CCC export and derive RetractionYear from the titles.
ccc_dir = f"{data_dir}webofscience/ccc_retractedpublication_{getdate['ccc']}.csv"
ccc = readin_wosDBs_files(ccc_dir)
ccc_updated = wosDBs_get_retractionyear(ccc)

# Saving is opt-in; uncomment to write the file:
# ccc_updated.to_csv(data_dir+'webofscience/ccc_retractionyear_'+getdate['ccc']+'.csv')

ccc_updated.head()

In [None]:
"""
Load Web of Science Core retracted dataset and extract retraction year from the Title
"""
# Load the Web of Science Core export and derive RetractionYear from the titles.
woscore_dir = f"{data_dir}webofscience/webofsciencecore_retractedpublication_{getdate['webofsciencecore']}.csv"
woscore = readin_wosDBs_files(woscore_dir)
woscore_updated = wosDBs_get_retractionyear(woscore)

# Saving is opt-in; uncomment to write the file:
# woscore_updated.to_csv(data_dir+'webofscience/webofsciencecore_retractionyear_'+getdate['webofsciencecore']+'.csv')

woscore_updated.head()

# Crossref

In [None]:
"""
- Reading in the Crossref file and extracting its retracted item DOIs, which will be used to search for their
  retraction year
"""

# Read the Crossref records (with DOI) and drop the saved index column.
crossref_csv = f"{data_dir}crossref/crossref_recordswithdoi_{getdate['crossref']}.csv"
crossref = pd.read_csv(crossref_csv).drop(columns='Unnamed: 0')
crossref.head(5)

In [None]:
"""
Extracting the Crossref retracted item DOIs for RetractionYear request from its API
"""

# Distinct, non-missing DOIs of the Crossref retracted items
crossref_dois = list(set(crossref['DOI'].dropna()))
print(len(crossref_dois))

# Split the extracted DOIs in 10 batches
crossref_dois_chunks = np.array_split(crossref_dois, 10)

In [None]:
"""
Searching for the retraction detail via Crossref API: retraction year and reason of retracted items
"""

from urllib.parse import quote

start = time.time()

crossref_retraction= []
base_url = "https://api.crossref.org"

for crossref_doi_batch in crossref_dois_chunks[:]:

    # c_doi is the position of the DOI inside its batch; it is reported in error messages
    for c_doi, doi in enumerate(tqdm(crossref_doi_batch)):
        # Percent-encode the DOI so characters such as '#', '?' or '<' are sent
        # as part of the path instead of being read as URL syntax.
        url = f"{base_url}/works/{quote(str(doi))}"
        response = requests.get(url)

        if response.status_code == 200:
            data = response.json()

            # Records without an 'update-to' entry carry no retraction detail and are skipped.
            updates = data.get('message', {}).get('update-to')

            if updates:
                try:
                    first_update = updates[0]
                    error_type = first_update['type'] or ''
                    date_parts = first_update['updated']['date-parts'][0]
                except (KeyError, IndexError, TypeError) as e:
                    # Malformed update entry: report it and move on to the next DOI
                    print(f'error at {doi}  at index: {c_doi}. Error: {e}')
                else:
                    # Keep the date as a list ([year, month, day]); [0] marks
                    # "no date" so that later code can always read element 0.
                    retractionyear = date_parts if date_parts else [0]
                    crossref_retraction.append([doi,retractionyear,error_type])

        # Short pause between requests
        time.sleep(.10)


end = time.time()
end - start

In [None]:
"""
Merger data from Crossref API and save the file

ToDo:
Uncomment (line 15):" crossref_retractionyear.to_csv..." to save the file to drive
"""
# If running in multiple batches, concatenate the batches
# crossref_retractionyear= pd.concat([cross_tempo, crossref_retraction], axis=0, ignore_index=True).rename(
#     columns={0:'DOI', 1: 'RetractionYear', 2:'Reason'})

def _crossref_year(date_parts):
    """Return the year from a Crossref date-parts list ([year, month, day]); 0 when it is absent."""
    if isinstance(date_parts, (list, tuple)):
        year = date_parts[0] if date_parts else 0
    else:
        year = date_parts
    return int(year) if year else 0


# Naming the columns up front keeps this working when no retraction detail was collected.
crossref_retractionyear = pd.DataFrame(crossref_retraction, columns=['DOI', 'RetractionYear', 'Reason'])
crossref_retractionyear['RetractionYear'] = crossref_retractionyear['RetractionYear'].apply(_crossref_year)

# Saving is opt-in; uncomment to write the file:
# crossref_retractionyear.to_csv(data_dir+'crossref/crossref_retractionyear_only.csv')
crossref_retractionyear

In [None]:
"""
Load already-gotten retraction year data from Crossref in the  above cell
"""
# Reload the saved Crossref retraction years and drop the saved index column.
crossref_retractionyear = pd.read_csv(data_dir+'crossref/crossref_retractionyear_only.csv')
crossref_retractionyear = crossref_retractionyear.drop(columns=['Unnamed: 0'])
crossref_retractionyear.head()


In [None]:
"""
Merging RetractionYear to Crossref retracted items

ToDo:
Uncomment (line 12): ".to_csv(data_dir...." to save the complete Crossref data with retraction year
"""

# Left-join the retraction year and reason onto the Crossref records by DOI.
year_and_reason = crossref_retractionyear[['DOI','RetractionYear','Reason']]
crossref_updated = crossref.merge(year_and_reason, how='left', on='DOI')

# Records without a retraction year get 0.
crossref_updated['RetractionYear'] = crossref_updated['RetractionYear'].fillna(0).astype(int)

# Saving is opt-in; uncomment to write the file:
# crossref_updated.to_csv(data_dir+f'crossref/crossref_retractionyear_'+getdate['crossref']+'.csv')
crossref_updated


In [None]:
# Print dtypes and non-null counts for every column of crossref_updated.
crossref_updated.info()

# Retraction Watch

In [None]:
# Read the Retraction Watch export and rename the original-paper columns.
rw_csv = f"{data_dir}retractionwatch/retractionwatch_{getdate['retractionwatch']}.csv"
rw_column_map = {
    'OriginalPaperDOI': 'DOI',
    'OriginalPaperPubMedID': 'PubMedID',
    'OriginalPaperDate': 'Year',
}
retractionwatch = pd.read_csv(rw_csv, encoding='latin1').rename(columns=rw_column_map)

# PMID columns become text: missing values turn into '' and the rest into integer text.
original_pmid = retractionwatch['PubMedID'].fillna(0).astype(int)
retractionwatch['PubMedID'] = original_pmid.replace(0, '').astype(str).str.strip()

notice_pmid = retractionwatch['RetractionPubMedID'].fillna(0).astype(int)
retractionwatch['RetractionPubMedID'] = notice_pmid.replace(0, '').astype(str)

# Reduce both dates to their year; a missing retraction date becomes 0.
retractionwatch['Year'] = pd.to_datetime(retractionwatch['Year'], exact=False).dt.year
retraction_dates = pd.to_datetime(retractionwatch['RetractionDate'], exact=False)
retractionwatch['RetractionYear'] = retraction_dates.dt.strftime("%Y").fillna(0).astype(int)

retractionwatch.head()

### Update the Unionlist with RetractionYear Values from Data Sources: 
####   Retraction Watch, PubMed, BCI, BIOABS, CCC, Web of Science Core, and Crossref

In [None]:
def update_Unionlist_retractionyear(df_source: pd.DataFrame, col_name: str):
    """
    It updates the unionlist with retraction year from PubMed, BCI, BIOABS, Web of Science Core,
    and Crossref. Retraction Watch does not use this function.

    Works on the module-level DataFrame ``unionlist_updated`` (modified in
    place): every row whose 'RetractionYear' is still 0 is looked up in
    ``df_source`` by ``col_name`` and, when found, takes the 'RetractionYear'
    of the first matching source row. Rows with a blank identifier are never
    matched. Any ``col_name`` other than 'PubMedID' or 'DOI' leaves the years
    untouched.

    :param df_source: The DataFrame to use to update unionlist with Retractionyear
    :param col_name: the column ID to use as reference i.e PubMedID/DOI
    :return: the updated unionlist with retraction year assigned
    """

    unionlist_updated['RetractionYear']= unionlist_updated['RetractionYear'].fillna(0).astype(int)

    if col_name in ('PubMedID', 'DOI'):
        # Identifier -> retraction year, built once; setdefault keeps the first
        # source row for an identifier, as the per-row lookup used to do.
        source_years = {}
        for source_id, source_year in zip(df_source[col_name], df_source['RetractionYear']):
            source_years.setdefault(source_id, source_year)

        # Columns are read by label rather than by position, so the result does
        # not depend on the column order of the unionlist.
        for index, row in unionlist_updated.iterrows():
            if row['RetractionYear'] != 0:
                continue

            item_id = row[col_name].strip()
            if item_id and item_id in source_years:
                unionlist_updated.at[index, 'RetractionYear'] = source_years[item_id]

    without_rn_year = len(unionlist_updated[unionlist_updated['RetractionYear']==0])

    print(f'While the total items with no retraction year is {without_rn_year} in the unionlist')

    return unionlist_updated

In [None]:
"""
Loading the unionlist
"""
# Read the unionlist and drop the saved index column.
unionlist_csv = data_dir +'unionlist/unionlist_with_nodoi_'+getdate['unionlist']+'.csv'
unionlist = pd.read_csv(unionlist_csv, encoding='utf-8').drop(columns='Unnamed: 0')

# PubMedID becomes text: missing values turn into '' and the rest into integer text.
unionlist_pmid = unionlist['PubMedID'].fillna(0).astype(int)
unionlist['PubMedID'] = unionlist_pmid.replace(0, '').astype(str)

unionlist

In [None]:
"""
Confirm no duplicate DOIs in unionlist
"""
# Report the number of distinct DOIs next to the row count; the bare len(...)
# expression used before was evaluated but never shown.
print(f"Unique DOIs in unionlist: {len(set(unionlist['DOI']))} of {len(unionlist)} rows")

# keep=False marks every row whose DOI occurs more than once. The previous
# spelling `duplicated('keep'==False)` only worked because the comparison
# evaluates to False and was passed positionally as `keep`.
unionlist[unionlist['DOI'].duplicated(keep=False)]

In [None]:
"""
First Pass:
Updating the unionlist with RetractionYear from Retraction Watch for items with no retraction year
"""

# One Retraction Watch row per DOI (the last one wins), then left-join its
# RetractionYear onto the unionlist.
rw_deduplicated = retractionwatch.drop_duplicates(subset=['DOI'], keep='last')
rw_years = rw_deduplicated[['DOI','RetractionYear']]
unionlist_updated = unionlist.merge(rw_years, how='left', on='DOI')

# Count matched and unmatched items before the gaps are filled with 0.
has_rn_year = unionlist_updated['RetractionYear'].notna()
with_rn_year1 = int(has_rn_year.sum())
without_rn_year1 = int((~has_rn_year).sum())

unionlist_updated['RetractionYear'] = unionlist_updated['RetractionYear'].fillna(0.0).astype(int)

print(f'In First Pass: \nUsing Retraction Watch: the total items with retraction year is {with_rn_year1} ')

print(f'While the total items with no retraction year in the unionlist is {without_rn_year1}  ')

unionlist_updated#.head()

In [None]:
"""
Second Pass:
Updating the unionlist with RetractionYear from PubMed for items with no retraction year after first pass
"""
# Progress message for this pass.
print(f'In Second Pass: Update the remaining items in the Unionlist with retraction year from PubMed')

# Matches on PubMedID; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it, so the
# DataFrame is shown as this cell's output.
update_Unionlist_retractionyear(pubmed_updated,'PubMedID') #.head()


In [None]:
"""
Third Pass:
Updating the unionlist with RetractionYear from BCI for items with no retraction year after second pass
"""
# Progress message for this pass.
print(f'In Third Pass: Update the remaining items in the Unionlist with retraction year from BCI')

# Matches on DOI; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it.
update_Unionlist_retractionyear(bci_updated,'DOI') #.head()

In [None]:
"""
Fourth Pass:
Updating the unionlist with RetractionYear from BIOABS for items with no retraction year after third pass
"""
# Progress message for this pass.
print(f'In Fourth Pass: Update the remaining items in the Unionlist with retraction year from BIOABS')

# Matches on DOI; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it.
update_Unionlist_retractionyear(bioabs_updated,'DOI') #.head()

In [None]:
"""
Fifth Pass:
Updating the unionlist with RetractionYear from CCC for items with no retraction year after fourth pass
"""
# Progress message for this pass.
print(f'In Fifth Pass: Update the remaining items in the Unionlist with retraction year from CCC')

# Matches on DOI; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it.
update_Unionlist_retractionyear(ccc_updated,'DOI') #.head()

In [None]:
"""
Sixth Pass:
Updating the unionlist with RetractionYear from Web of Science Core for items with no retraction year after fifth pass
"""
# Progress message for this pass.
print(f'In Sixth Pass: Update the remaining items in the Unionlist with retraction year from Web of Science Core')

# Matches on DOI; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it.
update_Unionlist_retractionyear(woscore_updated,'DOI') #.head()

In [None]:
"""
Seventh Pass:
Updating the unionlist with RetractionYear from Crossref for items with no retraction year after sixth pass
"""
# Progress message for this pass.
print(f'In Seventh Pass: Update the remaining items in the Unionlist with retraction year from Crossref')

# Matches on DOI; only rows whose RetractionYear is still 0 are considered.
# The function updates the global unionlist_updated and returns it.
update_Unionlist_retractionyear(crossref_updated,'DOI')#.head()

In [None]:
"""
Save The update Unionlist with RetractionYear

ToDo:
Uncomment (line 8): ".to_csv(data_dir+'unionlist/...." to save items in union list with retraction year
"""
# Saving is disabled as written: the continuation line below is a comment, so
# this cell only evaluates unionlist_updated. Uncomment the .to_csv(...) line
# to write the union list with retraction years to disk.
unionlist_updated\
#     .to_csv(data_dir+'unionlist/unionlist_with_retractionyear_'+getdate['unionlist']+'.csv')