### This process takes the PubMedIDs of items that have no DOI in a source and searches the Unionlist, Web of Knowledge, and Scopus with them to update the DOI of the affected items.
- BCI, BIOABS, CCC, Medline, PubMed, Scopus, and Web of Science Core are the databases with PubMedIDs to which this approach is applicable

####  Input File:  
   - Union list of retracted publications
       - unionlist/unionlist_{date}.csv (from Step 1) <br> <br>
  
   - Retracted publication records with "recordsnodoi_" from the above sources:  
       - pubmed/pubmed_recordsnodoi_{date}.csv
       - webofscience/bci_recordsnodoi_{date}.csv
       - webofscience/bioabs_recordsnodoi_{date}.csv
       - webofscience/ccc_recordsnodoi_{date}.csv
       - webofscience/medline_recordsnodoi_{date}.csv
       - webofscience/webofsciencecore_recordsnodoi_{date}.csv
       - scopus/scopus_recordsnodoi_{date}.csv
       - retractionwatch/retractionwatch_recordsnodoi_{date}.csv
      <br><br>
   - Overview result from Step 1
        - result/datasources_overview.csv

#### Output File: 
   - unionlist/unionlist_with_nodoi_{date}.csv
   - unionlist/unionlist_duplicated_pmids_{date}.csv
   - overview table results



In [None]:
import re
import time,datetime

import os
import csv
import numpy as np
import unicodedata
import json

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import requests
import time,datetime

In [None]:
"""
Targeting the retraction_index_path
"""

retraction_index_path = os.path.abspath('./.')
retraction_index_path

data_dir = retraction_index_path+'/data/' # data directory
result_dir = retraction_index_path+'/result/'


### Configuration File

In [None]:
# Load configuration. The context manager closes the file handle even when
# json.load raises (e.g. on malformed JSON), which the manual open/close did not.
with open(retraction_index_path+"/config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)

# Values read from the configuration file (a KeyError here means the key is
# missing from config.json)
my_email = config['my_email']
elsevier_api_key = config['Elsevier_APIKEY']
elsevier_insttoken = config['insttoken']
ieee_xplore_api_key = config['IEEEXplore_APIKEY']
wos_api_key = config['WoS_APIKEY']
ads_api_key= config['ADS']

In [None]:
def convert_unicode(string: str) -> str:
    """
    Strip characters that cannot be represented in a single-byte Western encoding.

    The text is NFKD-normalised and then round-tripped through 'iso-8859-1',
    'latin1' and 'cp1252' in turn; any character an encoding cannot represent
    is silently dropped.
    E.g. '10.\u200b1105/\u200btpc.\u200b010357' ->  '10.1105/tpc.010357'

    :param string: the text to clean
    :return: the text without the characters that could not be encoded
    """
    for codec in ('iso-8859-1', 'latin1', 'cp1252'):
        decomposed = unicodedata.normalize('NFKD', string)
        string = decomposed.encode(codec, 'ignore').decode(codec)
    return string


In [None]:
def batch_pmids(pmids:list, cut:int)-> list[list]:
    """
    It divides the list pmids into consecutive batches for processing.

    :param pmids: list of pmids
    :param cut: maximum number of records to assign to a batch (must be >= 1)

    :return: list of batches of pmids; every batch holds `cut` items except
             possibly the last one. An empty input gives an empty list.
    :raises ValueError: if `cut` is smaller than 1 (the batching could never
             make progress with such a value)
    """
    if cut < 1:
        raise ValueError(f'cut must be a positive integer, got {cut!r}')

    # Step through the list in strides of `cut`; slicing clamps the last batch.
    return [pmids[start:start + cut] for start in range(0, len(pmids), cut)]

In [None]:
"""
Input the date you retrieve retraction publications for each database

Update the date for each database format: YYYY-MM-DD e.g. 2024-02-13
"""

getdate = {'scopus': '2024-07-05',
            'crossref':'2024-07-03',
            'retractionwatch': '2024-07-03',
            'pubmed': '2024-07-03',
            'geobase': '2024-07-05',
            'compendex': '2024-07-09',
                
            'bci': '2024-07-03',
            'bioabs': '2024-07-03',
            'ccc': '2024-07-03',
            'medline': '2024-07-03',
            'webofsciencecore': '2024-07-03',
           
            'unionlist':'2024-07-09'} 


In [None]:
# Read Union_list
unionlist = pd.read_csv(data_dir+f"unionlist/unionlist_{getdate['unionlist']}.csv").drop('Unnamed: 0',axis=1)
unionlist['PubMedID'] = unionlist['PubMedID'].fillna(0).astype(int).replace(0,'').astype(str)
print('The total number of retracted items is ',len(unionlist))
unionlist.head()

In [None]:
# Check for duplicates within DOI column.
# keep=False marks every member of a duplicated group, so all copies are shown.
unionlist[unionlist['DOI'].duplicated(keep=False)]

## Stage1: Unionlist Update: Update the 'source_name' for Items without DOI but PubMedID found in the Unionlist
-  Update Unionlist: Add ‘source_name' to Items with no DOI that have PubMedID in the Unionlist

In [None]:
"""
Note: It is important to acknowledge that while an item in a source may not have a DOI, the same item may 
have a DOI in other sources. To identify such cases, PubMedID was used. 

To modify the unionlist e.g. for PubMed:
- Identify items with no DOI in PubMed <- pubmed_nodoi
- Find items with no DOI in PubMed in the unionlist using PubMedID <- pubmed_nodoi_pmids_in_unionlist
- Add 'PubMed' to item source(s) in unionlist that shared PubMedID with pubmed_nodoi_pmids_in_unionlist

"""
pass

In [None]:
# Getting items from df that have no DOI but covered in the union_list

unionlist_tempo= unionlist.copy(deep=True) # Snapshot of the Unionlist before the updates; used for verifying update in sources

def update_sourceDOI_withPubMedID(df: pd.DataFrame, db_source: str):
    """
    It updates the source of an item in Unionlist: Items that have no DOI but have PubMedID from another source.
    When the PubMedID matches the Unionlist, `db_source` is appended to the 'source' of the matching
    Unionlist rows, so the item inherits the DOI recorded in the Unionlist.

    Side effects:
      - the global `unionlist` is modified in place ('source' column);
      - the 'DOI' column of `df` is normalised in place (NaN -> '', cast to str);
      - a short summary of the update is printed.

    Rows whose 'source' already contains `db_source` are left untouched, so calling the
    function again for the same source does not append the label a second time.

    :param df: the database that has items without DOI (needs 'DOI' and 'PubMedID' columns)
    :param db_source: the name of the database that will be used in updating the Unionlist

    :return: df_record_updated: dataframe of items updated in the unionlist with the additional database (db_source),
             i.e. rows labelled with `db_source` now but not in the `unionlist_tempo` snapshot
    """

    # Anything that does not start with '10' is treated as "no DOI"
    df['DOI'] = df['DOI'].fillna('').astype(str)
    df_nodoi = df[~df['DOI'].str.startswith('10')]

    # PubMedIDs of the DOI-less items, ignoring blank IDs
    df_pmids = set(df_nodoi.loc[df_nodoi['PubMedID'] != '', 'PubMedID'])

    pmid_in_unionlist = unionlist['PubMedID'].isin(df_pmids)
    df_nodoi_pmids_in_unionlist = list(set(unionlist.loc[pmid_in_unionlist, 'PubMedID']))

    # Rows labelled with db_source before the update, taken from the snapshot.
    # regex=False: the source name is matched literally; na=False: a missing source is "not labelled".
    labelled_before = unionlist_tempo['source'].str.contains(db_source, regex=False, na=False)
    ul_before_loc = unionlist_tempo.index[labelled_before]
    before_count = len(ul_before_loc)

    # Updating the source where PubMedID matches and the label is not there yet
    already_labelled = unionlist['source'].str.contains(db_source, regex=False, na=False)
    unionlist.loc[pmid_in_unionlist & ~already_labelled, 'source'] += '; ' + db_source #'; PubMed'

    # Verify the update: rows labelled with db_source after the update
    labelled_after = unionlist['source'].str.contains(db_source, regex=False, na=False)
    ul_after_loc = unionlist.index[labelled_after]
    after_count = len(ul_after_loc)

    # Rows that gained the label; Index.difference returns them in sorted order
    updated_loc = ul_after_loc.difference(ul_before_loc)
    df_record_updated = unionlist.loc[updated_loc]

    # Report the outcome of the update
    print(f'The total number of PubMedID in {db_source} that matched Unionlist is {len(df_nodoi_pmids_in_unionlist)}')

    print(f"The total number of items with '{db_source}' source before update is {before_count} ")
    print(f"The total number of items with '{db_source}' source after update is {after_count} ")

    print(f"The total number of items updated with '{db_source}' source in the unionlist is  {after_count - before_count} ")

    return df_record_updated

#### PubMed: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in PubMed
"""
   

pubmed_nodoi = pd.read_csv(data_dir+'pubmed/pubmed_recordsnodoi_'+getdate['pubmed']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
pubmed_nodoi['PubMedID']= pubmed_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the Unionlist with PubMed Source that PubMedID matches
"""
pubmed_pmids_in_unionlist= update_sourceDOI_withPubMedID(pubmed_nodoi, 'PubMed')

In [None]:
"""
Verify the update: PubMed vs. Unionlist
"""
pd.set_option('display.max_colwidth', None) # View entire cell value in Jupyter without truncation
print(pubmed_pmids_in_unionlist.iloc[0]['PubMedID'])
pubmed_nodoi[pubmed_nodoi['PubMedID'].str.contains(pubmed_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(pubmed_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### BCI: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in BCI
"""

bci_nodoi = pd.read_csv(data_dir+'webofscience/bci_recordsnodoi_'+getdate['bci']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
bci_nodoi['PubMedID']= bci_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


In [None]:
"""
Update Items in the unionlist with BCI Source that PubMedID matches
"""
bci_pmids_in_unionlist= update_sourceDOI_withPubMedID(bci_nodoi, 'BCI')

In [None]:
"""
Verify the update: BCI vs. Unionlist
"""
print(bci_pmids_in_unionlist.iloc[0]['PubMedID'])
bci_nodoi[bci_nodoi['PubMedID'].str.contains(bci_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(bci_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### BIOABS: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in BIOABS
"""

bioabs_nodoi = pd.read_csv(data_dir+'webofscience/bioabs_recordsnodoi_'+getdate['bioabs']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
bioabs_nodoi['PubMedID']= bioabs_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with BIOABS Source that PubMedID matches
"""
bioabs_pmids_in_unionlist= update_sourceDOI_withPubMedID(bioabs_nodoi, 'BIOABS')

In [None]:
"""
Verify the update: BIOABS vs. Unionlist
"""
print(bioabs_pmids_in_unionlist.iloc[0]['PubMedID'])
bioabs_nodoi[bioabs_nodoi['PubMedID'].str.contains(bioabs_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(bioabs_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### CCC: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in CCC
"""

ccc_nodoi = pd.read_csv(data_dir+'webofscience/ccc_recordsnodoi_'+getdate['ccc']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
ccc_nodoi['PubMedID']= ccc_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with CCC Source that PubMedID matches
"""
ccc_pmids_in_unionlist= update_sourceDOI_withPubMedID(ccc_nodoi, 'CCC')

In [None]:
"""
Verify the update: : CCC vs. Unionlist
"""
print(ccc_pmids_in_unionlist.iloc[0]['PubMedID'])
ccc_nodoi[ccc_nodoi['PubMedID'].str.contains(ccc_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(ccc_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### Medline: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in Medline
"""

medline_nodoi = pd.read_csv(data_dir+'webofscience/medline_recordsnodoi_'+getdate['medline']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
medline_nodoi['PubMedID']= medline_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with Medline Source that PubMedID matches
"""
medline_pmids_in_unionlist= update_sourceDOI_withPubMedID(medline_nodoi, 'Medline')

In [None]:
"""
Verify the update: : Medline vs. Unionlist
"""
print(medline_pmids_in_unionlist.iloc[0]['PubMedID'])
medline_nodoi[medline_nodoi['PubMedID'].str.contains(medline_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(medline_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### Web of Science Core: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in Web of Science Core
"""

webofsciencecore_nodoi = pd.read_csv(data_dir+'webofscience/webofsciencecore_recordsnodoi_'+getdate['webofsciencecore']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
webofsciencecore_nodoi['PubMedID']= webofsciencecore_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with Web of Science Core source that PubMedID matches
"""
webofsciencecore_pmids_in_unionlist= update_sourceDOI_withPubMedID(webofsciencecore_nodoi, 'WoS_Core')

In [None]:
"""
Verify the update: Web of Science Core vs. Unionlist
"""
print(webofsciencecore_pmids_in_unionlist.iloc[0]['PubMedID'])
webofsciencecore_nodoi[webofsciencecore_nodoi['PubMedID'].str.contains(webofsciencecore_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(webofsciencecore_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### Scopus: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in Scopus
"""   

scopus_nodoi = pd.read_csv(data_dir+'scopus/scopus_recordsnodoi_'+getdate['scopus']+'.csv')

# Remove decimal part of 'PubMedID' incase it has one
scopus_nodoi['PubMedID']= scopus_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with Scopus source that PubMedID matches
"""
scopus_pmids_in_unionlist= update_sourceDOI_withPubMedID(scopus_nodoi, 'Scopus')

In [None]:
"""
Verify the update: Scopus vs. Unionlist
"""
print(scopus_pmids_in_unionlist.iloc[0]['PubMedID'])
scopus_nodoi[scopus_nodoi['PubMedID'].str.contains(scopus_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(scopus_pmids_in_unionlist.iloc[0]['PubMedID'])]

#### Retraction Watch: Matching up Items with No DOI using PubMedID & Updating the Unionlist

In [None]:
"""
Loading items with PubMedIDs but No DOI in Retraction Watch
"""
   

retractionwatch_nodoi = pd.read_csv(data_dir+'retractionwatch/retractionwatch_recordsnodoi_'+getdate['retractionwatch']+'.csv')

# Remove decimal part of 'PubMedID' in case it has one
retractionwatch_nodoi['PubMedID']= retractionwatch_nodoi['PubMedID'].fillna('').astype(str).str.split('.').str[0]


"""
Update Items in the unionlist with Retraction Watch source that PubMedID matches
"""
retractionwatch_pmids_in_unionlist= update_sourceDOI_withPubMedID(retractionwatch_nodoi, 'Retraction Watch')

In [None]:
"""
Verify the update: Retraction Watch vs. Unionlist
"""
print(retractionwatch_pmids_in_unionlist.iloc[0]['PubMedID'])
retractionwatch_nodoi[retractionwatch_nodoi['PubMedID'].str.contains(retractionwatch_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Checking the Update in Unionlist
unionlist[unionlist['PubMedID'].str.contains(retractionwatch_pmids_in_unionlist.iloc[0]['PubMedID'])]

In [None]:
# Check for duplication of DOI
# If a non-empty dataframe is returned, something has gone wrong
# keep=False marks every member of a duplicated group, so all copies are shown.
unionlist[unionlist['DOI'].duplicated(keep=False)]

In [None]:
"""
Compendex, Crossref, and GEOBASE do not index items with PubMedIDs
"""
pass

In [None]:
"""
Total DOIs updated using PubMedID in the Unionlist
"""

with_doi_update= [len(bci_pmids_in_unionlist),len(bioabs_pmids_in_unionlist), len(ccc_pmids_in_unionlist),0,0,0,
len(medline_pmids_in_unionlist), len(pubmed_pmids_in_unionlist), len(retractionwatch_pmids_in_unionlist), 
len(scopus_pmids_in_unionlist), len(webofsciencecore_pmids_in_unionlist)]


overview2= pd.DataFrame()
 
overview2['source']= ['BCI', 'BIOABS','CCC','Compendex','Crossref', 'GEOBASE', 'Medline', 'PubMed','Retraction Watch','Scopus','Web of Science Core']

overview2['DOI_updated_withPubMedID']=with_doi_update
overview2

In [None]:
overview2['DOI_updated_withPubMedID'].sum()

## Stage2:  Match the Titles from Unionlist and the Titles from PubMed Items (without DOI)

In [None]:
def clean_title(text: str)-> str:
    """
    It removes the special characters "[:.{}/-]" and a leading retraction phrase
    ('retracted to', 'retracted article', 'retracted', 'retraction to') from the text.

    The phrase is matched in lower case only and only at the very start of the
    text, so callers are expected to lower-case the title first. Surrounding
    whitespace is stripped both before and after the phrase is removed, so that
    e.g. 'retracted: some title' and 'some title' clean to the same string.

    :param text: the input text
    :return: the cleaned input text
    """

    remove_list= '(retracted to|retracted article|retracted|retraction to)'

    text= text.strip()
    text= re.sub(r'[:.{}/-]', '', text)
    text= re.sub(r'^'+remove_list, '', text)
    # Dropping the leading phrase leaves the separating blank behind; remove it
    # so that exact comparisons of cleaned titles are not defeated by it.
    return text.strip()


In [None]:
def compare_names(au_doi:str, au_nodoi:str)->str:
    """
    It counts how many authors listed in 'au_nodoi' can be found in 'au_doi'.

    'au_nodoi' is split on ';' and the last whitespace-separated token of each
    entry is taken as the surname; a surname counts as matched when it occurs
    anywhere in 'au_doi' (substring test). Blank entries (e.g. produced by a
    trailing ';') are ignored.

    :au_doi: authors in unionlist
    :au_nodoi: authors in PubMed without DOI
    :return: '<matched>/<number of authors in au_nodoi>', or '0' when either
             argument is not a string (e.g. NaN) or 'au_nodoi' holds no names
    """
    # Missing values arrive as NaN floats; there is nothing to compare then.
    if not isinstance(au_doi, str) or not isinstance(au_nodoi, str):
        return '0'

    # Surname = last token of every non-blank ';'-separated entry
    surnames = [name.split()[-1] for name in au_nodoi.split(';') if name.strip()]
    if not surnames:
        return '0'

    count = sum(1 for surname in surnames if surname in au_doi)
    return f'{count}/{len(surnames)}'
                        
    

In [None]:
def compare(doi_item, nodoi_item)-> bool:
    """
    Tell whether two items are the same. The items of interest are either
    venues (strings) or publication years (integers).

    Two strings are equal when they match after stripping surrounding
    whitespace, deleting the characters ':.{}/-' and ignoring case.
    Two ints are equal when their values are equal.
    Any other combination of types is reported as different.

    :doi_item: first item
    :nodoi_item: second item
    :return: True (Yes,  same) or False (No, different)
    """
    # Venue comparison
    if isinstance(doi_item, str) and isinstance(nodoi_item, str):
        left, right = (
            re.sub(r'[:.{}/-]', '', value.strip()).lower()
            for value in (doi_item, nodoi_item)
        )
        return left == right

    # Publication year comparison
    if isinstance(doi_item, int) and isinstance(nodoi_item, int):
        return doi_item == nodoi_item

    return False
    

In [None]:
# Cater for title with NA in the formation of unionlist
unionlist[unionlist['Title'].isna()]

In [None]:
unionlist['Title']= unionlist['Title'].fillna('') 

In [None]:
# Cater for PubMed items without DOI that has title with NA
pubmed_nodoi[pubmed_nodoi['Title'].isna()]

In [None]:
pubmed_nodoi['Title']= pubmed_nodoi['Title'].fillna('')

In [None]:
unionlist.info()

In [None]:
"""
Unionlist: Clean the Title column & Filtered for items that are not in PubMed
"""

unionlist['clean_title']= unionlist['Title'].astype(str)
unionlist['clean_title']= unionlist['clean_title'].str.lower().apply(clean_title)

unionlist_filtered= unionlist[~unionlist['source'].str.contains('PubMed')].copy(deep=True)
unionlist_filtered.fillna('', inplace=True)

#unionlist_filtered['Author'].isna().sum()

In [None]:
"""
PubMed_NoDOI: Preprocessing
"""

# Lower-cased, cleaned copy of the titles in a new column; 'Title' itself is left as is
pubmed_nodoi['clean_title']= pubmed_nodoi['Title'].str.lower().apply(clean_title)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form acts on a temporary object and
# does not reliably update the DataFrame.
pubmed_nodoi['Author']= pubmed_nodoi['Author'].fillna('')


In [None]:
pubmed_nodoi[pubmed_nodoi['clean_title'].isin(unionlist_filtered['clean_title'])]

In [None]:
unionlist_filtered

In [None]:
"""
Matching items from Unionlist and PubMed (without DOI)
"""

compare_result= []
nodoi_title= pubmed_nodoi['clean_title'].tolist()

pubmed_nodoi_indexes = list(pubmed_nodoi.index)

for row in unionlist_filtered.iterrows():
    idx_doi, record = row

    # The fields are still read by column position, but explicitly through
    # .iloc: plain integer keys on a label-indexed Series depend on a
    # deprecated positional fallback.
    # NOTE(review): positions 1/3/4/6 are presumably Author/Year/Journal/PubMedID
    # and the last column the cleaned title -- confirm against the Unionlist column order.
    au_name_doi= record.iloc[1]
    year_doi= int(record.iloc[3])
    venue_doi= record.iloc[4]
    pmid_doi= str(record.iloc[6])
    unionlist_clean_title= record.iloc[-1]

    for idx_nodoi, title in zip(pubmed_nodoi_indexes, nodoi_title):
        # A candidate pair needs a non-empty PubMed title that is contained
        # in the Unionlist title (substring test).
        if title == '' or title not in unionlist_clean_title:
            continue

        nodoi_record= pubmed_nodoi.loc[idx_nodoi]

        # Compare PubMedID
        ans_pmid= compare(pmid_doi, str(nodoi_record['PubMedID']))

        # Compares year
        ans_year= compare(year_doi, int(nodoi_record['Year']))

        # Compares venue
        ans_venue= compare(venue_doi, nodoi_record['Journal'])

        # Compares author
        ans_author= compare_names(au_name_doi, nodoi_record['Author'])

        compare_result.append({'doi_loc':idx_doi, 'nodoi_loc':idx_nodoi,
                               'compared_pmid':ans_pmid,
                               'compared_year': ans_year, 'compared_venue': ans_venue,
                               'compared_author': ans_author
                              })

In [None]:
pd.DataFrame(compare_result)

####  Title Match: Manually examine the records of the Unionlist (doi_loc) vs PubMed with no DOI (nodoi_loc)
##### Manual inspection
Comparing the results (i.e., where the titles match) from the Unionlist and PubMed with no DOI:
 - Column 'doi_loc' represent Unionlist location
 - Column 'nodoi_loc' represent PubMed NoDOI location
 

In [None]:
unionlist.loc[[82126]]

In [None]:
pubmed_nodoi.loc[[221]]

In [None]:
"""
Result of manual inspection:

Unionlist vs PubMed NoDOI
unionlist.loc[[6131]] (10.1007/s10059-013-3067-1) vs. pubmed_nodoi.loc[[331]] (PMID: 11266114): Retraction notice vs. retracted item
unionlist.loc[[6132]] (10.1007/s10059-013-3068-0) vs. pubmed_nodoi.loc[[321]] (PMID: 11710530): Retraction notice vs. retracted item
unionlist.loc[[6133]] (10.1007/s10059-013-3069-z) vs. pubmed_nodoi.loc[[307]] (PMID: 12243356): Retraction notice vs. retracted item
unionlist.loc[[6134]] (10.1007/s10059-013-3151-6) vs. pubmed_nodoi.loc[[209]] (PMID: 17202861): Retraction notice vs. retracted item
unionlist.loc[[8527]] (10.1007/s12038-020-00116-4) vs. pubmed_nodoi.loc[[698]] (PMID: 32098911): Retraction notice vs. retracted item
unionlist.loc[[46106]] (10.1136/bjsports-39-11-786ret) vs. pubmed_nodoi.loc[[242]] (PMID: 16244183): Retraction notice vs. retracted item
unionlist.loc[[46107]] (10.1136/bjsports-39-5-249ret) vs. pubmed_nodoi.loc[[258]] (PMID: 15849282): Retraction notice vs. retracted item
unionlist.loc[[46110]] (10.1136/bjsports-40-5-377ret) vs. pubmed_nodoi.loc[[228]] (PMID: 16632562): Retraction notice vs. retracted item
- All of the above DOIs of retraction notices appear only in Crossref.

unionlist.loc[[45521]] (10.1126/science.288.5475.2338; PMID: 17769842, PMID not noted as retracted on PubMed) vs.
pubmed_nodoi.loc[[339]] (PMID: 10875912, noted as retracted on PubMed): 
Same article & journal published in different databases

unionlist.loc[[82126]] (10.3727/000000006783981909; PMID: 28863745, not noted as retracted on PubMed) vs.
pubmed_nodoi.loc[[221]] (PMID: 16898225, noted as retracted on PubMed):
Same article & journal published in different databases

If PMID is different but metadata are equal, nothing will be updated in the unionlist. 

No DOIs PMIDs are updated.
"""
pass

## Stage2:  Search items in PubMed (without DOI) in Web of Science

In [None]:
def wos_get_retracted_pubs_via_PubMedID(PubmedIDs: str):
    """
    It searches the given PMID query in Web of Knowledge (WoK) through the
    Web of Science Starter API and returns the metadata of the hits.

    The query is sent with db='WOK' and the result pages (50 records each) are
    requested one after another until the reported total is covered. If a
    request does not return HTTP 200, the status code is printed and the
    records collected so far are returned.

    API documentation:
    https://developer.clarivate.com./apis/woslite
    #http://help.incites.clarivate.com/wosWebServicesLite/WebServicesLiteOverviewGroup/Introduction.html

    Allowable databases:
    ARCI,BCI,BIOABS,BIOSIS,CABI,CCC,CSCD,DIIDW,DRCI,FSTA,INSPEC,KJD,MEDLINE,PPRN,RSCI,SCIELO,WOK,WOS,ZOOREC"

    The API key is read from the module-level `wos_api_key`.

    :param PubmedIDs: the search query, e.g. "PMID = '22508774' OR PMID = '16087821'"
    :return: list of records as produced by wos_metadata_for_PubMedID
    :raises requests.exceptions.RequestException: on connection problems or
            when the server does not answer within the timeout
    """

    # Base URL of the WoS Starter API
    base_url = 'https://api.clarivate.com/apis/wos-starter/v1/documents'

    page_limit = 50        # maximum number of results on a page
    request_timeout = 60   # seconds; without a timeout a stalled connection blocks forever

    # Request headers carrying the API key
    headers = {
    'X-ApiKey': wos_api_key,
    'charset': 'UTF-8',
    'Encoding': 'UTF-8'
    }

    all_results = []
    total_counts = 0
    page_no = 1    # Starting index of the results (page)

    # While loop for pagination
    while True:

        params = {
                'db': 'WOK', # WOK - retrieves items for all the databases
                'q': PubmedIDs,
                'limit': page_limit,
                'page': page_no }

        response = requests.get(
                base_url,
                headers=headers,
                params=params,
                timeout=request_timeout)

        if response.status_code != 200:
            # If the request was not successful, print the error message
            print(f"Request failed with status code: {response.status_code}")
            break

        # Extract the response content as JSON
        data = response.json()
        publications = data['hits']

        all_results.extend(wos_metadata_for_PubMedID(publications))
        total_counts += len(publications) # counting total number of items retrieved so far

        # Number of pages needed for the reported total (ceiling division)
        total = data['metadata']['total']
        page_size = -(-total // page_limit)

        page_no += 1 # next paging
        if page_no > page_size:
            break

    print(f'The total number of items searched from Web of Science is {total_counts}')

    return all_results

In [None]:
def wos_metadata_for_PubMedID(publications:list)->list:
    """
    It extracts data from the output of Web of Science metadata

    Each record is a list [DOI, pmid, title, year, month, authors, journal].
    Fields missing from a hit are returned as ''. A hit without identifiers
    gets the DOI placeholder 'NoDOI' and an empty pmid.

    :publications: result from the Web of Science API request (list of dicts)
    :return: list of records from Web of Science
    """

    results = []
    for publication in publications:

        title= publication.get('title','')
        authors = publication.get('names', '')

        identifiers = publication.get('identifiers')
        if identifiers:
            DOI= identifiers.get('doi','')
            pmid = identifiers.get('pmid','')
        else:
            DOI= 'NoDOI'
            # Reset explicitly: otherwise the PMID of the previous hit would be
            # reused (or the name would be undefined on the first hit).
            pmid = ''

        journal, year, month = '','',''
        source = publication.get('source')
        if source:
            journal = source.get('sourceTitle', '')
            year = source.get('publishYear','')
            month = source.get('publishMonth','')

        results.append([DOI, pmid, title,year,month,authors,journal]) #authors_names
    return results

In [None]:
def format_pmid_wos(PubMedIDs: list)->str:
    """
    Build the Web of Science search query for a list of PMIDs.

    Every PMID becomes the clause  PMID = '<pmid>'  and the clauses are joined
    with ' OR ', e.g. ['22508774', '16087821'] ->
    "PMID = '22508774' OR PMID = '16087821'". An empty list gives ''.

    PubMedIDs: list of PMIDs
    return: the query string that WoS can process
    """
    clauses = []
    for pmid in PubMedIDs:
        clauses.append(f"PMID = '{pmid}'")

    return " OR ".join(clauses)
    

In [None]:
def validate_record_pubmed(df, pubmed_nodoi):
    """
    It compares results from other sources with PubMed_NoDOI in terms of year, author, venue

    For every PubMedID of `df` that also occurs in `pubmed_nodoi`, the first
    row carrying that PubMedID in each frame is compared with `compare`
    (year, journal) and `compare_names` (authors). The outcome is printed and
    collected. Neither DataFrame is modified.

    param df: the DataFrame we want to compare (columns PubMedID, Year, Journal, Author)
    pubmed_nodoi: the DataFrame of pubmed_nodoi (same columns)
    return: list of [pmid, year_check, venue_check, author_check]
    """

    result=[]

    # First row label per PubMedID, built once instead of filtering the whole
    # frame again for every PubMedID.
    df_first = {}
    for label, pmid in zip(df.index, df['PubMedID']):
        df_first.setdefault(pmid, label)

    pubmed_first = {}
    for label, pmid in zip(pubmed_nodoi.index, pubmed_nodoi['PubMedID']):
        pubmed_first.setdefault(pmid, label)

    for pmid in df['PubMedID'].to_list():
        if pmid not in pubmed_first:
            continue

        df_row = df.loc[df_first[pmid]]
        pubmed_row = pubmed_nodoi.loc[pubmed_first[pmid]]

        ans_year= compare(int(df_row['Year']), int(pubmed_row['Year']))
        # Compares venue
        ans_venue= compare(df_row['Journal'], pubmed_row['Journal'])

        # The author value of `df` may not be a string, so compare its text form
        ans_author= compare_names(str(df_row['Author']), pubmed_row['Author'])

        print(pmid,ans_year,ans_venue,ans_author)

        result.append([pmid,ans_year,ans_venue,ans_author])

    return result

In [None]:
def update_DOI(pmids: list, replacing_df, original_df):
    """
    It updates the DOI of a given DataFrame from (original_df) with DOI from (replacing_df)

    For each PubMedID, the DOI of the first matching row of `replacing_df` is
    written into the first matching row of `original_df` (modified in place).
    A PubMedID that is missing from either DataFrame is skipped and reported,
    and the printed total counts only the rows that were really updated.

    'pmids': list of PubMedIDs to work with
    'replacing_df': DataFrame that its DOI is replacing
    'original_df': DataFrame that has no DOI - that will be replaced
    """

    updated = 0
    skipped = []

    for pmid in pmids:
        replacing_rows = replacing_df.index[replacing_df['PubMedID'] == pmid]
        original_rows = original_df.index[original_df['PubMedID'] == pmid]

        # Nothing to copy from, or nowhere to copy to
        if len(replacing_rows) == 0 or len(original_rows) == 0:
            skipped.append(pmid)
            continue

        original_df.loc[original_rows[0], 'DOI'] = replacing_df.loc[replacing_rows[0], 'DOI']
        updated += 1

    if skipped:
        print(f'PubMedIDs skipped because they are missing from one of the DataFrames: {skipped}')

    print(f'The total results updated is {updated}')

In [None]:
"""
Filter PubMed items with NoDOI that have been resolved updated in the Unionlist
"""
# True for PubMed no-DOI records whose PubMedID already appears in the Unionlist
resolved_in_stage1= pubmed_nodoi['PubMedID'].isin(unionlist['PubMedID'])

filtered_pmid_in_stage1= len(pubmed_nodoi[resolved_in_stage1])

print(f'The total number of records in PubMed resolved in Stage 1 is {filtered_pmid_in_stage1}')

# Records still without a DOI; these are the ones searched in WoK
pubmed_nodoi2= pubmed_nodoi[~resolved_in_stage1]

print(f'The total number of records left unresolved, that need to be checked in WoK is {len(pubmed_nodoi2)}')

In [None]:
# convert  from  int64 to object to match with PubMedIDs of unionlist
pubmed_nodoi['PubMedID']= pubmed_nodoi['PubMedID'].astype(str) 

In [None]:
"""
Search for the DOIs of the PubMed items without DOI in Web of Knowledge (i.e. All Databases
in Web of Science) using their PubMedID
"""

# Put the PubMedIDs of the PubMed items with no DOI in batches of 20 items per search

pubmed_nodoi_pmids_batch= batch_pmids(pubmed_nodoi2['PubMedID'].tolist(), 20)

# Accumulates the records returned for every batch
all_result_from_wok=[]

for  pmids_list in pubmed_nodoi_pmids_batch:
    # Format the batch as a WoS query input, run the search and collect its records
    format_wos_pmid_input= format_pmid_wos(pmids_list)
    result_from_wos= wos_get_retracted_pubs_via_PubMedID(format_wos_pmid_input)
    
    all_result_from_wok.extend(result_from_wos)
    
# Reports the number of records collected over all batches
print(f'The total number records searched in WoS is {len(all_result_from_wok)}')

In [None]:
"""
Filter records with DOI from the search of items from the WoK
:output: wok_pubmed_nodoi
"""

wok_columns = ['DOI', 'PubMedID', 'Title', 'Year', 'Month', 'Author', 'Journal']

# All records returned by the WoK search, with or without DOI
wok_pubmed_nodoi_ = pd.DataFrame(all_result_from_wok, columns=wok_columns)

# Keep only the records that carry a DOI. .copy() makes the result an independent
# DataFrame: a 'clean_title' column is assigned to it afterwards, which on a plain
# slice triggers pandas' SettingWithCopyWarning.
wok_pubmed_nodoi = wok_pubmed_nodoi_[wok_pubmed_nodoi_['DOI'] != ''].copy()

print(f'The total records with DOI from Web of Science is {len(wok_pubmed_nodoi)}')
wok_pubmed_nodoi

In [None]:
# Cleaning the Title: lower-case it and pass it through clean_title
wok_pubmed_nodoi.loc[:,'clean_title']= wok_pubmed_nodoi.loc[:,'Title'].str.lower().apply(clean_title)

"""
Cross Matching: Result from WOK (Web of Science - all databases) with the PubMed NoDOI
"""
# One row per validated PubMedID, holding the outcome of the year, venue and author comparison
pubmednodoi_in_wok_check= pd.DataFrame(validate_record_pubmed(wok_pubmed_nodoi,pubmed_nodoi2), columns=['pmid','year_check','venue_check','author_check'])
pubmednodoi_in_wok_check

####  WoK Match: Manually examine the records of WoK result vs PubMed with no DOI

In [None]:
# Below is an example of a prior instance where WoK and PubMed no DOI matched. 
# In the current run, there were no matches.

# Prior example: the PubMed record without DOI that carries PubMedID 11032235
pubmed_nodoi[pubmed_nodoi['PubMedID']=='11032235']

In [None]:
# Prior example: the WoK record (with DOI) that carries the same PubMedID 11032235
wok_pubmed_nodoi[wok_pubmed_nodoi['PubMedID']== '11032235']

In [None]:
# Prior example
"""
Result of manual inspection of PubMedIDs (WoK vs PubMed:):  
3819650,7513776,7706332, 9057630, 17373355, 30074316 and 22059288

Investigated PubMedIDs in PubMed_NoDOI search in Web of Knowledge (that returned DOI)

PubMedID: 11032235 - having same title, venue*, authors (matched all through) - * one venue is in abbreviation
PubMedID: 22059288 - matched all through

Hence the DOIs of 11032235 and 22059288 will be cross-matched against the Unionlist before
updating PubMed_NoDOI, provided no inconsistency is detected

"""
pass

####  WoK_Unionlist Match: Match PubMed_NoDOI PubMedID  that matches WoK, with Unionlist

In [None]:
# Prior example
"""
Cross Matching 2 with Unionlist: Result from WOK (Web of Science - all databases) match the PubMed NoDOI. Now
match if the DOI exist with Unionlist and, if yes, cross-match with the items of the Unionlist in order to avoid
inconsistency in Unionlist

I.e. compare matched_pubmednodoi_in_wok vs unionlist
"""
pass

In [None]:
def validate_withUnionlist(df_, unionlist_df=None):
    """
    It checks the items of a given dataframe (df_) against the Unionlist records that
    share the same DOI.

    Every DOI of df_ that also occurs in the Unionlist yields one result row with the
    outcome of comparing PubMedID, year, venue and author of the two records. When a
    DOI occurs several times, the first row carrying it is used on either side.

    param df_: The dataframe that will be compared with the Unionlist
    param unionlist_df: The Unionlist dataframe; when omitted, the notebook-level
                        `unionlist` is used
    return: DataFrame of the comparison with the following columns:
            DOI, Source_PubMedID, Ulist_PubMedID, pmid_check, year_check, venue_check, author_check
            It has these columns but no rows when no DOI of df_ occurs in the Unionlist.
    """
    result_columns = ['DOI', 'Source_PubMedID', 'Ulist_PubMedID', 'pmid_check',
                      'year_check', 'venue_check', 'author_check']

    if unionlist_df is None:
        unionlist_df = unionlist

    # Set membership keeps the per-DOI lookup cheap
    unionlist_dois = set(unionlist_df['DOI'].tolist())

    result = []
    for doi in df_['DOI'].tolist():
        if doi not in unionlist_dois:
            continue

        # First record carrying this DOI on each side
        source_row = df_[df_['DOI'] == doi].iloc[0]
        ulist_row = unionlist_df[unionlist_df['DOI'] == doi].iloc[0]

        pmid = source_row['PubMedID']
        upmid = ulist_row['PubMedID']

        ans_year = compare(int(source_row['Year']), int(ulist_row['Year']))
        ans_venue = compare(source_row['Journal'], ulist_row['Journal'])
        ans_pmid = compare(int(pmid), int(upmid))
        ans_author = compare_names(str(source_row['Author']), str(ulist_row['Author']))

        result.append([doi, pmid, upmid, ans_pmid, ans_year, ans_venue, ans_author])

    # Built once, after the loop, so that a result always exists - even without any match
    return pd.DataFrame(result, columns=result_columns)
        
        

In [None]:
# Below is an example of a prior instance where WoK and PubMed no DOI matched.
# In the current run, there were no matches between WoK and PubMed no DOI, meaning there are no matches with the Unionlist.
# An UnboundLocalError is returned when running this cell for this instance because there are no DOIs in the variable
# doi_list, meaning the return variable DF_result is never actually created.
"""
Check DOIs in 'matched_pubmednodoi_in_wok' that violate the Unionlist
"""

# WoK records (with DOI) whose PubMedID went through the cross matching with PubMed NoDOI
matched_pubmednodoi_in_wok= wok_pubmed_nodoi[wok_pubmed_nodoi['PubMedID'].isin(pubmednodoi_in_wok_check['pmid'])].copy()

# Compare those records with the Unionlist records that share the same DOI
wok_Ulist_validate= validate_withUnionlist(matched_pubmednodoi_in_wok)
wok_Ulist_validate



In [None]:
# Prior example
"""
Result of manual inspection of PubMedIDs of PubMed_NoDOIs that matched WoK , now compared with the Unionlist 
using DOI, PMID, Year, Venue, & Author

'10.1177/03635465000280052101' : inconsistent
'10.1177/03635465000280052101' ['28064536']: consistent
"""
pass

In [None]:
# Prior example
"""
Obtain the consistent PubMedID in PubMed_NoDOI
"""
# DOIs of the matched WoK records that did not show up in the Unionlist validation
consistent_DOIin_wok = set(matched_pubmednodoi_in_wok['DOI']).difference(set(wok_Ulist_validate['DOI']))

# PubMedIDs of the WoK records that carry one of those DOIs
consistent_PMIDinWokDOI = wok_pubmed_nodoi.loc[wok_pubmed_nodoi['DOI'].isin(consistent_DOIin_wok), 'PubMedID']
print(f"The PubMedIDs that in WoK that their DOIs do not mismatch (violate) with the Unionlist are below:\n{list(consistent_PMIDinWokDOI)} ")

wok_pubmed_nodoi.loc[wok_pubmed_nodoi['DOI'].isin(consistent_DOIin_wok)]

In [None]:
# Prior example
"""
Updating DOI for PubMedIDs: ['22059288'] 
"""
#pubmednodoi_in_wok['pmid'].to_list()

# Write the WoK DOIs of the consistent PubMedIDs into pubmed_nodoi2 (in place)
update_DOI(consistent_PMIDinWokDOI, wok_pubmed_nodoi,pubmed_nodoi2)

In [None]:
# Prior example
# Confirm DOIs for consistent_DOIin_wok are not in Unionlist (an empty table is expected)
unionlist[unionlist['DOI'].isin(consistent_DOIin_wok)]

In [None]:
# Prior example
# Confirm the update: show the PubMed_NoDOI records whose DOI was taken from WoK
pubmed_nodoi2[pubmed_nodoi2['PubMedID'].isin(consistent_PMIDinWokDOI)]
# To also save these records to file, run instead:
# pubmed_nodoi2[pubmed_nodoi2['PubMedID'].isin(consistent_PMIDinWokDOI)].to_csv(data_dir+'pubmed/consistent_pubmedid_foundedwithdoi_inWoK.csv')

## Stage3:  Search items' Title in PubMed (without DOI) in Scopus

In [None]:
def get_scopus_metadata(publications:list)->list:
    """
    It turns the entries of a Scopus API search result into flat metadata rows

    :params publications: entries (dictionaries) of the Scopus API search result
    :return: one list per publication, in the order
             [DOI, eid, title, publication date, authors, affiliations, journal,
              PubMedID, publication type, publication subtype];
             authors is a one-element list holding the comma-joined author names
    """
    def _to_row(record):
        # All author names of the record joined into a single string
        author_names = ",".join([entry.get('authname') for entry in record.get('author', '')])
        return [
            record.get('prism:doi', ''),
            record.get('eid', ''),
            record.get('dc:title', ''),
            record.get('prism:coverDate', ''),
            [author_names],
            record.get('affiliation', ''),
            record.get('prism:publicationName', ['']),
            record.get('pubmed-id', ''),
            record.get('prism:aggregationType', ''),
            record.get('subtypeDescription', ''),
        ]

    return [_to_row(record) for record in publications]


In [None]:
def scopus_record_via_title(title: str):
    """
    Gets the Scopus search results for a given title, using the cursor - pagination approach.
    Check API documentation: https://dev.elsevier.com/documentation/ScopusSearchAPI.wadl

    :param title: title of item to be searched in Scopus
    :return: list of the metadata rows (see get_scopus_metadata) of all result pages;
             the rows collected so far when a request is not answered with status 200
    """
    endpoint = "https://api.elsevier.com/content/search/scopus"

    # The API key and institution token come from the notebook configuration
    request_headers = {
        'X-ELS-APIKey': elsevier_api_key,
        'Accept': 'application/json',
        'X-ELS-Insttoken': elsevier_insttoken
    }

    search_query = f"TITLE({title})"

    collected_rows = []
    items_fetched = 0
    next_cursor = "*"   # "*" requests the first page

    while True:
        response = requests.get(
            endpoint,
            headers=request_headers,
            params={
                'query': search_query,
                'start': 0,
                'cursor': next_cursor,
                'view': 'COMPLETE',
            })

        if response.status_code != 200:
            break

        search_results = response.json()['search-results']

        reported_total = search_results['opensearch:totalResults']
        items_on_page = search_results['opensearch:itemsPerPage']
        page_rows = get_scopus_metadata(search_results['entry'])

        items_fetched += int(items_on_page)
        next_cursor = search_results['cursor']['@next']   # cursor of the following page

        collected_rows.extend(page_rows)

        # Stop as soon as every reported hit has been paged through
        if items_fetched >= int(reported_total):
            break

    return collected_rows

In [None]:
"""
Search the titles of the PubMed_NoDOI items in Scopus
:output: all_result_from_scopus
"""
# Cleaned titles of the PubMed records that are still without DOI
titles= pubmed_nodoi2['clean_title'].to_list()

all_result_from_scopus= []
for title in titles:
    all_result_from_scopus.extend(scopus_record_via_title(title)) # Searching Scopus API
    time.sleep(.005)  # short pause between two consecutive searches
# This can take a number of minutes to process.

In [None]:
# Number of records collected from the Scopus title search
len(all_result_from_scopus)

In [None]:
# Check for any instances where DOI does not exist (is NaN) and PMID exists in Pubmed_NoDOI
pubmed_nodoi2[(pubmed_nodoi2['DOI'].isna())&(~pubmed_nodoi2['PubMedID'].isna())]

In [None]:
# PubMedIDs of the Pubmed_Nodoi records whose DOI is still empty

pubmed_nodoi_pmids2 = pubmed_nodoi2.loc[pubmed_nodoi2['DOI'] == '', 'PubMedID']

print(f'The total number of records left unresolved after Stage 2, that need to be checked in Scopus is {len(pubmed_nodoi_pmids2)}')

In [None]:
# Limit the displayed column width to 50 characters so that long cell values are truncated.
pd.set_option('display.max_colwidth', 50)

In [None]:
"""
Looking up the PubMedIDs from 'all_result_from_scopus' (Scopus) that match PubMedIDs in the PubMed NoDOI

output: scopus_pubmed_nodoi
"""

scopus_header = [
    'DOI', 'Eid', 'Title', 'Publication_date', 'Author', 'Affiliations',
    'Journal', 'PubMedID', 'Pub_type', 'Pub_type2',
]

scopus_pubmed_nodoi_temp = pd.DataFrame(all_result_from_scopus, columns=scopus_header)
scopus_pubmed_nodoi_temp['DOI'] = scopus_pubmed_nodoi_temp['DOI'].fillna('')

# Extract Year from the publication date; a missing date becomes year 0
scopus_pubmed_nodoi_temp['Publication_date'] = pd.to_datetime(scopus_pubmed_nodoi_temp['Publication_date'])
scopus_pubmed_nodoi_temp['Year'] = (
    scopus_pubmed_nodoi_temp['Publication_date'].dt.year.fillna(0).astype(int)
)

# Keep the Scopus records that carry a DOI ...
scopus_pubmed_nodoi_temp = scopus_pubmed_nodoi_temp.loc[scopus_pubmed_nodoi_temp['DOI'] != '']

# ... and, among them, those whose PubMedID is still unresolved in PubMed NoDOI
scopus_pubmed_nodoi = scopus_pubmed_nodoi_temp.loc[scopus_pubmed_nodoi_temp['PubMedID'].isin(pubmed_nodoi_pmids2)]
scopus_pubmed_nodoi

In [None]:
"""
Cross Matching: Result from Scopus with the PubMed NoDOI
Using Author, Venue, Year of publication verification for the matched PubMedIDs i.e. scopus_pubmed_nodoi
"""
# One row per validated PubMedID, holding the outcome of the year, venue and author comparison
pubmednodoi_in_scopus_check= pd.DataFrame(validate_record_pubmed(scopus_pubmed_nodoi, pubmed_nodoi2), columns=['pmid','year_check','venue_check','author_check'])
pubmednodoi_in_scopus_check

####  Scopus Match: Manually examine the records of Scopus result vs PubMed with no DOI

In [None]:
# Remove the column width limit so that the full content of every cell is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Manual inspection: the PubMed NoDOI record with PubMedID 28925478
pubmed_nodoi2[pubmed_nodoi2['PubMedID']=='28925478']

In [None]:
# Manual inspection: the Scopus record with the same PubMedID 28925478
scopus_pubmed_nodoi[scopus_pubmed_nodoi['PubMedID']=='28925478']

In [None]:
# Check the Pubmed_Nodoi records whose PubMedID also occurs in the Scopus result
pubmed_nodoi2[pubmed_nodoi2['PubMedID'].isin(scopus_pubmed_nodoi['PubMedID'])]

In [None]:
"""
Result of manual inspection:

PMID 2480486: all tallies
PMID 7595293: all tallies
PMID 1551544: all tallies
PMID 1669382: all tallies, venue false because of '&' and 'and' difference
PMID 2025077: all tallies
PMID 19395854: all tallies, venue false because PubMed included location in venue (i.e. Cell cycle (Georgetown, Tex.)) and Scopus did not (i.e. Cell Cycle).
PMID 17086017: all tallies
PMID 16898225: all tallies
PMID 11978302: all tallies, authors do not match because PubMed has unknown and Scopus has a committee name (i.e. 'Committee on Practice Bulletins-Gynecology')
PMID 32338687: all tallies
PMID 28925478: all tallies

All the result tallied.

Hence all DOIs will be cross-matched against the Unionlist before updating PubMed_NoDOI, provided no inconsistency is detected

"""
pass

####  Scopus_Unionlist Match: Match PubMed_NoDOI PubMedID  that matches Scopus, with Unionlist

In [None]:
# Limit the displayed column width to 50 characters again so that long cell values are truncated.
pd.set_option('display.max_colwidth', 50)

In [None]:
"""
Cross Matching 2 with Unionlist: Result from Scopus match the PubMed NoDOI. Now
match if the DOI exist with Unionlist and, if yes, cross-match with the items of the Unionlist in order to avoid
inconsistency in Unionlist

I.e compare matched_pubmednodoi_in_scopus vs. unionlist
"""
pass

In [None]:
"""
Check DOIs in 'matched_pubmednodoi_in_scopus' that violate the Unionlist
"""

# Scopus records (with DOI) whose PubMedID went through the cross matching with PubMed NoDOI
matched_pubmednodoi_in_scopus= scopus_pubmed_nodoi[scopus_pubmed_nodoi['PubMedID'].isin(pubmednodoi_in_scopus_check['pmid'])].copy()
# matched_pubmednodoi_in_scopus

# Compare those records with the Unionlist records that share the same DOI
scopus_Ulist_validate= validate_withUnionlist(matched_pubmednodoi_in_scopus)
scopus_Ulist_validate

In [None]:
"""
Obtain the consistent PubMedID in PubMed_NoDOI
"""
# DOIs of the matched Scopus records that did not show up in the Unionlist validation
consistent_DOIin_scopus = set(matched_pubmednodoi_in_scopus['DOI']).difference(set(scopus_Ulist_validate['DOI']))

# PubMedIDs of the Scopus records that carry one of those DOIs
consistent_PMIDinScopusDOI = scopus_pubmed_nodoi.loc[scopus_pubmed_nodoi['DOI'].isin(consistent_DOIin_scopus), 'PubMedID']
print(f"The PubMedIDs in Scopus whose DOIs do not mismatch (violate) with the Unionlist are below:\n{list(consistent_PMIDinScopusDOI)} ")

scopus_pubmed_nodoi.loc[scopus_pubmed_nodoi['DOI'].isin(consistent_DOIin_scopus)]

In [None]:
# Confirm DOIs for consistent_DOIin_scopus are not in Unionlist (an empty table is expected)
unionlist[unionlist['DOI'].isin(consistent_DOIin_scopus)]

In [None]:
"""
Updating the DOI of the PubMed_NoDOI records whose Scopus DOIs are consistent with the Unionlist; PubMedIDs:
['2480486', '7595293', '1551544', '1669382', '19395854', '11978302', '32338687', '28925478'] 
"""

# Write the Scopus DOIs of the consistent PubMedIDs into pubmed_nodoi2 (in place)
update_DOI(consistent_PMIDinScopusDOI , scopus_pubmed_nodoi,pubmed_nodoi2)

In [None]:
# Confirm the DOI update: show the PubMed_NoDOI records whose DOI was taken from Scopus
pubmed_nodoi2[pubmed_nodoi2['PubMedID'].isin(consistent_PMIDinScopusDOI)]
# To also save these records to file, run instead:
# pubmed_nodoi2[pubmed_nodoi2['PubMedID'].isin(consistent_PMIDinScopusDOI)].to_csv(data_dir+'pubmed/consistent_pubmedid_foundedwithdoi_inScopus.csv')

## Stage4:  Update the DOI of Items that are left with no DOI

In [None]:
# PubMed_NoDOI records whose DOI is no longer the empty string
pubmed_nodoi2[(pubmed_nodoi2['DOI']!='')]

In [None]:
"""
Update records with no DOI with DOI format {noDOI_#}
"""

# Records that received a DOI in the previous stages / records whose DOI is still empty
total_updated= pubmed_nodoi2[(pubmed_nodoi2['DOI']!='')]
total_left_updated= pubmed_nodoi2[~(pubmed_nodoi2['DOI']!='')]

print(f'The total DOIs updated is {len(total_updated)}')
print(f'The total DOIs left is {len(total_left_updated)}')

# Give every remaining record a placeholder DOI: noDOI_0, noDOI_1, ...
for (idx, row) in enumerate(total_left_updated.iterrows()):
    indexed_loc=row[0]  # iterrows() yields (index label, row); the label addresses the record
    label = f'noDOI_{idx}'
    #print(label)
    pubmed_nodoi2.loc[indexed_loc, 'DOI']=label
    
#     print(f'noDOI_{i}') 

In [None]:
# Re-count after the placeholder assignment: no record should be left with an empty DOI
total_updated_after_assignment= pubmed_nodoi2[(pubmed_nodoi2['DOI']!='')]
total_left_updated_after_assignment = pubmed_nodoi2[~(pubmed_nodoi2['DOI']!='')]
print(f'The total DOIs updated is {len(total_updated_after_assignment)}')
print(f'The total DOIs left is {len(total_left_updated_after_assignment)}')

In [None]:
# Save the All records update for PubMed_NoDOI
# pubmed_nodoi2.to_csv(data_dir+'pubmed/pubmed_nodoi_resolved.csv')

### Update Unionlist: Combine the Unionlist and the Updated PubMed NoDOI list

In [None]:
# Columns of the Unionlist - the layout pubmed2 has to follow
selected_columns = unionlist.columns

# pubmed2 is a deep copy of pubmed_nodoi2 whose columns are renamed to match the Unionlist
pubmed2 = pubmed_nodoi2.copy(deep=True).rename(
    columns={
        'doi': 'DOI',
        'au_names': 'Author',
        'title': 'Title',
        'journal_title': 'Journal',
        'year': 'Year',
        'pmid': 'PubMedID',
        'retraction_notice_pmid': 'RetractionPubMedID',
        'rn_doi': 'RetractionDOI',
        'retracted_year': 'RetractionDate',
    }
)

# Keep the Unionlist columns only, in the Unionlist order
pubmed2 = pubmed2[selected_columns]

In [None]:
# Stack the Unionlist and the resolved PubMed NoDOI records, without the helper column
unionlist_updated = pd.concat([unionlist, pubmed2])
unionlist_updated = unionlist_updated.drop(columns=['clean_title'])

# Update all the DOI to lowercase
unionlist_updated['DOI'] = unionlist_updated['DOI'].str.lower()

# Save updated unionlist to File
# unionlist_updated.to_csv(data_dir+f"unionlist/unionlist_with_nodoi_{getdate['unionlist']}.csv")
unionlist_updated

In [None]:
# unionlist_updated= pd.read_csv(data_dir+f"unionlist/unionlist_with_nodoi_{getdate['unionlist']}.csv").drop(['Unnamed: 0'],axis=1)

In [None]:
# Confirming no duplicate: keep=False marks every occurrence of a repeated DOI,
# so an empty table means that all DOIs are unique
unionlist_updated[unionlist_updated['DOI'].duplicated(keep=False)]

In [None]:
# Records whose DOI starts with '10' (i.e. a real DOI, not a noDOI_# placeholder), counted
# as the non-null values of the first column; .iloc[0] selects that count by position.
total_uniqueDOI = unionlist_updated[unionlist_updated['DOI'].str.startswith('10')].count().iloc[0]
# Number of distinct PubMedIDs
total_uniquePMID = unionlist_updated['PubMedID'].nunique()

print(f'The total items\' unique DOIs is {total_uniqueDOI} and unique PMIDs is {total_uniquePMID}' )

In [None]:
def count_DOI_n_PubMedID(df, source)-> tuple:
    """
    Count the DOIs and PubMedIDs that a given source contributes to a DataFrame.

    :param df: DataFrame to work on; needs the columns 'DOI' and 'source',
               the column 'PubMedID' is optional
    :param source: source to lookup (as a pattern) in the 'source' column

    :return: tuple (source, Total, nDOI, nNoDOI, nDuplicatedDOI, nPMID, nNoPMID, nDuplicatedPMID)
        Total           - records of the source with a DOI (for 'PubMed': records with a PubMedID)
        nDOI            - distinct DOIs of the source
        nNoDOI          - records of the source whose DOI does not start with '10'
        nDuplicatedDOI  - records dropped when the DOIs are de-duplicated
        nPMID           - distinct PubMedIDs (empty and missing PubMedIDs are not counted)
        nNoPMID         - records with a DOI but without PubMedID (always 0 for 'PubMed')
        nDuplicatedPMID - records dropped when the PubMedIDs are de-duplicated
    """
    # Missing values count as "not from this source" / "no DOI"
    from_source = df['source'].str.contains(source, na=False)
    has_doi = df['DOI'].str.startswith('10', na=False)

    df_DOI = df[has_doi & from_source]

    Total = len(df_DOI)
    nDOI = len(df_DOI.drop_duplicates(subset=['DOI'], keep='first'))
    nDuplicatedDOI = int(df_DOI.duplicated(subset=['DOI'], keep='last').sum())
    nNoDOI = len(df[~has_doi & from_source])

    nPMID, nDuplicatedPMID, nNoPMID = 0, 0, 0

    if 'PubMedID' in df.columns:
        # PubMed is counted over all of its records, the other sources only over
        # their records with a DOI
        scope = df[from_source] if source == 'PubMed' else df_DOI

        # A record has a PubMedID only when the value is neither missing NOR empty
        has_pmid = scope['PubMedID'].notna() & (scope['PubMedID'] != "")
        df_PMID = scope[has_pmid]

        nPMID = len(df_PMID.drop_duplicates(subset=['PubMedID'], keep='first'))
        nDuplicatedPMID = int(df_PMID.duplicated(subset=['PubMedID'], keep='last').sum())

        if source == 'PubMed':
            Total = len(df_PMID)
        else:
            nNoPMID = len(scope) - len(df_PMID)

    return source, Total, nDOI, nNoDOI, nDuplicatedDOI, nPMID, nNoPMID, nDuplicatedPMID

In [None]:
# Recall DOI updated in Stage 1 to Stage 3

# Stage 2 (WoK) is hard-coded to 0 here; the commented-out expression is the computed count
print("Total DOI updated in stage 2:", 0) #len(consistent_PMIDinWokDOI) )

print("Total DOI updated in stage 3:", len(consistent_PMIDinScopusDOI) )

# Stage 1 is shown through the overview2 table displayed below
print("Total DOI updated in stage 1:")
overview2

In [None]:
"""
Calculating number of DOIs/PubMedID in Unionlist per source
"""

# Each call returns (source, Total, nDOI, nNoDOI, nDuplicatedDOI, nPMID, nNoPMID, nDuplicatedPMID)
nBCI= count_DOI_n_PubMedID(unionlist_updated,'BCI')
nBIOADS= count_DOI_n_PubMedID(unionlist_updated,'BIOABS')  # note: the variable is spelled nBIOADS
nCCC= count_DOI_n_PubMedID(unionlist_updated,'CCC')
nCompendex= count_DOI_n_PubMedID(unionlist_updated,'Compendex')
nCrossref= count_DOI_n_PubMedID(unionlist_updated,'Crossref')
nGeobase= count_DOI_n_PubMedID(unionlist_updated,'GEOBASE')
nMedline= count_DOI_n_PubMedID(unionlist_updated,'Medline')
nPubMed=count_DOI_n_PubMedID(unionlist_updated,'PubMed')
nRW= count_DOI_n_PubMedID(unionlist_updated,'Retraction Watch')
nScopus= count_DOI_n_PubMedID(unionlist_updated,'Scopus')
nWoS=count_DOI_n_PubMedID(unionlist_updated,'WoS_Core')

In [None]:
# Counts for Scopus, in the order:
#source,Total, nDOI,nNoDOI,nDuplicatedDOI,nPMID,nNoPMID,nDuplicatedPMID
nScopus

In [None]:
# Counts for PubMed, in the order:
#source,Total, nDOI,nNoDOI,nDuplicatedDOI,nPMID,nNoPMID,nDuplicatedPMID
nPubMed

In [None]:
"""
Get previous overview before DOI resolution in Step 1  Notebook
"""
# 'Unnamed: 0' is dropped right after loading - presumably the index column written
# when the overview was saved (TODO confirm)
overview = pd.read_csv(result_dir+'datasources_overview.csv').drop(['Unnamed: 0'],axis=1)

# The number of records with DOI is handled as integer
overview['Records_withDOI']= overview['Records_withDOI'].astype(int)

overview

In [None]:
# Hard-coded list of 11 counts; NOTE(review): presumably the initial number of DOIs per
# source - it is not referenced in the cells that follow, confirm whether it is still needed
initialDOIs = [10267, 10144, 3471, 19266, 30918, 1039, 21737, 22058, 45419, 30878, 31683]
# Column types and non-null counts of the overview table
overview.info()

In [None]:
"""
Aggregate the items from all the sources
"""
ovtable = [] # Store the count of each group from each source and create a table for viewing

dblist = [nBCI, nBIOADS, nCCC, nCompendex, nCrossref, nGeobase, nMedline, nPubMed, nRW, nScopus, nWoS]

# A nested list which stores the records of each group in each source; every entry is
# (source, Total, nDOI, nNoDOI, nDuplicatedDOI, nPMID, nNoPMID, nDuplicatedPMID)
dbtable = list(dblist)

# Converted once, after all sources are collected (the array holds strings, because
# the source names are mixed with the counts)
np_results = np.array(dbtable)

# Create a table showing the count of each group: Total, nDOI and nPMID
overview3 = pd.DataFrame(
    np_results[:, [1, 2, 5]],
    columns=["Total_records_DOI_PubMedID", "Updated_total_records_withDOI_in_Unionlist", 'Records_withPubMedID_in_Unionlist'],
) #'DuplicatePubMedID_in_Unionlist'

# The rows of `overview` are matched by position (index), i.e. they are expected to be
# in the same source order as dblist
overview3["Initial_records_withDOI"] = overview['Records_withDOI']

overview3['Source'] = ['BCI', 'BIOABS', 'CCC', 'Compendex', 'Crossref', 'GEOBASE', 'Medline', 'PubMed', 'Retraction Watch', 'Scopus', 'Web of Science Core']

overview3['Updated_total_records_withDOI_in_Unionlist'] = overview3['Updated_total_records_withDOI_in_Unionlist'].astype(int)

# Number of DOIs gained per source: updated total minus the initial number of records with DOI
overview3['Records_updated_withDOI_in_Unionlist'] = overview3['Updated_total_records_withDOI_in_Unionlist'] - overview['Records_withDOI']

overview3['Records_updated_withDOI_in_Unionlist'] = overview3['Records_updated_withDOI_in_Unionlist'].astype(int)

# Re-order column
overview3 = overview3[['Source', "Total_records_DOI_PubMedID", "Initial_records_withDOI", 'Records_updated_withDOI_in_Unionlist', "Updated_total_records_withDOI_in_Unionlist", 'Records_withPubMedID_in_Unionlist']] #'DuplicatePubMedID_in_Unionlist'


overview3

In [None]:
# Aggregating items in each column: append a 'Total' row holding the sum of every column.
# Note: the row is appended at position len(overview3), so re-running this cell adds
# another 'Total' row.

overview3.loc[len(overview3)] = ['Total',
                                 overview3.Total_records_DOI_PubMedID.astype(int).sum(),
                                 overview3.Initial_records_withDOI.sum(),
                                 overview3.Records_updated_withDOI_in_Unionlist.sum(),
                                 overview3.Updated_total_records_withDOI_in_Unionlist.astype(int).sum(),
                                 overview3.Records_withPubMedID_in_Unionlist.astype(int).sum(),]

overview3


In [None]:
# Column types and non-null counts of the updated overview table
overview3.info()

In [None]:
# Saving updated overview to file
# overview3.to_csv(result_dir+'datasources_updated_overview.csv')

In [None]:
"""
Note  that "Records_updated_withDOI_in_Unionlist" does not reflect the true DOIs updated for each source 
because of update at each source - updated record may overlap in other sources
"""
pass

### Investigate Duplicated PubMedIDs

In [None]:
"""
Viewing items with duplicated PubMedIDs in the Updated Unionlist for manual analysis, could use for 
investigating irregularities in PubMedID indexing.
"""
# Keep only the records that really carry a PubMedID: a missing value (NaN) is not equal
# to '' and would otherwise pass the filter and be reported as a duplicate of every
# other missing value.
is_not_null = unionlist_updated[unionlist_updated['PubMedID'].notna() & (unionlist_updated['PubMedID'] != '')]
# keep=False marks every occurrence of a repeated PubMedID
filtered_data = is_not_null[is_not_null.duplicated(subset=['PubMedID'], keep=False)].reset_index(drop=True)

# filtered_data.to_csv(data_dir+f"unionlist/unionlist_duplicated_pmids_{getdate['unionlist']}.csv")

filtered_data




In [None]:
# Records of the original Unionlist that share a DOI (keep=False marks every occurrence)
unionlist[unionlist['DOI'].duplicated(keep=False)]

In [None]:
# PubMed NoDOI records that share a PubMedID (keep=False marks every occurrence)
pubmed_nodoi[pubmed_nodoi['PubMedID'].duplicated(keep=False)]

In [None]:
# PubMed NoDOI records that ended up with a real DOI (starting with '10') instead of a noDOI_# placeholder
pubmed_nodoi2[pubmed_nodoi2['DOI'].str.startswith('10')]