# Step 2: Collect Items that are Not Indexed as Retracted in the Sources

Publications on the Known Retraction List were found in 1-4 sources from step 1 (Crossref, Retraction Watch, Scopus, WOS). For publications found in fewer than 4 sources, we need to distinguish two situations: (1) the publication is not covered by a given source (e.g., no record of the publication exists in that source); and (2) the publication is covered but not listed as retracted in a given source (e.g., record exists but is not retracted in that source). To do this we use the Crossref, Scopus, and WOS APIs (no need for Retraction Watch because everything they cover is retracted) to find out whether a record exists in a source. We export the list of items covered but not indexed as retracted as the output of the code notebook. 

We split the work for each source into four parts:
- Part 1: find publications on the Known Retraction List that are not indexed as retracted in each source (Crossref, Scopus, WOS)
- Part 2: test each API on a small sample of DOIs from each source
- Part 3: run each API to check, based on DOI, whether a record exists in each source
- Part 4: return and export the records that exist in each source

In [None]:
!pip install pandas
!pip install elsapy
!pip install crossrefapi

In [None]:
import pandas as pd
import requests
import urllib
import json

In [None]:
from timeit import default_timer as timer
from datetime import date, timedelta

from crossref.restful import Works, Etiquette
# Etiquette (contact details) for the Crossref client. Replace the four
# placeholder strings with your own project name, project version, project URL
# and contact e-mail before running.
my_etiquette = Etiquette('My Project Name', 'My Project version', 'My Project URL', 'My contact email')
# Crossref `Works` client constructed with the etiquette defined above.
works = Works(etiquette=my_etiquette)

In [None]:
# Run date as a YYYY-MM-DD string; later cells prepend it to output file names.
today = date.today().isoformat()

In [None]:
# Set path --- Link to the box folder with your name
# Download Box Desktop to copy the pathname
# Replace each {enterdirectorytofolder} placeholder below with a quoted string.
# Later cells append file names to these paths by plain string concatenation,
# so each path has to end with a path separator.

# Input
# Folder name: step1-outputfile
box_path_1 = {enterdirectorytofolder}

# Output
# Folder name: step2-outputfile
box_path_2 = {enterdirectorytofolder}

In [None]:
# Input File: One CSV file of the known retraction list
# The 'Unnamed: 0' column is dropped (presumably the index saved by step 1 --
# confirm), rows are ordered newest 'Year' first, and the index is renumbered.
knownretraction = (
    pd.read_csv(box_path_1 + '2023-04-05-knownretractionlist.csv')
    .drop(['Unnamed: 0'], axis=1)
    .sort_values('Year', ascending=False)
    .reset_index(drop=True)
)

# `shape` is a (rows, columns) tuple attribute, not a method: calling it
# raises TypeError.
print(knownretraction.shape)
# `info()` prints its own report and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
knownretraction.info()
knownretraction.head()

In [None]:
# Part 1
# For each source (Crossref, Scopus, WOS), select the rows of the known
# retraction list whose 'source' value does not mention that source, and keep
# their DOIs for the API look-ups in the following cells.
# Rows with a missing 'source' value are treated as "not mentioned" (na=False).

sp = knownretraction.loc[lambda df: ~df['source'].str.contains('Scopus', na=False)]
wos = knownretraction.loc[lambda df: ~df['source'].str.contains('Web of Science', na=False)]
cr = knownretraction.loc[lambda df: ~df['source'].str.contains('Crossref', na=False)]

doilist_sp = sp.loc[:, 'DOI']
doilist_wos = wos.loc[:, 'DOI']
doilist_cr = cr.loc[:, 'DOI']

### Crossref

In [None]:
# Part 2
# Smoke-test the Crossref API on the first 10 DOIs: print each DOI for which
# Crossref returns a record carrying that same DOI, then print the elapsed time.

# Build the client with the etiquette (contact details) configured earlier in
# the notebook instead of an anonymous `Works()`.
new = Works(etiquette=my_etiquette)

testing_start_cr = timer()

for i in doilist_cr[0:10]:
    try:
        for j in new.filter(doi=i).select('DOI'):
            # DOI names are case-insensitive, so compare them lower-cased.
            if i.lower() == j['DOI'].lower():
                print(i)
                break
    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt still stops the loop; the reason for the failure is shown.
        print('error', i, exc)

testing_end_cr = timer()
print(testing_end_cr - testing_start_cr)

In [None]:
# Part 3 -- do not run
# run APIs to find out whether a record exists in a source
#
# Collects in `included_cr` every DOI from `doilist_cr` for which Crossref
# returns a record with that DOI. DOIs whose look-up fails are printed together
# with the error so that they can be re-checked. Uses the `new` Crossref client
# created in the Part 2 cell.

included_cr = []

start_cr = timer()

for i in doilist_cr:
    try:
        for j in new.filter(doi=i).select('DOI'):
            # DOI names are case-insensitive, so compare them lower-cased.
            if i.lower() == j['DOI'].lower():
                included_cr.append(i)
                # One match is enough; stopping here keeps each DOI in the
                # list at most once per look-up.
                break
    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt can still stop this long-running loop.
        print(i, exc)

end_cr = timer()
print(end_cr - start_cr)

### Scopus

In [None]:
# Part 2
# Smoke-test the Scopus look-up (Elsevier abstract-by-DOI endpoint) on the
# first 10 DOIs: print the DOI reported by Scopus when a record is returned,
# otherwise print 'pass' with the DOI and the HTTP status code.

testing_start_sp = timer()

for i in doilist_sp[0:10]:
    try:
        resp = requests.get("https://api.elsevier.com/content/abstract/doi/" + i + "?field=doi",
                            headers={'Accept':'application/json',
                                     'X-ELS-APIKey': {enterapikey}})
        results = resp.json()

        record = results.get('abstracts-retrieval-response')
        if record is None:
            # No record in the payload; the status code shows whether this is
            # a plain "not found" or an authentication/quota problem.
            print('pass', i, resp.status_code)
        else:
            print(record.get('coredata').get('prism:doi'))

    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt still stops the loop; the reason for the failure is shown.
        print('pass', i, exc)

testing_end_sp = timer()
print(testing_end_sp - testing_start_sp)

In [None]:
# Part 3 -- do not run
# run APIs to find out whether a record exists in a source
#
# Collects in `included_sp` every DOI from `doilist_sp` for which the Scopus
# response contains an 'abstracts-retrieval-response' with 'coredata'.
# DOIs without such a record are printed with the HTTP status code; DOIs whose
# request or parsing fails are printed with the error.

included_sp = []

start_sp = timer()

for i in doilist_sp:
    try:
        resp = requests.get("https://api.elsevier.com/content/abstract/doi/" + i + "?field=doi",
                            headers={'Accept':'application/json',
                                     'X-ELS-APIKey': {enterapikey}})
        results = resp.json()

        # Test for the record explicitly instead of relying on an
        # AttributeError from chaining .get() on None.
        record = results.get('abstracts-retrieval-response')
        if record is not None and record.get('coredata') is not None:
            included_sp.append(i)
        else:
            print(i, resp.status_code)

    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt can still stop this long-running loop.
        print(i, exc)

end_sp = timer()
print(end_sp - start_sp)

### Web of Science

In [None]:
# Part 2
# Smoke-test the Web of Science API on the first 10 DOIs: print each DOI for
# which the DOI search reports exactly one record, then print the elapsed time.

headers = {'X-APIKey': {enterapikey}}
baseurl = "https://wos-api.clarivate.com/api/wos"  # this is the base URL for WoS Expanded API

testing_start_wos = timer()

for i in doilist_wos[0:10]:
    # Reset per DOI so the error report below never shows the payload of a
    # previous iteration (or fails because `data` was never assigned).
    data = None

    try:
        search_query = 'DO=("' + i + '")'

        # Pass the query through `params` so that requests URL-encodes it;
        # DOIs may contain characters such as '&' or '#' that would otherwise
        # break a hand-built query string.
        initial_response = requests.get(
            baseurl,
            params={'databaseId': 'WOS',
                    'usrQuery': search_query,
                    'count': 0,
                    'firstRecord': 1},
            headers=headers)

        data = initial_response.json()

        if data['QueryResult']['RecordsFound'] == 1:
            print(i)

    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt still stops the loop. `data` holds the response payload
        # when one was received, else None.
        print(i, exc, data)

testing_end_wos = timer()
print(testing_end_wos - testing_start_wos)

In [None]:
# Part 3 -- do not run
# run APIs to find out whether a record exists in a source
#
# Collects in `included_wos` every DOI from `doilist_wos` for which the Web of
# Science DOI search reports exactly one record. DOIs whose look-up fails are
# printed together with the error. Uses `headers` and `baseurl` from the
# Part 2 cell.

included_wos = []

start_wos = timer()

for i in doilist_wos:
    try:
        # Built inside the try so that a non-string DOI (e.g. a missing value)
        # is reported below instead of aborting the whole run.
        search_query = 'DO=("' + i + '")'

        # Pass the query through `params` so that requests URL-encodes it;
        # DOIs may contain characters such as '&' or '#' that would otherwise
        # break a hand-built query string.
        initial_response = requests.get(
            baseurl,
            params={'databaseId': 'WOS',
                    'usrQuery': search_query,
                    'count': 0,
                    'firstRecord': 1},
            headers=headers)

        data = initial_response.json()

        if data['QueryResult']['RecordsFound'] == 1:
            included_wos.append(i)

    except Exception as exc:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt can still stop this long-running loop.
        print(i, exc)

end_wos = timer()
print(end_wos - start_wos)

In [None]:
# Part 4 -- do not run
# return the retracted items are found in each source

# Output File: Three CSV files (one for each source)
# of the items from the known retraction list
# that are covered by a given source but not indexed as retracted in that source


def _collect_covered_records(source_df, included_dois):
    """Return the rows of *source_df* whose 'DOI' equals one of *included_dois*.

    Rows are gathered DOI by DOI, last DOI in the list first, and the index
    of the result is renumbered. An empty DataFrame is returned when
    *included_dois* is empty.
    """
    # Build all per-DOI selections first and concatenate once, instead of
    # re-copying a growing DataFrame for every DOI.
    matches = [source_df[source_df['DOI'] == doi] for doi in reversed(included_dois)]
    if not matches:
        return pd.DataFrame()
    return pd.concat(matches).reset_index(drop=True)


def _export_if_consistent(records_df, included_dois, output_path):
    """Write *records_df* to *output_path* if it has one row per included DOI.

    On success the number of DOIs is printed; otherwise nothing is written
    and both lengths are printed so the mismatch can be investigated.
    """
    if len(included_dois) == len(records_df):
        # The index is written as well (pandas default).
        records_df.to_csv(output_path)
        print(len(included_dois))
    else:
        print('error: wrong length', len(included_dois), len(records_df))


# Crossref
included_cr_df = _collect_covered_records(cr, included_cr)
_export_if_consistent(included_cr_df, included_cr,
                      box_path_2 + today + '-notindexedasretracted-crossref.csv')

# Scopus
included_sp_df = _collect_covered_records(sp, included_sp)
_export_if_consistent(included_sp_df, included_sp,
                      box_path_2 + today + '-notindexedasretracted-scopus.csv')

# Web of science
included_wos_df = _collect_covered_records(wos, included_wos)
_export_if_consistent(included_wos_df, included_wos,
                      box_path_2 + today + '-notindexedasretracted-webofscience.csv')