# Step 3: Data Collection of Items Covered in Each Source but Not Indexed as Retracted

This notebook contains code for finding items that are covered in (i.e. can be found in) each database source but are not indexed there as retracted.

Input File: 
   - retracted publications from each source
   - unionlist of retracted publications
       - unionlist/unionlist_with_nodoi_{date}.csv

Output File: 
   - items not covered in each source
   - items not indexed but covered in each source
   - items not indexed nor found in each source (for PubMed items alone)
       - coverednotindexed/pubmed_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/compendex_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/geobase_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/georef_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/inspec_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/scopus_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/webofsciencecore_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/bci_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/bioabs_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/ccc_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/medline_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/zoorec_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/crossref_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/sciencedirect_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/ieee_coverednotindexed_{coverage_date}.csv
       - coverednotindexed/ads_coverednotindexed_{coverage_date}.csv


In [None]:
import requests
from bs4 import BeautifulSoup as bs
import time,datetime
import re

import os
import csv
import numpy as np
import unicodedata
import json

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from urllib.parse import urlencode # Need for ADS databases platform
from urllib.parse import quote_plus as url_encode

### Directory Setup

In [None]:
# Project root: the directory the notebook is run from.
retraction_index_path = os.path.abspath('.')

data_dir = retraction_index_path+'/data/'      # data directory (trailing slash is relied on below)
result_dir = retraction_index_path+'/result/'  # result directory

# Create the 'coverednotindexed' folder for the per-source coverage checks.
# makedirs(exist_ok=True) also creates a missing 'data' parent and does not fail
# when the folder already exists.
os.makedirs(data_dir+'coverednotindexed', exist_ok=True)

### Configuration File

In [None]:
# Load configuration (contact e-mail and API credentials) from config.json in the project root.
# The context manager closes the file even when json.load raises on malformed JSON.
with open(retraction_index_path+"/config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)

# Initializing variables from the configuration file
my_email = config['my_email']
elsevier_api_key = config['Elsevier_APIKEY']
elsevier_insttoken = config['insttoken']
ieee_xplore_api_key = config['IEEEXplore_APIKEY']
wos_api_key = config['WoS_APIKEY']
ads_api_key= config['ADS']

In [None]:
# Global initialization
# NOTE(review): `global` statements at module (cell) level have no effect; names assigned
# at cell level are already module globals, so this cell could be removed.
global my_email
global elsevier_api_key
global elsevier_insttoken
global ieee_xplore_api_key
global wos_api_key
global ads_api_key

In [None]:
# Date (YYYY-MM-DD) on which coverage of the unionlist DOIs was checked in each source.
# An empty string marks an entry for which no coverage-check date is recorded.
date_coverage = {
    'ads': '2024-08-15',
    'crossref': '2024-08-06',
    # Retraction Watch only covers retracted publications, all of which are in the unionlist.
    # Its coverage does not need to be checked.
    'retractionwatch': '',
    'pubmed': '2024-07-26',
    'geobase': '2024-07-29',
    'compendex': '2024-07-29',
    'georef': '2024-07-30',
    'inspec': '2024-08-02',
    'scopus': '2024-08-02',
    'ieee': '2024-08-07',

    'bci': '2024-08-02',
    'bioabs': '2024-08-13',
    'ccc': '2024-08-05',
    'medline': '2024-08-05',
    'zoorec': '2024-08-05',
    # Unionlist date is hardcoded in the cell that loads the unionlist.
    'unionlist': '',
    'webofsciencecore': '2024-07-30',
    'sciencedirect': '2024-08-05',
}

In [None]:
"""
Load Unionlist
"""

# The unionlist snapshot date (2024-07-09) is hardcoded in the file name.
# 'Unnamed: 0' is the index column written by an earlier to_csv call.
unionlist = pd.read_csv(data_dir+"unionlist/unionlist_with_nodoi_2024-07-09.csv").drop('Unnamed: 0',axis=1)

# Normalise PubMedID to a stripped string; missing IDs become '' (not '0' or 'nan').
unionlist['PubMedID']= unionlist['PubMedID'].fillna(0).astype(int)\
                .replace(0,'').astype(str).str.strip()

# Strip surrounding whitespace from DOIs. The result must be assigned back:
# Series.str.strip() returns a new Series and does not modify the column in place.
unionlist['DOI'] = unionlist['DOI'].str.strip()
unionlist

### Filtering Unindexed Retracted Publications List for Each Source

For each source except PubMed, we use the unionlist to select the DOIs of items that are not indexed as retracted in that source. For PubMed, we use the PMID because some items have a PMID but no DOI.

Output files:
- source_doinotindexed e.g. crossref_doinotindexed.csv


In [None]:
# Source labels, as written in the unionlist 'source' column, for the sources that
# index retracted publications.
indexed_sources = [
    'BCI',
    'BIOABS',
    'CCC',
    'Compendex',
    'Crossref',
    'GEOBASE',
    'Medline',
    'PubMed',
    'Retraction Watch',
    'Scopus',
    'WoS_Core',
]

In [None]:
"""
Using DOI as identifier (except for PubMed, where PubMedID is used), extract from the unionlist the items
that are NOT indexed as retracted publications in each source:
'BCI', 'BIOABS', 'CCC', 'Compendex', 'Crossref', 'GEOBASE', 'Medline', 'PubMed', 'Retraction Watch',
'Scopus', 'WoS_Core'

Result: one `<source>DOI_notindexed` DataFrame per source (nothing is written to disk in this cell).
"""

def _not_indexed_in(source_label: str) -> pd.DataFrame:
    """
    Select the unionlist rows whose 'source' column does not mention the given source.

    :param source_label: source name as written in the unionlist 'source' column
                         (matched as a case-sensitive regular expression)
    :return: independent copy of the selected rows; rows with a missing 'source' are kept
    """
    listed_in_source = unionlist['source'].str.contains(source_label, regex=True, na=False)
    # .copy() so that later column assignments do not operate on a slice of the unionlist
    return unionlist[~listed_in_source].copy()


bciDOI_notindexed = _not_indexed_in(r'BCI')
bioabsDOI_notindexed = _not_indexed_in(r'BIOABS')
cccDOI_notindexed = _not_indexed_in(r'CCC')
compendexDOI_notindexed = _not_indexed_in(r'Compendex')
crossrefDOI_notindexed = _not_indexed_in(r'Crossref')
geobaseDOI_notindexed = _not_indexed_in(r'GEOBASE')
medlineDOI_notindexed = _not_indexed_in(r'Medline')
retractionwatchDOI_notindexed = _not_indexed_in(r'Retraction Watch')
woscoreDOI_notindexed = _not_indexed_in(r'WoS_Core')
scopusDOI_notindexed = _not_indexed_in(r'Scopus')

# Using PubMedID for PubMed: keep only the rows that actually carry a PubMedID
pubmedDOI_notindexed_temp = _not_indexed_in(r'PubMed')
pubmedDOI_notindexed_temp['PubMedID'] = pubmedDOI_notindexed_temp['PubMedID'].str.strip()
pubmedDOI_notindexed = pubmedDOI_notindexed_temp[pubmedDOI_notindexed_temp['PubMedID']!='']


In [None]:
# Display the unionlist rows that list Scopus among their sources (inspection only).
unionlist[unionlist['source'].str.contains(r'Scopus',regex=True, na=False)]

## Finding items that are covered (i.e. can be found) in the sources

In [None]:
def batch_items(pmids:list, cut:int)-> list[list]:
    """
    Divide a list of items into consecutive batches for processing.

    :param pmids: list of items (PMIDs, DOIs, ...)
    :param cut: maximum number of records to assign to a batch; must be at least 1

    :return: list of batches (each batch is a list); every batch holds `cut` items
             except possibly the last one. An empty input yields an empty list.
    :raises ValueError: if `cut` is smaller than 1 (the loop could never make progress)
    """
    if cut < 1:
        raise ValueError(f"cut must be a positive integer, got {cut!r}")

    # Step through the list in strides of `cut`; slicing clips the final, shorter batch.
    return [pmids[start:start + cut] for start in range(0, len(pmids), cut)]



In [None]:
def get_DOIs(df):
    """
    Return the valid DOIs of a DataFrame, i.e. the 'DOI' values that start with '10'.

    :param df: DataFrame with a string 'DOI' column
    :return: list - DOIs starting with '10', in row order (duplicates are kept)
    """
    # na=False treats a missing DOI as "not valid"; without it the mask contains NaN
    # and boolean indexing raises instead of skipping the row.
    has_valid_doi = df['DOI'].str.startswith('10', na=False)

    return df.loc[has_valid_doi, 'DOI'].tolist()

### PubMed

In [None]:
def check_pubmed_doi(pmids:list):
    """
    Look up PMIDs in PubMed through the NCBI E-utilities ESearch endpoint.

    :param pmids: PMIDs to look up. The value is sent unchanged as the ESearch `term`;
                  the calling cell passes one comma-joined string of PMIDs.
    :return: dict - the decoded JSON response; matching PMIDs are read by the caller
             from ['esearchresult']['idlist']
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.RequestException: on network failure or timeout
    """

    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

    params = {
        "db": "pubmed",
        "term": pmids,
        "retmode": "json",
        "retmax": 10000,  # Maximum number of results per request
        "email": my_email,  # contact address from config.json, identifies the caller to NCBI
    }

    # A timeout prevents a stalled connection from hanging the notebook indefinitely.
    response = requests.get(base_url, params=params, timeout=60)
    # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()

    return response.json()

In [None]:
# pubmedDOI_notindexed
"""
It calls function: check_pubmed_doi to confirm if the item's PMID is found from PubMed. It stores the results 
in pmids_covered_not_indexed, which is used to create final pubmed_doi_covered_notindexedasretracted.csv.

Input: pubmedDOI_notindexed['PubMedID'].tolist() # List of PubMedIDs
Output: 'pubmed_doi_covered_notindexedasretracted.csv'
"""

# De-duplicate and sort so that batches are identical from run to run
# (plain set ordering of strings changes between Python processes).
pmids_list = sorted(set(pubmedDOI_notindexed['PubMedID'].tolist()))
print(f"The total unique numbers of PubMedIDs to check coverage in PubMed is {len(pmids_list)}")

# break PMIDs in batches
pmid_batches = batch_items(pmids_list,300)

pmids_covered_not_indexed=[]
for batch in pmid_batches:
    pmid_query = ','.join(batch)  # one comma-joined string of PMIDs per request

    # Checking the PubMedIDs in PubMed
    result= check_pubmed_doi(pmid_query)
    pmids_covered_not_indexed.extend(result['esearchresult']['idlist'])

    # Stay below the E-utilities limit of 3 requests per second for callers without an API key.
    time.sleep(0.34)

In [None]:
"""
Processing the retrieved covered items in PubMed:

Keep the unionlist rows whose PubMedID was found in PubMed but which do not list PubMed as a source.

This is because the PubMedID can be messy. The same PubMedID can appear in several unionlist records,
one of which includes PubMed as a source while another does not, e.g. '25684504', '15370385'.
Run a check if you would like an example:
    unionlist[unionlist['PubMedID']== '15370385']
This implies a PubMedID may appear as both indexed and not indexed in PubMed, depending on the record.

Hence, records that list 'PubMed' as a source are filtered out of the coverage-check result
in pmids_covered_not_indexed.
"""

# na=False keeps rows with a missing 'source' (consistent with the not-indexed filters above)
# and avoids applying `~` to a mask that contains NaN.
pubmed_notin_unionlist= unionlist[unionlist['PubMedID'].isin(pmids_covered_not_indexed) & \
          (~unionlist['source'].str.contains('PubMed', na=False))]

# Saving is left disabled, as in the original cell; uncomment to write the CSV.
# pubmed_notin_unionlist.to_csv(data_dir+'coverednotindexed/pubmed_coverednotindexed_'+date_coverage['pubmed']+'.csv')

pubmed_notin_unionlist


In [None]:
# Example showing how PMID is inconsistent: the same PubMedID appears in more than one unionlist
# row, and PubMed is listed as a source in one row but not in the other.
unionlist[unionlist['PubMedID']== '15370385']

In [None]:
# Show every unionlist row whose PubMedID occurs more than once (keep=False marks all
# occurrences), to investigate the first few.
# NOTE(review): missing PubMedIDs were replaced by '' when the unionlist was loaded, so rows
# without a PubMedID also count as duplicates of each other here.
unionlist[unionlist['PubMedID'].duplicated(keep=False)]

In [None]:
# Show full cell contents (no truncation of long DOIs / source lists) when displaying frames.
# NOTE: this changes the pandas display option for every later cell in the session.
pd.set_option('display.max_colwidth', None)
# Rows sharing PubMedID 8989457, one of the inconsistent cases recorded in the notes below.
unionlist[unionlist['PubMedID']== '8989457']

In [None]:
# Notes only: the string below records DOI/PMID inconsistencies met in earlier runs.
# The trailing `pass` keeps the string from being echoed as the cell output.
"""
Error Check Encountered:

First Pass: checking DOIs for coverage in PubMed on one-by-one basis

PMID 27258211 
- '10.1001/jama.2016.7190' - Scopus, retraction notice
- '10.1001/jama.296.4.396-a' - Retraction Watch, letter from authors re: retracted article 
'Inaccurate Description of Collaborating Hospitals in a Study of the Effect of Folate and Mecobalamin on Hip Fractures After Stroke'
- '10.1001/jama.2016.7190' - shows up when manually searching PubMed but not in unionlist.

PMID 8989457
- 10.1002/(sici)1096-8628(19961230)66:4403::aid-ajmg43.0.co;2-l - Retraction Watch, DOI not found when searched
- 10.1002/(sici)1096-8628(19961230)66:4<403::aid-ajmg4>3.0.co;2-l - CCC, PubMed, Medline, WoS_Core, DOI resolves to
regular article, not flagged as retracted when accessed via DOI only.
"""
pass

 #  Engineering Village

In [None]:
def extract_metadata_ev(results):
    """
    Extract the fields of interest from the records returned by an Engineering Village search.

    :param results: iterable of result entries (the 'PAGE-ENTRY' list of an EV response);
                    each entry is expected to hold an 'EI-DOCUMENT' mapping
    :return: list of records, one per well-formed entry, each of the form
             [doi, document_type, year, complete_year, title, journal_title, database,
              author, affiliation]. Entries that lack the expected structure are skipped.
    """

    store = []

    for result in results:
        try:
            metadata = result['EI-DOCUMENT']  # paper details
            properties = metadata['DOCUMENTPROPERTIES']

            doi = properties.get('DO', "")
            document_type = properties.get('DT', "")
            complete_year = properties.get('SD', "")
            year = properties.get('YR', 0)
            title = properties.get('TI', "")
            journal_title = properties.get('SO', "")

            database = metadata['DOC']['DB'].get('DBNAME', "")
            affiliation = metadata.get('AFS', "")
            # Authors live under 'AUS'; the original read 'AFS' here as well, so the author
            # field silently duplicated the affiliations.
            # NOTE(review): confirm the 'AUS' key against a live EV response.
            author = metadata.get('AUS', "")

        except (KeyError, TypeError, AttributeError):
            # Best effort: skip entries without the expected nested structure.
            continue

        store.append([doi, document_type, year, complete_year, title, journal_title, database,
                      author, affiliation])

    return store

In [None]:
def ev_all_DBs_check_DOI2(DOIs_lists: list, database: str)-> list:
    """
    Search the Engineering Village API by DOI, one request per batch of DOIs.

    A batch whose request fails (network error, non-200 status) or whose response cannot be
    processed is recorded as unresolved and the loop moves on, so one bad batch does not
    discard the results already collected.

    :params DOIs_lists: list of batches, each batch being a list of DOIs to search via the API

    :params database: indicate which of the databases e.g. 'c' indicates Compendex
    See: https://dev.elsevier.com/documentation/EngineeringVillageAPI.wadl

        c - Compendex/EI Backfile
        i - Inspec/Inspec Archive
        n - NTIS
        pc - Paperchem
        cm - Chimica
        cb - CBNB
        el - EnCompassLIT
        ep - EnCompassPAT
        g - GEOBASE
        f - GeoRef
        u - US Patents
        e - EP Patents
        w - WO Patents
        k - Knovel

    :return: [ev_covered_doi, ev_unresolved_doi] where ev_covered_doi holds one list of
             metadata records (see extract_metadata_ev) per batch that returned results, and
             ev_unresolved_doi holds the batches that could not be checked
    """
    ev_covered_doi = []
    ev_unresolved_doi = []

    # Request headers carry the credentials loaded from config.json
    headers = {
        'X-ELS-APIKey': elsevier_api_key,
        'Accept': 'application/json',
        'X-ELS-Insttoken': elsevier_insttoken
        }

    # Base URL for the API
    base_url = 'https://api.elsevier.com/content/ev/results?'

    # DOI query format for coverage
    #     q2 = '(10.1002/2016WR020060 WN DOI) OR (10.1001/archdermatol.2012 WN DOI)'

    for c, batch in enumerate(tqdm(DOIs_lists)):

        # Formatting the DOIs in suitable Engineering Village searchable format
        batch_formatted = ['(' + item.strip() + ' WN DOI)' for item in batch]
        check_now_query = (' OR ').join(batch_formatted) # now formatted DOI query for EV search

        # API request; the timeout keeps a stalled connection from hanging the whole run
        try:
            response = requests.get(
                    base_url,
                    headers=headers,
                    params=urlencode({'query': check_now_query ,
                            'database':database}), # drop 'database' to retrieve from all databases
                    timeout=60)
        except requests.RequestException as err:
            ev_unresolved_doi.append(batch)
            print(f"Request failed with error: {err} in batch: {c}")
            time.sleep(0.10)
            continue

        if response.status_code == 200:
            try:
                # Decoding happens inside the try so a non-JSON body is recorded, not raised
                result_per_page = response.json()

                if result_per_page['PAGE']['RESULTS-PER-PAGE'] is not None:

                    if result_per_page['PAGE']['RESULTS-COUNT'] > 0:
                        results = result_per_page['PAGE']['PAGE-RESULTS']['PAGE-ENTRY']

                        # Extract metadata
                        metadata_result = extract_metadata_ev(results)

                        ev_covered_doi.append(metadata_result)

            except Exception:
                ev_unresolved_doi.append(batch)
                print(f"Error in processing, not API Request failure. Error in batch: {c}")

        else:
            ev_unresolved_doi.append(batch)
            # If the request was not successful, print the error message
            print(f"Request failed with status code: {response.status_code} in batch: {c}")

        time.sleep(0.10)  # brief pause between consecutive requests

    return [ev_covered_doi, ev_unresolved_doi]

In [None]:
def ev_all_DBs_check_DOI(DOIs_list: list, database: str)-> list:
    """
    Search the Engineering Village API for one batch of DOIs with a single request.

    If the request fails (network error, non-200 status) or the response cannot be processed,
    the whole batch is returned as unresolved instead of raising, so a caller looping over
    many batches keeps the results it has already collected.

    :params DOIs_list: list of DOIs to be searched via the API (one request for the whole list)

    :params database: indicate which of the databases e.g. 'c' indicates Compendex
    See: https://dev.elsevier.com/documentation/EngineeringVillageAPI.wadl

        c - Compendex/EI Backfile
        i - Inspec/Inspec Archive
        n - NTIS
        pc - Paperchem
        cm - Chimica
        cb - CBNB
        el - EnCompassLIT
        ep - EnCompassPAT
        g - GEOBASE
        f - GeoRef
        u - US Patents
        e - EP Patents
        w - WO Patents
        k - Knovel

    :return: [ev_covered_doi, ev_unresolved_doi] where ev_covered_doi holds at most one list
             of metadata records (see extract_metadata_ev) and ev_unresolved_doi holds
             DOIs_list itself when the batch could not be checked
    """
    c= 0  # single request; kept so the log messages keep the "in batch: N" format
    ev_covered_doi = []
    ev_unresolved_doi = []

    # Request headers carry the credentials loaded from config.json
    headers = {
        'X-ELS-APIKey': elsevier_api_key,
        'Accept': 'application/json',
        'X-ELS-Insttoken': elsevier_insttoken
        }

    # Base URL for the API
    base_url = 'https://api.elsevier.com/content/ev/results?'

    # DOI query format for coverage
    #     q2 = '(10.1002/2016WR020060 WN DOI) OR (10.1001/archdermatol.2012 WN DOI)'

    # Formatting the DOIs in suitable Engineering Village searchable format
    batch_formatted = ['(' + item.strip() + ' WN DOI)' for item in DOIs_list]
    check_now_query = (' OR ').join(batch_formatted) # now formatted DOI query for EV search

    # API request; the timeout keeps a stalled connection from hanging the whole run
    try:
        response = requests.get(base_url,
                                headers=headers,
                                params=urlencode({'query': check_now_query ,
                                'database':database}),
                                timeout=60)
    except requests.RequestException as err:
        ev_unresolved_doi.append(DOIs_list)
        print(f"Request failed with error: {err} in batch: {c}")
        return [ev_covered_doi, ev_unresolved_doi]

    if response.status_code == 200:
        try:
            # Decoding happens inside the try so a non-JSON body is recorded, not raised
            result_per_page = response.json()

            if result_per_page['PAGE']['RESULTS-PER-PAGE'] is not None:

                if result_per_page['PAGE']['RESULTS-COUNT'] > 0:
                    results = result_per_page['PAGE']['PAGE-RESULTS']['PAGE-ENTRY']

                    # Extract metadata
                    metadata_result = extract_metadata_ev(results)

                    ev_covered_doi.append(metadata_result)

        except Exception:
            ev_unresolved_doi.append(DOIs_list)
            print(f"Error in processing, not API Request failure. Error in batch: {c}")

    else:
        ev_unresolved_doi.append(DOIs_list)
        # If the request was not successful, print the error message
        print(f"Request failed with status code: {response.status_code} in batch: {c}")

    return [ev_covered_doi, ev_unresolved_doi]

In [None]:
r"""
Filtering Parsable DOIs.

For Engineering Village DOI parsing, DOIs that contain any of these '()>;[\]' characters ran into errors.
Such as:
'10.1002/(SICI)1097-0215(19980330)76:1154::AID-IJC243.0.CO;2-B'
'10.1002/1521-396X(200207)192:1<212::AID-PSSA212>3.0.CO;2-B'
'10.1016/0003-4975(93)90269-n'

Hence we separate the DOIs with the above characters into 'nonparsable' and those that work fine into 'parsable'.

We tried escaping the problem characters listed above, but it did not work.
For example, we added escapes, e.g. '10.1016/S2589-7004\(19\)30009-6', as suggested in:
https://solr.apache.org/guide/8_11/the-standard-query-parser.html#escaping-special-characters
but the query still failed.
"""

def get_parsable_DOI(doi_list: list):
    r"""
    Split DOIs into those that can be sent to the source API and those that cannot.

    A DOI is treated as non-parsable when it contains any of the characters ( ) > ; [ ]

    :param doi_list: list of DOI for a given source
    :return: [parsable_dois, notparsable_dois], each in the original order
    """
    # NOTE(review): '<' is not in the set; the example DOI containing '<' also contains '>'.
    problem_chars = re.compile(r'[()>;\[\]]')

    parsable_dois = []
    notparsable_dois = []

    for doi in doi_list:
        if problem_chars.search(doi):
            notparsable_dois.append(doi)
        else:
            parsable_dois.append(doi)

    return [parsable_dois, notparsable_dois]


In [None]:
"""
Checking # of DOI that are parsable and nonparsable via  Engineering Village API
"""
# All valid DOIs of the unionlist, split by whether they contain characters the EV query rejects
ev_dois=get_DOIs(unionlist) 
parsable_ev_dois,nonparsable_ev_dois= get_parsable_DOI(ev_dois)

print(f'The total DOIs that are parsable via Engineering Village API is {len(parsable_ev_dois)}')
print(f'The total DOIs that are nonparsable via Engineering Village API is {len(nonparsable_ev_dois)}')


In [None]:
"""
Example: Failed test of nonparsable DOI
per Engineering Village API documentation: https://dev.elsevier.com/documentation/EngineeringVillageAPI.wadl
400 - Invalid Request - This is an error that occurs when invalid information is submitted. 
"""
#return [ev_covered_doi, ev_unresolved_doi]
# This DOI contains '(' and ')', so the request is expected to fail and the DOI list
# to come back in the second (unresolved) element of the returned pair.
ev_all_DBs_check_DOI(['10.1016/s0928-8244(02)00344-9'],'c')

In [None]:
"""
Example: Testing API for coverage check in Engineering Village, parseable DOIs

Uncomment any of the last two lines to test coverage of the API 
"""
# Four DOIs without problem characters, used as a smoke test of the API
ev_test_dois= ['10.1002/adma.202302631','10.7567/JJAP.55.05FH03','10.7863/ultra.32.11.2047','10.1001/jama.2014.7247']

# The batched variant expects a list of batches, hence the extra list around ev_test_dois
ev_all_DBs_check_DOI2([ev_test_dois],'c') # For Compendex

# ev_all_DBs_check_DOI2([ev_test_dois],'g')  # For GEOBASE

 ## Compendex

In [None]:
"""
Checking availability of DOIs in Compendex 

Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Engineering Village is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
ev_cut= 50
# Valid DOIs of the unionlist items that do not list Compendex as a source
check_doi_in_compendex= get_DOIs(compendexDOI_notindexed) 

# Extract parsable DOIs 
parsable_compendex_doi,nonparsable_compendex_doi= get_parsable_DOI(check_doi_in_compendex)

#Break DOIs into sets
check_doi_in_compendex_batches = batch_items(parsable_compendex_doi, ev_cut)

print(f'The total DOIs that are nonparsable is {len(nonparsable_compendex_doi)}')
print(f'The total items to search in Compendex is {len(parsable_compendex_doi)}, which are divided into {len(check_doi_in_compendex_batches)} batches')
print(f'The items list is divided into lists in which each list contains {ev_cut} records maximum')

In [None]:
# Compare parsable_compendex_doi and nonparsable_compendex_doi:
# first six parsable DOIs here, first six nonparsable DOIs in the next cell

print(parsable_compendex_doi[:6])

In [None]:
# First six DOIs that contain characters the Engineering Village query cannot handle
print(nonparsable_compendex_doi[:6])

In [None]:
"""
# Checking DOIs coverage in Compendex
"""

# Accumulators: metadata of DOIs found in Compendex, and batches that could not be checked
compendex_covered_doi, compendex_unresolved_doi= [],[]

# One API request per batch; slice the batch list (e.g. [:100]) to run only part of it
for doi_list in tqdm(check_doi_in_compendex_batches[:]):
    compendex_result= ev_all_DBs_check_DOI(doi_list,'c')
    compendex_covered_doi.extend(compendex_result[0])
    compendex_unresolved_doi.extend(compendex_result[1])
    time.sleep(0.10)  # brief pause between consecutive requests


In [None]:
# Inspect the raw metadata collected for DOIs found in Compendex.
# NOTE(review): this echoes the whole list; consider slicing for large runs.
compendex_covered_doi

In [None]:
# Batches whose request failed or whose response could not be processed; re-check these.
compendex_unresolved_doi

In [None]:
"""
Processing the retrieved covered items in Compendex:

Keep the unionlist items that were found in Compendex but do not list Compendex as a source.
"""

# Flatten the per-request result lists into one list of metadata records
compendex_tempo = [record for batch in compendex_covered_doi for record in batch]
count = len(compendex_tempo)
compendex_df_tempo = pd.DataFrame(compendex_tempo)

# Column 0 of each record is the DOI. An empty result has no columns at all, so guard it
# instead of letting .iloc[:, 0] raise IndexError.
if compendex_df_tempo.empty:
    compendex_coveredInDOI_tempo = []
else:
    compendex_coveredInDOI_tempo = [x.lower() for x in compendex_df_tempo.iloc[:,0].tolist()]

# API can be messy: make sure the covered DOIs are in the unionlist and are not already indexed
# as retracted in the source. na=False keeps rows with a missing 'source'.
# NOTE(review): the match assumes unionlist['DOI'] is already lower-case; confirm.
compendex_notin_unionlist= unionlist[unionlist['DOI'].isin(compendex_coveredInDOI_tempo)  & \
          (~unionlist['source'].str.contains('Compendex', na=False))]

# Saving is left disabled, as in the original cell; uncomment to write the CSV.
# compendex_notin_unionlist.to_csv(data_dir+'coverednotindexed/compendex_coverednotindexed_'+date_coverage['compendex']+'.csv')

compendex_notin_unionlist


 ## GEOBASE

In [None]:
# Checking availability of DOIs in GEOBASE 

"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Engineering Village is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""

ev_cut= 50
# Valid DOIs of the unionlist items that do not list GEOBASE as a source
check_doi_in_geobase= get_DOIs(geobaseDOI_notindexed)
# Set aside DOIs containing characters the Engineering Village query cannot handle
parsable_geobase_doi,nonparsable_geobase_doi= get_parsable_DOI(check_doi_in_geobase)

check_doi_in_geobase_batches = batch_items(parsable_geobase_doi, ev_cut)

print(f'The total DOIs that are nonparsable is {len(nonparsable_geobase_doi)}')
print(f'The total items to search in geobase is {len(parsable_geobase_doi)}, which are divided into {len(check_doi_in_geobase_batches)} batches')
print(f'The items list is divided into lists in which each list contains {ev_cut} records maximum')

In [None]:
"""
# Checking DOIs coverage in GEOBASE
"""
# Accumulators: metadata of DOIs found in GEOBASE, and batches that could not be checked
geobase_covered_doi, geobase_unresolved_doi= [],[]

"""
If case you run out of API requests, keep track and check your coverage in batch, and save it. 
Then merge your batch results after you complete the processing.

example of batch is check_doi_in_geobase_batches[:3001] i.e. 0 to 3000 (upper bound is excluded) DOI,
then next will start from [3001:]
"""

# If testing is needed, update batches to list 
# check_doi_in_geobase_batches[1573:1574] for testing

# One API request per batch; 'g' selects the GEOBASE database
for doi_list in tqdm(check_doi_in_geobase_batches[:]):
    geobase_result= ev_all_DBs_check_DOI(doi_list,'g') # Checking the API
    geobase_covered_doi.extend(geobase_result[0])
    geobase_unresolved_doi.extend(geobase_result[1])
    time.sleep(0.10)  # brief pause between consecutive requests

In [None]:
# Batches whose request failed or whose response could not be processed; re-check these.
geobase_unresolved_doi

In [None]:
"""
Processing the retrieved covered items in GEOBASE:

Keep the unionlist items that were found in GEOBASE but do not list GEOBASE as a source.
"""

# Flatten the per-request result lists into one list of metadata records
geobase_tempo = [record for batch in geobase_covered_doi for record in batch]
geo_count = len(geobase_tempo)
geobase_df_tempo = pd.DataFrame(geobase_tempo)

# Column 0 of each record is the DOI. An empty result has no columns at all, so guard it
# instead of letting .iloc[:, 0] raise IndexError.
if geobase_df_tempo.empty:
    geobase_coveredInDOI_tempo = []
else:
    geobase_coveredInDOI_tempo = [x.lower() for x in geobase_df_tempo.iloc[:,0].tolist()]

# API can be messy: make sure the covered DOIs are in the unionlist and are not already indexed
# as retracted in the source. The source label is 'GEOBASE' (upper-case) and str.contains is
# case-sensitive; matching on 'geobase' never excluded anything. na=False keeps rows with a
# missing 'source'.
# NOTE(review): the match assumes unionlist['DOI'] is already lower-case; confirm.
geobase_notin_unionlist= unionlist[unionlist['DOI'].isin(geobase_coveredInDOI_tempo)  & \
          (~unionlist['source'].str.contains('GEOBASE', na=False))]

# Saving is left disabled, as in the original cell; uncomment to write the CSV.
# geobase_notin_unionlist.to_csv(data_dir+'coverednotindexed/geobase_coverednotindexed_'+date_coverage['geobase']+'.csv')

geobase_notin_unionlist


In [None]:
"""
Example of API Messy:
For instance, checking coverage in GEOBASE retrieved the following DOIs below which are out of the scope
of the our Unionlist DOIs (i.e. DOIs not the Unionlist)

{'10.1002/2017jd027595',
 '10.1038/NGEO2118',
 '10.1039/d0se00033g',
 '10.1093/GJI/GGY267',
 '10.1130/G30584.1.',
 '10.1139/CJCE-2019-0536',
 '10.1190/GEO2013-0325.1',
 '10.1190/GEO2017-0028.1',
 '10.1215/1089201x-25-2-481'}
"""
# Check whether one of the listed DOIs (in this exact upper-case spelling) was among the
# DOIs sent to GEOBASE; the membership test is case-sensitive.
'10.1139/CJCE-2019-0536' in parsable_geobase_doi

## GEOREF

In [None]:
"""
Checking availability of DOIs in GEOREF

Check the entire DOI list in the Unionlist since  GeoRef does not index retracted publication

Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Enigineering Village is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
ev_cut= 50
check_doi_in_georef= get_DOIs(unionlist) # Check the entire Unionlist

# Set aside DOIs containing characters the Engineering Village query cannot handle
parsable_georef_doi,nonparsable_georef_doi= get_parsable_DOI(check_doi_in_georef)

check_doi_in_georef_batches = batch_items(parsable_georef_doi, ev_cut)

print(f'The total DOIs that are nonparsable is {len(nonparsable_georef_doi)}')
print(f'The total items to search in georef is {len(parsable_georef_doi)}, which are divided into {len(check_doi_in_georef_batches)} batches')
print(f'The items list is divided into lists in which each list contains {ev_cut} records maximum')

In [None]:
"""
# Checking DOIs coverage in Georef
"""

# Accumulators: metadata of DOIs found in GeoRef, and batches that could not be checked
georef_covered_doi, georef_unresolved_doi= [],[]

# For testing - check_doi_in_georef_batches[40:41]

# One API request per batch; 'f' selects the GeoRef database
for doi_list in tqdm(check_doi_in_georef_batches[:]):
    georef_result= ev_all_DBs_check_DOI(doi_list,'f') # Checking the DOIs in Georef
    georef_covered_doi.extend(georef_result[0])
    georef_unresolved_doi.extend(georef_result[1])
    time.sleep(0.10)  # brief pause between consecutive requests

In [None]:
# This cell runs no code: it keeps a disabled recipe for merging results when the coverage
# check was run in several rounds. As the cell's only expression, the string is echoed as output.
"""
Processing the retrieved covered items in Georef:
"""
# Sometimes, break checking the coverage of DOIs into two or more rounds

# georef_df_tempo1= georef_df_tempo.copy(deep=True)   #check_doi_in_georef_batches[:801]
# georef_df_tempo2= georef_df_tempo.copy(deep=True)  #check_doi_in_georef_batches[801:]

# georef_df_tempo= pd.concat([georef_df_tempo1,georef_df_tempo2])

In [None]:
# Batches whose request failed or whose response could not be processed; re-check these.
georef_unresolved_doi

In [None]:
"""
Processing the retrieved covered items in Georef:

Keep the unionlist items that were found in GeoRef. No source filter is applied because the
whole unionlist was checked (GeoRef does not index retracted publications).
"""

# Flatten the per-request result lists into one list of metadata records
georef_tempo = [record for batch in georef_covered_doi for record in batch]
georef_count = len(georef_tempo)
georef_df_tempo = pd.DataFrame(georef_tempo)

# Column 0 of each record is the DOI. An empty result has no columns at all, so guard it
# instead of letting .iloc[:, 0] raise IndexError.
if georef_df_tempo.empty:
    georef_coveredInDOI_tempo = []
else:
    georef_coveredInDOI_tempo = [x.lower() for x in georef_df_tempo.iloc[:,0].tolist()]

# API can be messy: keep only the covered DOIs that are actually in our unionlist.
# NOTE(review): the match assumes unionlist['DOI'] is already lower-case; confirm.
georef_notin_unionlist= unionlist[unionlist['DOI'].isin(georef_coveredInDOI_tempo)]

# Saving is left disabled, as in the original cell; uncomment to write the CSV.
# georef_notin_unionlist.to_csv(data_dir+'coverednotindexed/georef_coverednotindexed_'+date_coverage['georef']+'.csv')

georef_notin_unionlist

In [None]:
# Note only: records a DOI returned by GeoRef that is not in the unionlist.
# As the cell's only expression, the string is echoed as the cell output.
"""
Example of API Messy in Georef:
For instance, checking coverage in Georef retrieved the following DOIs below which are out of the scope
of the Unionlist DOIs (i.e. DOIs not the Unionlist)

{'10.1038/nature 02699'}
"""

 ## Inspec

In [None]:
"""
# Checking availability of DOIs in Inspec 

Check the entire DOI in the Unionlist since Inspec does not index retracted publication

Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Enigineering Village is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
ev_cut= 50
check_doi_in_inspec= get_DOIs(unionlist) # Check the entire Unionlist

# Set aside DOIs containing characters the Engineering Village query cannot handle
parsable_inspec_doi,nonparsable_inspec_doi= get_parsable_DOI(check_doi_in_inspec)

check_doi_in_inspec_batches = batch_items(parsable_inspec_doi, ev_cut)

print(f'The total DOIs that are nonparsable is {len(nonparsable_inspec_doi)}')
print(f'The total items to search in inspec is {len(parsable_inspec_doi)}, which are divided into {len(check_doi_in_inspec_batches)} batches')
print(f'The items list is divided into lists in which each list contains {ev_cut} records maximum')

In [None]:
"""
# Checking DOIs coverage in Inspec

In case you ran out of API request, continue from where you left after 24 hours and merge your results
"""

# Accumulators: metadata of DOIs found in Inspec, and batches that could not be checked
inspec_covered_doi, inspec_unresolved_doi= [],[]
# One API request per batch; 'i' selects the Inspec database
for doi_list in tqdm(check_doi_in_inspec_batches[:]):
    inspec_result= ev_all_DBs_check_DOI(doi_list,'i')
    inspec_covered_doi.extend(inspec_result[0])
    inspec_unresolved_doi.extend(inspec_result[1])
    time.sleep(0.10)  # brief pause between consecutive requests

In [None]:
"""
Processing the retrieved covered items in Inspec:
"""

# Flatten the per-request result lists into one list of metadata records
inspec_count= 0
inspec_tempo=[]
for batch in inspec_covered_doi: 
    inspec_tempo.extend(batch)
    inspec_count+=len(batch)  # running total of records retrieved
inspec_df_tempo= pd.DataFrame(inspec_tempo)
inspec_df_tempo
# Disabled: snapshots taken when the coverage check was run in several rounds
#inspec_df_tempo1= inspec_df_tempo.copy(deep=True)  #9785
#inspec_df_tempo2= inspec_df_tempo.copy(deep=True) #4027
#inspec_df_tempo3= inspec_df_tempo.copy(deep=True) #575

In [None]:
"""
Processing the retrieved covered items in Inspec:
"""

# inspec_df_tempo = inspec_df_tempo1[0].tolist() + inspec_df_tempo2[0].tolist() + inspec_df_tempo3[0].tolist()

inspec_coveredInDOI_tempo= inspec_df_tempo.iloc[:,0].tolist() 

inspec_coveredInDOI_tempo = [x.lower() for x in inspec_coveredInDOI_tempo]

# API can be messy: Ensure the check covered_notindexedasretracted DOI are outside of our unionlist 
inspec_notin_unionlist= unionlist[unionlist['DOI'].isin(inspec_coveredInDOI_tempo)]

inspec_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/inspec_coverednotindexed_'+date_coverage['inspec']+'.csv')


In [None]:
"""
Example of API Messy in Inspec :
For instance, in prior runs, checking coverage in Inspec retrieved the following 41 DOIs, which are outside
the scope of our Unionlist DOIs (i.e. DOIs not in the Unionlist):

{'10.1007/S10853-006-1486-5', '10.1117/1.JEI.31.6.061802', '10.1039/c8tc03423k', '10.1039/c5ta01191d', 
'10.1007/BF00613233', '10.1039/c7ta02733h', '10.1111/J.1467-6486.2010.00994.X', '10.1007/s12083-021-01138-x', 
'10.1039/c6ta08172j', '10.1039/c6ta11168h', '10.1039/c5ta04288g', '10.1557/JMR.2006.0380', 
'10.1190/GEO2013-0325.1', '10.1039/c6ta07859a', '10.1038/NMAT3256', '10.1039/c3cy00214d', 
'10.1039/c3tb21558j', '10.1039/c3tc00082f', '10.1038/NPHOTON.2010.2', '10.1039/c7ta05459a', 
'10.1039/c7tc03449k', '10.1109/ICETA.2011.6112609', '10.1039/c7ta02116j', '10.1134/S0021364018140138', 
'10.1515/secm-2012-0053', '10.1134/S1063772918130012', '10.1109/ICNDS.2010.5479348', '10.1039/c7py01218g', 
'10.1039/c3ta13906a', '10.1557/JMR.2007.0087', '10.1039/c6tb01306f', '10.1039/d1qi00733e', '10.1039/c2nr30460k',
'10.1039/c2ta00015f', '10.1039/c5ta06387f', '10.1039/c7ta04927g', '10.1039/c3tb20262c', '10.1039/c2jm16106k', 
'10.1039/c4cy01331j', '10.1039/c3tb21363c', '10.4028/www.scientific.net/AMM.217.219.1077'}

"""

set(inspec_coveredInDOI_tempo) - set(inspec_notin_unionlist['DOI'])

#'10.1007/S10853-006-1486-5' in unionlist['DOI'].tolist()

In [None]:
inspec_unresolved_doi

# Scopus

In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 25 items in each batch. 
The limit per page for Scopus is 25 items at a time.

Input:
    cut: maximum number items in a batch
"""
scopus_cut= 25
check_doi_in_scopus= scopusDOI_notindexed['DOI'].tolist() # change scopus to notindexedasretracted_source

# Extract parsable DOIs 
# parsable_scopus_doi,nonparsable_scopus_doi= get_parsable_DOI(check_doi_in_scopus)

check_doi_in_scopus_batches = batch_items(check_doi_in_scopus, scopus_cut) #check_doi_in_scopus


print(f'The total items to search in Scopus is {len(check_doi_in_scopus)}, which are divided into {len(check_doi_in_scopus_batches)} batches')
print(f'The items list is divided into lists in which each list contains {scopus_cut} records maximum')

In [None]:
"""
Checking DOIs coverage in Scopus
"""

# DOIs returned by Scopus, and DOIs of batches that must be retried
scopus_covered_doi = []
scopus_unresolved_doi = []

# Credentials are read from variables defined earlier in the notebook
api_key = elsevier_api_key

url_base = "https://api.elsevier.com/content/search/scopus"
headers = {
        'X-ELS-APIKey': api_key,
        'Accept': 'application/json',
        'X-ELS-Insttoken':elsevier_insttoken}


for batch in tqdm(check_doi_in_scopus_batches[:]):

    # DOIs found for THIS batch only. It is reset on every iteration so that a failed
    # request can neither raise NameError (first batch) nor re-add the DOIs of the
    # previous batch (later batches).
    store = []

    # Putting search format 'DOI(10.1038/s41598-023-31439-5) OR DOI(...)'
    check_now = ' OR '.join(f'DOI({doi})' for doi in batch)
    params = {"query": check_now}

    response = requests.get(url_base, headers=headers, params=params)

    if response.status_code == 200:
        results = response.json()

        totalresult = int(results['search-results'].get('opensearch:totalResults', 0))

        if totalresult > 0:
            try:
                entries = results['search-results']['entry']
            except KeyError:
                # Hits were reported but no entries were returned: nothing to collect
                entries = []

            entry_without_doi = False
            for result in entries:
                try:
                    store.append(result['prism:doi'])
                except KeyError:
                    entry_without_doi = True

            # Flag the batch once (not once per entry) when some entry has no DOI
            if entry_without_doi:
                scopus_unresolved_doi.extend(batch)
    else:
        # Request failed: keep the whole batch so it can be re-run later
        scopus_unresolved_doi.extend(batch)

    scopus_covered_doi.extend(store)

    # Short pause between requests
    time.sleep(0.15)


In [None]:
"""
Processing the retrieved covered items in Scopus:

Filter DOIs that have Scopus in the Unionlist.
"""

scopus_coveredInDOI_tempo= scopus_covered_doi

scopus_coveredInDOI_tempo= [x.strip().lower() for x in scopus_coveredInDOI_tempo]

# API can be messy: Ensure the check covered_notindexedasretracted DOI are outside our unionlist 
scopus_notin_unionlist= unionlist[unionlist['DOI'].isin(scopus_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('Scopus'))]

scopus_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/scopus_coverednotindexed_'+date_coverage['scopus']+'.csv')


In [None]:
# No out scope doi in scopus coverage

In [None]:
set(scopus_coveredInDOI_tempo) - set(scopus_notin_unionlist['DOI'])

# Web of Science Platform

In [None]:
def wos_all_DBs_check_DOI(DOIs_lists: list, database:str):
    """
    Check which DOIs are covered in a database of the Web of Science platform.

    Each batch is sent to the Web of Science Starter API as one
    'DO=(doi) OR DO=(doi) ...' query against the given database.

    :param DOIs_lists: list of lists of DOIs; each inner list is queried in one request
    :param database: database code passed as the API 'db' parameter (e.g. 'WOS', 'BCI')

    :return: list of lists [covered DOIs, unresolved queries]
        - covered DOIs: the DOI of every hit that reports one, as returned by the API
        - unresolved queries: the query string of every batch whose request failed
          (non-200 status) or whose response could not be read
    """
    wos_covered_doi = []
    wos_unresolved_doi = []

    # The API key is read from the notebook-level variable `wos_api_key`
    headers = {
            'X-ApiKey': wos_api_key,
            'charset': 'UTF-8',
            'Encoding': 'UTF-8',
            'content-type':'text/xml'
            }
    url = 'https://api.clarivate.com/apis/wos-starter/v1/documents'

    # enumerate() wraps tqdm (not the reverse) so the progress bar keeps its total
    for batch_number, batch in enumerate(tqdm(DOIs_lists)):
        check_now = ' OR '.join(f'DO=({doi})' for doi in batch)

        params = {
            'db': database,
            'q': check_now,
            'limit': 50
        }

        # Make the API request
        response = requests.get(url, params=params, headers=headers)

        if response.status_code == 200:
            try:
                hits = list(response.json()['hits'])
            except (ValueError, KeyError, TypeError):
                # Body is not JSON or has no usable 'hits': keep the query for a retry
                wos_unresolved_doi.append(check_now)
                hits = []

            for hit in hits:
                identifiers = hit.get('identifiers') if isinstance(hit, dict) else None
                found_doi = identifiers.get('doi', '') if isinstance(identifiers, dict) else ''
                # Skip hits without a DOI: an empty string would later match every
                # Unionlist row whose DOI is empty
                if found_doi:
                    wos_covered_doi.append(found_doi)
        else:
            # Report the failure and keep the query so the batch can be re-run
            print(f"Request failed with status code: {response.status_code} in batch: {batch_number}")
            wos_unresolved_doi.append(check_now)

        # Short pause between requests
        time.sleep(0.15)

    return [wos_covered_doi,wos_unresolved_doi]



### Web of Science Core

In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
wos_cut= 50
check_doi_in_woscore= get_DOIs(woscoreDOI_notindexed) #woscoreDOI_notindexed['DOI'].tolist()# change doilist_wos to notindexedasretracted_source

check_doi_in_woscore_batches = batch_items(check_doi_in_woscore, wos_cut)

print(f'The total items to search in Web of Science Core is {len(check_doi_in_woscore)}, which are divided into {len(check_doi_in_woscore_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Check DOIs Coverage in Web of Science Core
"""
start = time.time()

woscore_results= wos_all_DBs_check_DOI(check_doi_in_woscore_batches, 'WOS')

end = time.time()
end - start

In [None]:
"""
Processing the retrieved covered items in Web of Science Core:

Filter DOIs that have Web of Science Core in the Unionlist.
"""

wos_core_coveredInDOI_tempo= woscore_results[0] 

wos_core_coveredInDOI_tempo= [x.strip().lower() for x in wos_core_coveredInDOI_tempo]



wos_core_notin_unionlist= unionlist[unionlist['DOI'].isin(wos_core_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('WoS_Core'))]

wos_core_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/webofsciencecore_coverednotindexed_'+date_coverage['webofsciencecore']+'.csv')

### BCI - BIOSIS Citation Index

In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
wos_cut= 50
check_doi_in_bci= get_DOIs(bciDOI_notindexed) #bciDOI_notindexed['DOI'].tolist()# change doilist_bci to notindexedasretracted_source

# Remove this DOI causing WoS API Error  - status code: 400
rm_doi = '10.1061/(asce)0733-9399(2010)136:2(174)'
check_doi_in_bci = [x for x in check_doi_in_bci if x != rm_doi]

check_doi_in_bci_batches = batch_items(check_doi_in_bci, wos_cut) #check_doi_in_bci_batches

print(f'The total items to search in BCI is {len(check_doi_in_bci)}, which are divided into {len(check_doi_in_bci_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Troubleshooting errors in DOIs:
Previous run:
batch351: '10.1038/embor.2009.88 |' caused error. From Retraction Watch.
batch393: '10.1061/(ASCE)0733-9399(2010)136:2(174)' caused error. From Retraction Watch and WoS_Core.
Current run:
batch491: '10.1061/(asce)0733-9399(2010)136:2(174)' caused error. From WoS_Core and Retraction Watch.

# Troublshooting
check_doi_in_bci_batches[411]
unionlist[unionlist['DOI']=='10.1061/(ASCE)0733-9399(2010)136:2(174)']
"""
# wos_all_DBs_check_DOI([['10.1038/embor.2009.88 |']], 'BCI') #
wos_all_DBs_check_DOI([['10.1061/(asce)0733-9399(2010)136:2(174)']], 'BCI')

In [None]:
unionlist[unionlist['DOI']=='10.1061/(asce)0733-9399(2010)136:2(174)']

In [None]:
unionlist[unionlist['DOI']=='10.1038/embor.2009.88 |']

In [None]:
"""
Check DOIs Coverage in BCI
"""
start = time.time()

bci_results= wos_all_DBs_check_DOI(check_doi_in_bci_batches, 'BCI')

end = time.time()
end - start

In [None]:
"""
Processing the retrieved covered items in BIOSIS Citation Index

Filter DOIs that have BCI in the Unionlist.
"""

bci_coveredInDOI_tempo= bci_results[0] 

bci_coveredInDOI_tempo= [x.strip().lower() for x in bci_coveredInDOI_tempo]

bci_notin_unionlist= unionlist[unionlist['DOI'].isin(bci_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('BCI'))]

bci_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/bci_coverednotindexed_'+date_coverage['bci']+'.csv')

In [None]:
set(bci_coveredInDOI_tempo) - set(bci_notin_unionlist['DOI'])

### BIOABS - Biological Abstracts

In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
wos_cut= 50
check_doi_in_bioabs= get_DOIs(bioabsDOI_notindexed) # get valid DOIs from bioabsDOI_notindexed #['DOI'].tolist()# change doilist_bioabs to notindexedasretracted_source

# Remove this DOI causing WoS API Error  - status code: 400
rm_doi = '10.1061/(asce)0733-9399(2010)136:2(174)'
check_doi_in_bioabs = [x for x in check_doi_in_bioabs if x != rm_doi]

check_doi_in_bioabs_batches = batch_items(check_doi_in_bioabs, wos_cut) #check_doi_in_bioabs_batches

print(f'The total items to search in BIOABS is {len(check_doi_in_bioabs)}, which are divided into {len(check_doi_in_bioabs_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Check DOIs Coverage in BIOABS
"""
start = time.time()

bioabs_results= wos_all_DBs_check_DOI(check_doi_in_bioabs_batches, 'BIOABS')

end = time.time()
end - start

In [None]:
"""
Processing the retrieved covered items in Biological Abstracts

Filter DOIs that have Biological Abstract in the Unionlist.
"""

bioabs_coveredInDOI_tempo= bioabs_results[0]

bioabs_coveredInDOI_tempo = [x.lower() for x in bioabs_coveredInDOI_tempo]

bioabs_notin_unionlist= unionlist[unionlist['DOI'].isin(bioabs_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('BIOABS'))]

bioabs_notin_unionlist\
#     .to_csv(data_dir+'coverednotindexed/bioabs_coverednotindexed_'+date_coverage['bioabs']+'.csv')

In [None]:
set(bioabs_coveredInDOI_tempo) - set(bioabs_notin_unionlist['DOI'])

In [None]:
"""API Messy in BIOABS
Returned DOI 10.1002/cbic.201190007 when not in the DOIs originally requested.
"""

### MEDLINE - Medical Literature Analysis and Retrieval System Online

In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
wos_cut= 50
check_doi_in_medline=  get_DOIs(medlineDOI_notindexed)

# Remove DOIs that cause error
error_dois= ['10.1038/embor.2009.88 |','10.1061/(asce)0733-9399(2010)136:2(174)']
check_doi_in_medline[:] = [doi for doi in check_doi_in_medline if doi not in error_dois]

check_doi_in_medline_batches = batch_items(check_doi_in_medline, wos_cut) #check_doi_in_woscore_batches

print(f'The total items to search in Medline is {len(check_doi_in_medline)}, which are divided into {len(check_doi_in_medline_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Check DOIs Coverage in Medline
"""
start = time.time()

medline_results= wos_all_DBs_check_DOI(check_doi_in_medline_batches[:], 'MEDLINE')

end = time.time()
end - start  

In [None]:
"""
Processing the retrieved covered items in Medline

Filter DOIs that have Medline in the Unionlist.
"""

medline_coveredInDOI_tempo= medline_results[0]

medline_coveredInDOI_tempo= [x.lower() for x in medline_coveredInDOI_tempo]

medline_notin_unionlist= unionlist[unionlist['DOI'].isin(medline_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('Medline'))]

medline_notin_unionlist \
#       .to_csv(data_dir+'coverednotindexed/medline_coverednotindexed_'+date_coverage['medline']+'.csv')

In [None]:
set(medline_coveredInDOI_tempo) - set(medline_notin_unionlist['DOI'])

### CCC - Current Contents Connect

In [None]:
def wos_ccc_check_DOI(DOIs_lists: list):
    """
    Check which DOIs are covered in Current Contents Connect (1998-present) only.
    CCC is one of the databases on the Web of Science platform.

    Each batch is searched in the 'WOK' database (all of the platform) and the
    result is restricted to CCC records with the 'UT=CCC:*' clause.

    :param DOIs_lists: list of lists of DOIs; each inner list is queried in one request

    :return: list of lists [covered DOIs, unresolved queries]
        - covered DOIs: the DOI of every hit that reports one, as returned by the API
        - unresolved queries: the DOI clause of every batch whose request failed
          (non-200 status) or whose response could not be read
    """
    wos_covered_doi = []
    wos_unresolved_doi = []

    # The API key is read from the notebook-level variable `wos_api_key`
    headers = {
            'X-ApiKey': wos_api_key,
            'charset': 'UTF-8',
            'Encoding': 'UTF-8',
            'content-type':'text/xml'
            }
    url = 'https://api.clarivate.com/apis/wos-starter/v1/documents'

    # enumerate() wraps tqdm (not the reverse) so the progress bar keeps its total
    for batch_number, batch in enumerate(tqdm(DOIs_lists)):
        formatted_dois= ' OR '.join(f'DO=({doi})' for doi in batch)

        # Searching the WOK database and filtering for result with CCC database.
        # The DOI clauses are parenthesised so the CCC filter applies to every DOI;
        # without parentheses AND binds tighter than OR and only the last DOI is filtered.
        query= f"({formatted_dois}) AND UT=CCC:*"

        params = {
            'db': 'WOK',
            'q': query,
            'limit': 50
        }

        # Make the API request
        response = requests.get(url, params=params, headers=headers)

        if response.status_code == 200:
            try:
                hits = list(response.json()['hits'])
            except (ValueError, KeyError, TypeError):
                # Body is not JSON or has no usable 'hits': keep the DOI clause for a retry
                wos_unresolved_doi.append(formatted_dois)
                hits = []

            for hit in hits:
                identifiers = hit.get('identifiers') if isinstance(hit, dict) else None
                found_doi = identifiers.get('doi', '') if isinstance(identifiers, dict) else ''
                # Skip hits without a DOI: an empty string would later match every
                # Unionlist row whose DOI is empty
                if found_doi:
                    wos_covered_doi.append(found_doi)
        else:
            # Report the failure and keep the DOI clause so the batch can be re-run
            print(f"Request failed with status code: {response.status_code} in batch: {batch_number}")
            wos_unresolved_doi.append(formatted_dois)

        # Short pause between requests
        time.sleep(0.20)

    return [wos_covered_doi,wos_unresolved_doi]


In [None]:
"""
Break the list of DOIs to search into batches of a maximum of 50 items in each batch. 
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
wos_cut= 50
check_doi_in_ccc= get_DOIs(cccDOI_notindexed) # get valid DOIs from cccDOI_notindexed

# Remove DOIs that cause error
error_dois= ['10.1038/embor.2009.88 |','10.1061/(asce)0733-9399(2010)136:2(174)']
check_doi_in_ccc[:] = [doi for doi in check_doi_in_ccc if doi not in error_dois]

check_doi_in_ccc_batches = batch_items(check_doi_in_ccc, wos_cut) #check_doi_in_ccc_batches

print(f'The total items to search in CCC is {len(check_doi_in_ccc)}, which are divided into {len(check_doi_in_ccc_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Check DOIs Coverage in CCC
"""
ccc_results= wos_ccc_check_DOI(check_doi_in_ccc_batches[:])

In [None]:
"""Example of prior batching"""
# ccc_results_tempo1= ccc_results[0]  #17874
# ccc_results_tempo2= ccc_results[0] #14526
# ccc_results_tempo3= ccc_results[0] #17509

In [None]:
"""
Processing the retrieved covered items in Current Contents Connect

Filter DOIs that have CCC in the Unionlist.
"""

ccc_coveredInDOI_tempo=  ccc_results[0] # +ccc_results_tempo1[0] +ccc_results_tempo2[0] + ccc_results_tempo3[0]

ccc_coveredInDOI_tempo = [x.lower() for x in ccc_coveredInDOI_tempo] # Put all DOIs covered in lowercase


ccc_notin_unionlist= unionlist[unionlist['DOI'].isin(ccc_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('CCC'))]

ccc_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/ccc_coverednotindexed_'+date_coverage['ccc']+'.csv')

In [None]:
set(ccc_coveredInDOI_tempo) - set(ccc_notin_unionlist['DOI'])

In [None]:
len(set(ccc_coveredInDOI_tempo) - set(ccc_notin_unionlist['DOI']))

### Zoological Record

In [None]:
def wos_zoorec_check_DOI(PMIDs_lists: list):
    """
    Check which items are covered in Zoological Record (1864-present) only, using PubMedIDs.
    ZOOREC is one of the databases on the Web of Science platform.

    Each batch is sent to the Web of Science Starter API as one
    'PMID=("id","id",...)' query against the 'ZOOREC' database.

    :param PMIDs_lists: list of lists of PubMedIDs; each inner list is queried in one request

    :return: list of lists [covered PMIDs, unresolved batches]
        - covered PMIDs: the PMID of every hit that reports one, as returned by the API
        - unresolved batches: every batch (as a list of PMIDs) whose request failed
          (non-200 status) or whose response could not be read
    """
    wos_covered_pmid= []
    wos_unresolved_pmid= []

    # The API key is read from the notebook-level variable `wos_api_key`
    headers = {
            'X-ApiKey': wos_api_key,
            'charset': 'UTF-8',
            'Encoding': 'UTF-8',
            'content-type':'text/xml'
            }
    url = 'https://api.clarivate.com/apis/wos-starter/v1/documents'

    # enumerate() wraps tqdm (not the reverse) so the progress bar keeps its total
    for batch_number, batch in enumerate(tqdm(PMIDs_lists)):
        formatted_pmids= ",".join(f'"{pmid}"' for pmid in batch)
        query= f'PMID=({formatted_pmids})'

        params = {
            'db': 'ZOOREC',
            'q': query,
            'limit': 50
        }

        # Make the API request
        response = requests.get(url, params=params, headers=headers)

        if response.status_code == 200:
            try:
                hits = list(response.json()['hits'])
            except (ValueError, KeyError, TypeError):
                # Body is not JSON or has no usable 'hits': keep the batch for a retry
                wos_unresolved_pmid.append(batch)
                hits = []

            for hit in hits:
                identifiers = hit.get('identifiers') if isinstance(hit, dict) else None
                found_pmid = identifiers.get('pmid', '') if isinstance(identifiers, dict) else ''
                # Skip hits without a PMID: an empty string would later match every
                # Unionlist row whose PubMedID is empty
                if found_pmid:
                    wos_covered_pmid.append(found_pmid)
        else:
            # Report the failure and keep the batch so it can be re-run
            print(f"Request failed with status code: {response.status_code} in batch: {batch_number}")
            wos_unresolved_pmid.append(batch)

        # Short pause between requests
        time.sleep(0.20)

    return [wos_covered_pmid,wos_unresolved_pmid]



In [None]:
"""
Break the list of PubMed IDs (PMIDs) to search into batches of a maximum of 50 items in each batch.
The limit per page for Web of Science is 50 items at a time.

Input:
    cut: maximum number items in a batch
"""
# Maximum number of PubMedIDs sent in one query batch.
wos_cut= 50

# Collect the PubMedID of every Unionlist row whose PubMedID is not the empty string.
check_pmid_in_zoorec_= unionlist[unionlist['PubMedID']!='']['PubMedID'].tolist() # Check the entire Unionlist items that has PubMedID

# De-duplicate the PubMedIDs; going through set() does not keep the original order.
check_pmid_in_zoorec = list(set(check_pmid_in_zoorec_))

# Split the unique PubMedIDs into batches of at most wos_cut items.
check_pmid_in_zoorec_batches = batch_items(check_pmid_in_zoorec, wos_cut)

# Summary of what will be searched (before and after de-duplication).
print(f'The total items (PubMedID) to search in ZOOREC is {len(check_pmid_in_zoorec_)}')
print(f'The total items (unique PubMedIDs) to search in ZOOREC is {len(check_pmid_in_zoorec)}, which are divided into {len(check_pmid_in_zoorec_batches)} batches')
print(f'The items list is divided into lists in which each list contains {wos_cut} records maximum')

In [None]:
"""
Check PMIDs Coverage in Zoological Record
"""
start = time.time()
zoorec_results= wos_zoorec_check_DOI(check_pmid_in_zoorec_batches[:])
end = time.time()
end - start

In [None]:
"""
Processing the retrieved covered items in Zoological Record Information

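Select the Unionlist rows whose PubMedID was returned by Zoological Record.
The whole Unionlist is checked; no filtering by source is applied here.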
"""

zoorec_coveredInDOI_tempo= zoorec_results[0]

zoorec_notin_unionlist= unionlist[unionlist['PubMedID'].isin(zoorec_coveredInDOI_tempo)]

zoorec_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/zoorec_coverednotindexed_'+date_coverage['zoorec']+'.csv')

In [None]:
# PMIDs returned by Zoological Record that match no Unionlist row.
# The returned values are PubMedIDs, so they are compared with the 'PubMedID' column (not 'DOI').
set(zoorec_coveredInDOI_tempo) - set(zoorec_notin_unionlist['PubMedID'])

In [None]:
# Number of PMIDs returned by Zoological Record that match no Unionlist row.
# The returned values are PubMedIDs, so they are compared with the 'PubMedID' column (not 'DOI').
len(set(zoorec_coveredInDOI_tempo) - set(zoorec_notin_unionlist['PubMedID']))

# Crossref

In [None]:
from crossref.restful import Works, Etiquette

# Format for Etiquette
# given_etiquette = Etiquette('My Project Name', 'My Project version', 'My Project URL', 'My contact email')

my_etiquette = Etiquette('Retraction Indexing Assessment', 'version2', 'no url', my_email)
works = Works(etiquette=my_etiquette)

In [None]:
"""
Get list of DOIs to search 

Input:
    crossrefDOI_notindexed
Saved_file:
    crossref_coverednotindexed_<date>.csv
"""

check_doi_in_crossref= get_DOIs(crossrefDOI_notindexed) 
print(f'The total items to search in Crossref is {len(check_doi_in_crossref)} ')


In [None]:
"""
Check DOI Coverage in Crossref
"""

# Use the etiquette configured in the cell above instead of an anonymous client
new = Works(etiquette=my_etiquette)
start = time.time()

crossref_covered_doi=[]
# DOIs whose lookup raised an error, kept so they can be re-checked later
crossref_unresolved_doi=[]

# The DOI list is checked in cycles; change this offset to choose where the current cycle starts
crossref_first_index = 42000

for i in tqdm(check_doi_in_crossref[crossref_first_index:]):
    try:
        for j in new.filter(doi = i).select('DOI'):
            find = j['DOI']
            # DOIs are case-insensitive, so compare them in lowercase
            if i.lower() == find.lower():
                crossref_covered_doi.append(i)

    except Exception:
        # Best effort: do not stop the run, but remember which DOI failed
        crossref_unresolved_doi.append(i)
    time.sleep(0.10)


end = time.time()
end - start

In [None]:
"""
You can break the checking of the DOI coverage into cycles
"""
# crossref_covered_doi_tempo1= crossref_covered_doi.copy() # [:15000], changed csv filename to 1_crossref_coverednotindexed_2024-08-06
# crossref_covered_doi_tempo2= crossref_covered_doi.copy() # [15000:30000], changed csv filename to 2_crossref_coverednotindexed_2024-08-06
# crossref_covered_doi_tempo3= crossref_covered_doi.copy() # [30000:35000], changed csv filename to 3_crossref_coverednotindexed_2024-08-06
# crossref_covered_doi_tempo4= crossref_covered_doi.copy() # [35000:42000], changed csv filename to 4_crossref_coverednotindexed_2024-08-06
# crossref_covered_doi_tempo5= crossref_covered_doi.copy() # [42000:], changed csv filename to 5_crossref_coverednotindexed_2024-08-06

In [None]:
len(crossref_covered_doi_tempo1)

In [None]:
len(crossref_covered_doi_tempo2)

In [None]:
len(crossref_covered_doi_tempo3)

In [None]:
len(crossref_covered_doi_tempo4)

In [None]:
len(crossref_covered_doi_tempo5)

In [None]:
"""
Processing the retrieved covered items in Crossref
"""
# Getting the DOIs of the matched items in Crossref

crossref_coveredInDOI_tempo= crossref_covered_doi_tempo1 + crossref_covered_doi_tempo2 + crossref_covered_doi_tempo3 + crossref_covered_doi_tempo4 + crossref_covered_doi_tempo5 
crossref_coveredInDOI_tempo=  [x.lower() for x in crossref_coveredInDOI_tempo]

crossref_notin_unionlist= unionlist[unionlist['DOI'].isin(crossref_coveredInDOI_tempo) & \
          (~unionlist['source'].str.contains('Crossref'))]

crossref_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/crossref_coverednotindexed_'+date_coverage['crossref']+'.csv')

In [None]:
set(crossref_coveredInDOI_tempo) - set(crossref_notin_unionlist['DOI'])

In [None]:
len(set(crossref_coveredInDOI_tempo) - set(crossref_notin_unionlist['DOI']))

In [None]:
unionlist[unionlist['source'].str.contains('Crossref')]

# ScienceDirect

In [None]:
"""
Input:
    cut: maximum number items in a batch
"""
scidirect_cut= 50
check_doi_in_sciencedirect= get_DOIs(unionlist) # get valid DOIs

check_doi_in_sciencedirect_batches = batch_items(check_doi_in_sciencedirect, scidirect_cut)

print(f'The total items to search in ScienceDirect is {len(check_doi_in_sciencedirect)}, which are divided into {len(check_doi_in_sciencedirect_batches)} batches')
print(f'The items list is divided into lists in which each list contains {scidirect_cut} records maximum')

In [None]:
def check_DOI_ScienceDirect(dois_list:list):
    """
    Check which DOIs of one batch exist in ScienceDirect.

    The batch is sent to the Elsevier article metadata endpoint as one
    'DOI(...) OR DOI(...)' query. Credentials are read from the notebook-level
    variables `elsevier_api_key` and `elsevier_insttoken`.

    :param dois_list: DOIs to check their coverage (one batch, one request)

    :return: list of lists [covered DOIs, unresolved DOIs]
        - covered DOIs: the 'prism:doi' value of every returned entry that has one
        - unresolved DOIs: all DOIs of the batch when the request did not return status 200

    Reference:
    - https://dev.elsevier.com/documentation/ArticleMetadataAPI.wadl
    - https://dev.elsevier.com/sd_article_meta_tips.html
    """
    store_result= []
    unresolved_doi= []

    base_url= 'https://api.elsevier.com/content/metadata/article'
    headers = {
        'X-ELS-APIKey': elsevier_api_key,
        'Accept': 'application/json',
        'X-ELS-Insttoken':elsevier_insttoken}

    formatted_dois= ' OR '.join(f'DOI({doi})' for doi in dois_list)
    params= {"query": formatted_dois,
             "start": 0,
            "count":50}  # number of result to return

    response = requests.get(base_url,headers=headers, params = params)

    if response.status_code != 200:
        # Request failed: report the whole batch so it can be re-run
        unresolved_doi.extend(dois_list)
        return [store_result,unresolved_doi]

    results= response.json()
    totalresult= int(results['search-results'].get('opensearch:totalResults',0))

    if totalresult > 0:
        try:
            entries = results['search-results']['entry']
        except KeyError:
            # Hits were reported but no entries were returned: nothing to collect
            entries = []

        for result in entries:
            try:
                store_result.append(result['prism:doi'])
            except KeyError:
                # Entry without a DOI is skipped
                continue

    return [store_result,unresolved_doi]

In [None]:
# ScienceDirect DOI coverage Testing

sciencedirect_dois_testing=['10.6061/clinics/2017(05)07','10.1002/jps.21888','10.1006/bbrc.1995.1675',
                            '10.1061/(asce)0733-9399(2010)136:2(174)']
# Checking API
check_DOI_ScienceDirect(sciencedirect_dois_testing)

In [None]:
"""
Check DOI Coverage in ScienceDirect
"""

sciencedirect_covered_doi=[]
unresolved_sciencedirect_doi=[]

#check_doi_in_sciencedirect_batches[:2]
for doi_batch in tqdm(check_doi_in_sciencedirect_batches[:]):
    sciencedirect_results= check_DOI_ScienceDirect(doi_batch)
    sciencedirect_covered_doi.extend(sciencedirect_results[0])
    unresolved_sciencedirect_doi.extend(sciencedirect_results[1])
    time.sleep(0.15)

In [None]:
"""
Processing the retrieved covered items in ScienceDirect
"""

sciencedirect_coveredInDOI_tempo= sciencedirect_covered_doi

sciencedirect_coveredInDOI_tempo= [x.lower() for x in sciencedirect_coveredInDOI_tempo]

sciencedirect_notin_unionlist= unionlist[unionlist['DOI'].isin(sciencedirect_coveredInDOI_tempo)]

sciencedirect_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/sciencedirect_coverednotindexed_'+date_coverage['sciencedirect']+'.csv')

In [None]:
set(sciencedirect_coveredInDOI_tempo) - set(sciencedirect_notin_unionlist['DOI'])

# IEEE

In [None]:
def extract_metadata_ieee(results:list)->list:
    """
    Pull a fixed set of fields out of each record returned by the IEEE Xplore API.

    :param results: records returned from IEEE Xplore (one mapping per article)

    :return store: one list per record, holding in this order the values of
        doi, title, publication_year, authors, content_type, publication_date,
        publication_title and index_terms; a missing field becomes ''
    """
    # Output column order -> key read from each IEEE Xplore record
    field_keys = (
        'doi',
        'title',
        'publication_year',
        'authors',
        'content_type',
        'publication_date',
        'publication_title',
        'index_terms',
    )

    return [[record.get(key, '') for key in field_keys] for record in results]

In [None]:
"""
Input:
ieee_cut: indicate the numbers to items in a batch; usually is the maximum number the items the API will 
          return. 

Output:
all_result_ieee: list of records extracted from the metadata

NB. Search the DOI in batches of 100 DOIs dataset.
"""

ieee_cut = 100 # Number of items in a batch

check_doi_in_ieee =  get_DOIs(unionlist) # get valid DOIs # Check entire items with DOI


check_doi_in_ieee_batches = batch_items(check_doi_in_ieee, ieee_cut)


print(f'The total items to search in IEEE is {len(check_doi_in_ieee)}, which are divided into {len(check_doi_in_ieee_batches)} batches')
print(f'The items list is divided into batches in which each list contains {ieee_cut} records maximum')

In [None]:
def check_DOI_IEEE(ieee_dois: list):
    """
    Look up a batch of DOIs in IEEE Xplore with a single API request.

    :param ieee_dois: list of DOI strings to search. A single DOI string is also
                      accepted and is treated as a one-item batch.

    :return: [ieee_all_results, ieee_error_dois] where
             - ieee_all_results is the list of rows built by extract_metadata_ieee()
               for the records found;
             - ieee_error_dois contains `ieee_dois` (as one entry) when the request
               did not return status 200 or the response could not be processed,
               and is empty otherwise.

    :raises JSONDecodeError: when a status-200 response body is not valid JSON
                             (raised by response.json()). It is deliberately not
                             caught here so the calling loop can record the batch.
    """
    limit = 100  # Maximum number of results to retrieve per request

    ieee_all_results=[]
    ieee_error_dois=[]

    url = "https://ieeexploreapi.ieee.org/api/v1/search/articles?"

    # str.join on a bare string would put ' OR ' between its characters,
    # so a single DOI string is wrapped into a one-item list first.
    doi_terms = [ieee_dois] if isinstance(ieee_dois, str) else ieee_dois
    formatted_dois = ' OR '.join(doi_terms)

    params = {
            "max_records": limit,
            "doi": formatted_dois,
            "apiKey": ieee_xplore_api_key}

    response = requests.get(url, params=params)

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        ieee_error_dois.append(ieee_dois)
        return [ieee_all_results, ieee_error_dois]

    page_result = response.json()

    total_records = page_result.get('total_records') if isinstance(page_result, dict) else None

    if total_records is None:
        # A 200 response without a record count cannot be read as "not covered";
        # keep the batch so it can be retried.
        print("Unexpected IEEE response (no 'total_records'); batch recorded as unresolved")
        ieee_error_dois.append(ieee_dois)
    elif total_records > 0:
        try:
            ieee_all_results.extend(extract_metadata_ieee(page_result['articles']))
        except (KeyError, TypeError, AttributeError) as err:
            # Report the problem and keep the batch for a retry instead of dropping it silently
            print(f"Could not process IEEE response: {err!r}; batch recorded as unresolved")
            ieee_error_dois.append(ieee_dois)

    return [ieee_all_results, ieee_error_dois]


In [None]:
"""
Check for 2 DOIs that are in IEEE, see what working results look like
"""
check_DOI_IEEE(['10.1093/gji/ggt223', '10.1093/imaman/dpw003'])

In [None]:
import json 
from json.decoder import JSONDecodeError

In [None]:
"""
Check DOI Coverage in IEEE
"""

# Half-open range [ieee_batch_start, ieee_batch_stop) of batch indices run in this pass;
# change these two values to choose the batches that are run.
ieee_batch_start = 799
ieee_batch_stop = 849

ieee_covered_doi=[]
unresolved_ieee_doi=[]
error_batch_list=[]

# Never index past the last batch, whatever stop value is configured above
last_batch = min(ieee_batch_stop, len(check_doi_in_ieee_batches))

for i in tqdm(range(ieee_batch_start, last_batch)):
    doi_batch = check_doi_in_ieee_batches[i]
    try:
        ieee_results= check_DOI_IEEE(doi_batch)
        ieee_covered_doi.extend(ieee_results[0])
        unresolved_ieee_doi.extend(ieee_results[1])
    except JSONDecodeError:
        # The response body was not JSON; remember the batch index so it can be re-run
        print(f'JSONDecodeError at batch {i}; this batch is skipped')
        error_batch_list.append(i)
    # Pause after every request, including failed ones, before sending the next
    time.sleep(0.15)
   

In [None]:
# for i in tqdm(range(0,150))
# ieee_covered_doi_tempo1 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo1 = unresolved_ieee_doi.copy()
# error_batch_list_tempo1 = error_batch_list.copy()

# for i in tqdm(range(150,200))
# ieee_covered_doi_tempo2 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo2 = unresolved_ieee_doi.copy()
# error_batch_list_tempo2 = error_batch_list.copy()

# for i in tqdm(range(200,300))
# ieee_covered_doi_tempo3 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo3 = unresolved_ieee_doi.copy()
# error_batch_list_tempo3 = error_batch_list.copy()

# for i in tqdm(range(300,400))
# ieee_covered_doi_tempo4 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo4 = unresolved_ieee_doi.copy()
# error_batch_list_tempo4 = error_batch_list.copy()

# for i in tqdm(range(400,600))
# ieee_covered_doi_tempo5 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo5 = unresolved_ieee_doi.copy()
# error_batch_list_tempo5 = error_batch_list.copy()

# for i in tqdm(range(599,700))
# ieee_covered_doi_tempo6 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo6 = unresolved_ieee_doi.copy()
# error_batch_list_tempo6 = error_batch_list.copy()

# for i in tqdm(range(700,799)) 
# ieee_covered_doi_tempo7 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo7 = unresolved_ieee_doi.copy()
# error_batch_list_tempo7 = error_batch_list.copy()

# for i in tqdm(range(799,849))
# ieee_covered_doi_tempo8 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo8 = unresolved_ieee_doi.copy()
# error_batch_list_tempo8 = error_batch_list.copy()

Viewing error and unresolved results of each batch

In [None]:
unresolved_ieee_doi_tempo8

In [None]:
error_batch_list_tempo8

In [None]:
unresolved_ieee_doi_tempo7

In [None]:
error_batch_list_tempo7

In [None]:
unresolved_ieee_doi_tempo6 # Batch index 684. Include with unit testing.

In [None]:
error_batch_list_tempo6

In [None]:
unresolved_ieee_doi_tempo5 # Batch index 599, failed due to API 200-request limit. Reran with batch 6. Can also unit test.

In [None]:
error_batch_list_tempo5

In [None]:
unresolved_ieee_doi_tempo4

In [None]:
error_batch_list_tempo4

In [None]:
unresolved_ieee_doi_tempo3

In [None]:
error_batch_list_tempo3

In [None]:
unresolved_ieee_doi_tempo2

In [None]:
error_batch_list_tempo2

In [None]:
unresolved_ieee_doi_tempo1

In [None]:
error_batch_list_tempo1

In [None]:
"""
Sample of JSONDecodeError
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:971, in Response.json(self, **kwargs)
    970 try:
--> 971     return complexjson.loads(self.text, **kwargs)
    972 except JSONDecodeError as e:
    973     # Catch JSON-related errors and raise as requests.JSONDecodeError
    974     # This aliases json.JSONDecodeError and simplejson.JSONDecodeError

File C:\ProgramData\anaconda3\Lib\json\__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:

File C:\ProgramData\anaconda3\Lib\json\decoder.py:337, in JSONDecoder.decode(self, s, _w)
    333 "Return the Python representation of ``s`` (a ``str`` instance
    334 containing a JSON document).
    335 
    336 "
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338 end = _w(s, end).end()

File C:\ProgramData\anaconda3\Lib\json\decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
    354 except StopIteration as err:
--> 355     raise JSONDecodeError("Expecting value", s, err.value) from None
    356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

JSONDecodeError                           Traceback (most recent call last)
Cell In[88], line 10
      8 #check_doi_in_ieee_batches[:2]
      9 for doi_batch in tqdm(check_doi_in_ieee_batches[:200]): # 
---> 10     ieee_results= check_DOI_IEEE(doi_batch)
     11     ieee_covered_doi.extend(ieee_results[0])
     12     unresolved_ieee_doi.extend(ieee_results[1])

Cell In[82], line 23, in check_DOI_IEEE(ieee_dois)
     20     response = requests.get(url, params=params)
     22     if response.status_code == 200:
---> 23         page_result = response.json()
     24 #         print('Yes')
     25 #         print(page_result)
     27         if page_result['total_records']> 0:

File C:\ProgramData\anaconda3\Lib\site-packages\requests\models.py:975, in Response.json(self, **kwargs)
    971     return complexjson.loads(self.text, **kwargs)
    972 except JSONDecodeError as e:
    973     # Catch JSON-related errors and raise as requests.JSONDecodeError
    974     # This aliases json.JSONDecodeError and simplejson.JSONDecodeError
--> 975     raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

"""
pass

In [None]:
"""
Combine batches of covered DOIs
"""
# Parentheses carry the expression across lines. A backslash continuation is a
# syntax error as soon as any character (even a space) follows the backslash.
ieee_covered_doi_regular_batches = (
    ieee_covered_doi_tempo8 + ieee_covered_doi_tempo7 + ieee_covered_doi_tempo6
    + ieee_covered_doi_tempo5 + ieee_covered_doi_tempo4 + ieee_covered_doi_tempo3
    + ieee_covered_doi_tempo2 + ieee_covered_doi_tempo1
)

len(ieee_covered_doi_regular_batches)

In [None]:
"""
Combine error doi batches to complete unit testing
"""
# Indices of the batches that raised JSONDecodeError, gathered over all runs
error_batch_list = (
    error_batch_list_tempo1
    + error_batch_list_tempo2
    + error_batch_list_tempo3
    + error_batch_list_tempo4
    + error_batch_list_tempo5
    + error_batch_list_tempo6
    + error_batch_list_tempo7
    + error_batch_list_tempo8
)

# Add unresolved DOI batches for unit testing purposes
error_batch_list.extend([599, 684])

error_batch_list
 

In [None]:
"""
Test error doi batches separately. This tests each DOI within the batch individually and should only be run if 
you have been provided with increased access above the usual 200 requests/day by IEEE.
"""
# Index (in check_doi_in_ieee_batches) of the error batch re-tested in this run
error_batch_index = 684

ieee_covered_doi=[]
unresolved_ieee_doi=[]

for single_doi in tqdm(check_doi_in_ieee_batches[error_batch_index]):
    # check_DOI_IEEE joins its argument with ' OR ', so one DOI must be passed as a
    # one-item list; a bare string would be split into single characters.
    ieee_results= check_DOI_IEEE([single_doi])
    ieee_covered_doi.extend(ieee_results[0])
    unresolved_ieee_doi.extend(ieee_results[1])
    time.sleep(0.15)

In [None]:
# Batch 47
# ieee_covered_doi_tempo_47 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_47 = unresolved_ieee_doi.copy()

# Batch 55
# ieee_covered_doi_tempo_55 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_55 = unresolved_ieee_doi.copy()

# Batch 61
# ieee_covered_doi_tempo_61 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_61 = unresolved_ieee_doi.copy()

# Batch 111
# ieee_covered_doi_tempo_111 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_111 = unresolved_ieee_doi.copy()

# Batch 114
# ieee_covered_doi_tempo_114 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_114 = unresolved_ieee_doi.copy()

#  Batch 140
# ieee_covered_doi_tempo_140 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_140 = unresolved_ieee_doi.copy()

# Batch 153
# ieee_covered_doi_tempo_153 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_153 = unresolved_ieee_doi.copy()

# Batch 329
# ieee_covered_doi_tempo_329 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_329 = unresolved_ieee_doi.copy()

# Batch 339
# ieee_covered_doi_tempo_339 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_339 = unresolved_ieee_doi.copy()

# Batch 372
# ieee_covered_doi_tempo_372 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_372 = unresolved_ieee_doi.copy()

# Batch 429
# ieee_covered_doi_tempo_429 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_429 = unresolved_ieee_doi.copy()

# Batch 451
# ieee_covered_doi_tempo_451 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_451 = unresolved_ieee_doi.copy()

# Batch 745
# ieee_covered_doi_tempo_745 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_745 = unresolved_ieee_doi.copy()

# Batch 767
# ieee_covered_doi_tempo_767 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_767 = unresolved_ieee_doi.copy()

# Batch 769
# ieee_covered_doi_tempo_769 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_769= unresolved_ieee_doi.copy()

# Batch 599
# ieee_covered_doi_tempo_599 = ieee_covered_doi.copy()
# unresolved_ieee_doi_tempo_599 = unresolved_ieee_doi.copy()

# Batch 684
ieee_covered_doi_tempo_684 = ieee_covered_doi.copy()
unresolved_ieee_doi_tempo_684 = unresolved_ieee_doi.copy()

In [None]:
"""
Combine error batch covered DOIs
"""
# Concatenate the rows found for each individually re-tested error batch
error_batch_covered_doi = (
    ieee_covered_doi_tempo_47
    + ieee_covered_doi_tempo_55
    + ieee_covered_doi_tempo_61
    + ieee_covered_doi_tempo_111
    + ieee_covered_doi_tempo_114
    + ieee_covered_doi_tempo_140
    + ieee_covered_doi_tempo_153
    + ieee_covered_doi_tempo_329
    + ieee_covered_doi_tempo_339
    + ieee_covered_doi_tempo_372
    + ieee_covered_doi_tempo_429
    + ieee_covered_doi_tempo_451
    + ieee_covered_doi_tempo_745
    + ieee_covered_doi_tempo_767
    + ieee_covered_doi_tempo_769
    + ieee_covered_doi_tempo_599
    + ieee_covered_doi_tempo_684
)

error_batch_covered_doi

In [None]:
"""
Combine regular batching results with error batch results
"""
ieee_covered_doi_all = ieee_covered_doi_regular_batches + error_batch_covered_doi 
len(ieee_covered_doi_all)

In [None]:
"""
Processing the retrieved covered items in IEEE. 
"""
# Getting the DOIs of the covered items: the DOI is the first field of every row
# built by extract_metadata_ieee(). A plain comprehension also works when no record
# was found (slicing column 0 of an empty array raises IndexError).
ieee_coveredInDOI = [record[0].lower() for record in ieee_covered_doi_all]

# Rows of the union list whose DOI was found in IEEE
ieee_notin_unionlist = unionlist[unionlist['DOI'].isin(ieee_coveredInDOI)]

ieee_notin_unionlist \
#    .to_csv(data_dir+'coverednotindexed/ieee_coverednotindexed_'+date_coverage['ieee']+'.csv')

In [None]:
len(set(ieee_coveredInDOI) - set(ieee_notin_unionlist['DOI']))

## Astrophysics Data System (ADS)

In [None]:
def format_dois_ads(dois_list: list)->str:
    """
    Build the search expression sent to the Astrophysics Data System (ADS): every DOI is
    prefixed with "doi:" and the terms are joined with " OR ", e.g.
    "doi:10.1016/j.optlastec.2023.109186 OR doi:10.1016/j.jqsrt.2023.108735"

    'dois_list:' list of DOIs
    'return': query string that ADS can use for search
    """
    return " OR ".join(map("doi:{}".format, dois_list))
    

In [None]:
def get_ADS_metadata(publications: list) -> list:
    """
    It extracts data from the ADS API results

    Each publication is processed on its own, so one malformed record does not
    discard the records that follow it.

    :param publications: list of publication records (dicts) - the 'docs' of an ADS API search
    :return: List of rows [doi, authors, title, year, journal], one per publication.
             'doi' and 'title' are the first entry of the corresponding list in the
             record, or '' when the field is missing or empty; the other fields
             default to '' when missing.
    """

    def _first(value):
        # First entry of a list-valued field; '' for a missing or empty field
        if isinstance(value, (list, tuple)):
            return value[0] if value else ''
        return '' if value is None else value

    results = []

    for pub in publications or []:
        try:
            results.append([
                _first(pub.get('doi')),
                pub.get('author_norm', ''),
                _first(pub.get('title')),
                pub.get('year', ''),
                pub.get('pub', ''),
            ])
        except AttributeError as e:
            # The record is not a dict-like object: report it and move on to the next one
            print(f"Error in get_ADS_metadata: {e}")

    return results

In [None]:
def ADS_search_DOIs(dois_list: list)->list:  
    """
    It searches DOIs from ADS and uses 
    i. 'format_dois_ads()' to process the DOIs into useable format
    ii. 'get_ADS_metadata()' to extract data from the metadata of found items
    
    dois_list: list of DOIs to search via ADS API
    
    
    :return: (results, ads_catched_errors) where results is the list of records found and
             ads_catched_errors contains `dois_list` (as one entry) when the request failed,
             returned a status other than 200, or returned a body that could not be read;
             it is empty otherwise
    
    
    Resources:
    ---------
    https://github.com/adsabs/adsabs-dev-api#access-settings
    [5k limit]: https://github.com/adsabs/adsabs-dev-api/tree/master#access 
    https://ui.adsabs.harvard.edu/help/api/api-docs.html#get-/search/query

    """
    ads_catched_errors=[]
    results=[]
    
    url = 'https://api.adsabs.harvard.edu/v1/search/query'
    headers = {'Authorization':f'Bearer:{ads_api_key}'} 
    params = {'q': format_dois_ads(dois_list), 
              'rows':200, # number of result on a page. Can take upto 1000
              'fl': 'doi,title, author_norm, pubdate,pub, year'} #bibcode, keyword_norm, keyword_schema 

    request_timeout = 60  # seconds; without it a stalled connection blocks the run forever

    try:
        response = requests.get(url, headers=headers, params=urlencode(params), # encode the parameter
                                timeout=request_timeout)
    except requests.exceptions.RequestException as err:
        # Network problem or timeout: keep the batch so it can be retried later
        ads_catched_errors.append(dois_list)
        print(f"Error: request failed, {err}")
        return results, ads_catched_errors

    if response.status_code != 200:
        ads_catched_errors.append(dois_list)
        print(f"Error: {response.status_code}, {response.text}") 
        return results, ads_catched_errors

    try:
        publications = response.json()['response']['docs']
    except (ValueError, KeyError, TypeError) as err:
        # Body is not JSON or lacks the expected structure: keep the batch for a retry
        ads_catched_errors.append(dois_list)
        print(f"Error: unreadable ADS response, {err!r}")
        return results, ads_catched_errors

    results = get_ADS_metadata(publications)
    
    return results, ads_catched_errors    

In [None]:
"""
Input:
ads_cut: the number of items in a batch; usually the maximum number of items the API will
          return.
check_doi_in_ads: the list of DOIs to search/check coverage via the ADS API

Output:
all_result_ads: list of records extracted from the metadata

NB. Search the DOI in batches of 50 DOIs dataset. Though it can contain up to 5k items on its result page, 
it will sometimes error and say the request is too large.
"""

# get_DOIs is defined elsewhere in the notebook; presumably it returns the valid DOIs
# of the union list -- verify against its definition.
check_doi_in_ads=  get_DOIs(unionlist) # Check the entire unionlist valid DOIs

# Batch size: each batch is sent to ADS as one query by the round-1 loop.
ads_cut = 50 # Number of items in a batch

# batch_items is defined elsewhere in the notebook; presumably it splits the DOI list into
# chunks of at most ads_cut items -- verify.
doi_batch_ads = batch_items(check_doi_in_ads, ads_cut) # divides dois into batches

# Summary of the amount of work: number of DOIs and number of batches
print(f'The total items to search in ADS is {len(check_doi_in_ads)}, which are divided into {len(doi_batch_ads)} batches')
print(f'The items list is divided into lists in which each list contains {ads_cut} records maximum')

In [None]:
"""
Checking coverage of DOIs of the Unionlist via ADS API. Catch error batches for individual processing.
"""

# Accumulators for this pass: batches reported as failed, and rows found
ads_catch_errors_round_1=[]
ads_all_results_round_1= []
start = time.time()  # start time, used for the elapsed seconds computed at the end of the cell


for batch in tqdm(doi_batch_ads):
    #print(doi_batch_ads[batch])
    # ADS_search_DOIs returns the rows found and, on failure, the batch that failed
    results, errors = ADS_search_DOIs(batch)
    
    ads_all_results_round_1.extend(results)
    # A failed batch is stored whole (one entry per batch); it is flattened in a later cell
    ads_catch_errors_round_1.extend(errors)

end = time.time()
# Elapsed seconds for the whole pass (shown as the cell output)
end - start  

In [None]:
"""
Make flat list of DOIs from batches that had syntax error
"""
# Unpack every failed batch into one flat list of DOIs
flat_list_ads_catch_errors = []
for failed_batch in ads_catch_errors_round_1:
    flat_list_ads_catch_errors.extend(failed_batch)
flat_list_ads_catch_errors

In [None]:
len(flat_list_ads_catch_errors)

In [None]:
def ADS_search_single_DOI(doi):
    """
    Search for single DOI using ADS API.

    The search is delegated to ADS_search_DOIs() with a one-item batch, so the request
    parameters and the response handling live in one place.

    param: doi: string DOI to search
    :return: (results, ads_catched_errors) where results is the list of records found and
             ads_catched_errors is [doi] when the search failed, [] otherwise
    """
    results, failed_batches = ADS_search_DOIs([doi])

    # ADS_search_DOIs reports a failure as the whole batch; report the bare DOI instead
    ads_catched_errors = [doi] if failed_batches else []

    return results, ads_catched_errors

In [None]:
"""
Small scale test of item that is confirmed in ADS.
"""
doi_test= '10.1016/j.dark.2024.101586'
result_test, error_test = ADS_search_single_DOI(doi_test)
print(result_test)
print(error_test)

In [None]:
"""
Use API to search for DOIs from error batches individually
"""
# Accumulators for this pass: DOIs that still fail, and rows found
ads_catch_errors_round_2=[]
ads_all_results_round_2= []
start = time.time()  # start time, used for the elapsed seconds computed at the end of the cell


# One request per DOI taken from the batches that failed in round 1
for doi in tqdm(flat_list_ads_catch_errors):
    results, errors= ADS_search_single_DOI(doi)
    
    ads_all_results_round_2.extend(results)
    ads_catch_errors_round_2.extend(errors)

end = time.time()
# Elapsed seconds for the whole pass (shown as the cell output)
end - start  

In [None]:
len(ads_all_results_round_1)

In [None]:
len(ads_all_results_round_2)

In [None]:
"""
Combine results lists from round 1 and round 2
"""
ads_all_results = ads_all_results_round_1 + ads_all_results_round_2

In [None]:
len(ads_all_results)

In [None]:
"""
Processing the retrieved covered items in ADS.
"""
# Getting the DOIs of the covered items: the DOI is the first field of every row
# built by get_ADS_metadata(). A plain comprehension also works when no record
# was found (slicing column 0 of an empty array raises IndexError).
ads_coveredInDOI = [record[0].lower() for record in ads_all_results]

# Filtering the DOIs that are covered but not indexed as retracted in the source 
ads_notin_unionlist = unionlist[unionlist['DOI'].isin(ads_coveredInDOI)]

ads_notin_unionlist \
#     .to_csv(data_dir+'coverednotindexed/ads_coverednotindexed_'+date_coverage['ads']+'.csv')

In [None]:
set(ads_coveredInDOI) - set(ads_notin_unionlist['DOI'])

In [None]:
"""
DOIs parsing error
10.1002/1096-9098(200007)74:3201::AID-JSO83.0.CO;2-5  #(worked when checked on DOI.org)
10.1002/1098-1136(200012)32:3<247::AID-GLIA50>3.0.CO;2-T #(worked when checked on DOI.org)
10.1002/(sici)1097-0215(19980330)76:1<154::aid-ijc24>3.0.co;2-b  #(worked when checked on DOI.org)
10.3758/s13423-018-1505-y  #(worked when checked on DOI.org)

DOI encoding error: 
# '10.1002/1521-396X(200207)192:1<212::AID-PSSA212>3.0.CO;2-B' (WoS: worked with doi.org)
#'10.1002/1521-396X(200207)192:1212::AID-PSSA2123.0.CO;2-B' (RW: not worked with doi.org)

Data Entry Error: This is a generic term for any mistake made during the manual entry of data. 
It can include typos, incorrect values, or entering data in the wrong format (e.g. 10.7705/biomedica.v38i0.3546).
#10.3758/s13423-017-1376-7
"""
