# Bibliography Search 

**Author:** Jack Galbraith-Edge

In [2]:
# Import notebook setup
from msc_code.scripts.notebook_setup import *

In [3]:
# Import Title and Abstract Screening Results
title_abstract_screened = pd.read_csv("/".join([PROC_DATA_DIR, "title_abstract_review", "title_abstract_review_complete.csv"]))

title_abstract_screened['TA_Exclude_FINAL'] = title_abstract_screened.apply(
    lambda row: row['Exclude_JGE']
    if pd.isna(row['Exclude_MS'])  # If MS is NaN, take JGE's decision
    else (row['Exclude_JGE']
          if row['Exclude_JGE'] == row['Exclude_MS']  # If JGE and MS agree
          else row['Exclude_GC']),  # If JGE and MS disagree, take GC's decision
    axis=1
)

title_abstract_screened = title_abstract_screened.drop('Unnamed: 0', axis=1)

title_abstract_screened = title_abstract_screened.rename(columns={
    'Exclude_JGE': 'TA_Exclude_JGE',
    'Reason ID_JGE': 'TA_Reason ID_JGE',
    'Exclude_MS': 'TA_Exclude_MS',
    'Reason ID_MS': 'TA_Reason ID_MS',
    'Exclude_GC': 'TA_Exclude_GC',
    'Reason ID_GC': 'TA_Reason ID_GC', 
    'Review_GC': 'TA_Review_GC',
    'Comments_GC': 'TA_Comments_GC'
})

jge_full_text_screened = pd.read_csv(
    "/".join([RAW_DATA_DIR, "full_text_screen", "full_text_screen_end_jge.csv"])
)

jge_full_text_screened = jge_full_text_screened.rename(columns={
    'Exclude': 'FT_Exclude_JGE',
    'Reason ID': 'FT_Reason ID_JGE',
    'Paediatric': 'FT_Paediatric_JGE',
    'Intention Reported': 'FT_Intention Reported_JGE',
    'Deliberate intention': 'FT_Deliberate Intention_JGE',
    'Unclear': 'FT_Unclear_JGE',
    'Accessed': 'FT_Accessed_JGE',
    'Comments': 'FT_Comments_JGE'
})

# Select certain columns from results that have undergone fulltext screening by JGE.
jge_full_text_screened = jge_full_text_screened[['id', 'FT_Exclude_JGE', 'FT_Reason ID_JGE', 'FT_Paediatric_JGE',
       'FT_Intention Reported_JGE', 'FT_Deliberate Intention_JGE',
       'FT_Unclear_JGE', 'FT_Accessed_JGE', 'FT_Comments_JGE']]

# Merge dataframes
screening_results = pd.merge(
    title_abstract_screened,
    jge_full_text_screened,
    how='left', 
    on='id'
)

# Create dataframe of all results that have been excluded so far
excluded_results = screening_results[(screening_results['TA_Exclude_FINAL'] == True) | (screening_results['FT_Exclude_JGE'] == True)]

# Export to CSV
excluded_results.to_csv("/".join([PROC_DATA_DIR, 'bibliography_search', 'excluded_results_jge_reviewed.csv']), index=False)

# Export results of all screening so far
screening_results.to_csv("/".join([PROC_DATA_DIR, "bibliography_search", "ft_results_jge_historical.csv"]), index=False)

In [4]:
# Import Zotero Data on included results.
jge_zotero_included = pd.read_csv("/".join([RAW_DATA_DIR, "full_text_screen", "full_text_zotero_included_jge.csv"]))

# Select 'Publication Year', 'Title', 'Publication Title' and 'DOI' columns from Zotero Data.
jge_zotero_included = jge_zotero_included[['Publication Year', 'Title', 'Publication Title', 'DOI']]

jge_screened_df = pd.read_csv(
    "/".join([RAW_DATA_DIR, "full_text_screen", "full_text_screen_end_jge.csv"])
)

jge_included = jge_screened_df[jge_screened_df['Exclude'] == False]

# Merge Zotero dataframe with Excel Reviewed CSV to add DOI's to Excel data.
jge_included = pd.merge(
    jge_included,
    jge_zotero_included,
    how='left',
    on=['Publication Year', 'Title', 'Publication Title']
)

# Create dataframe of results where there is no DOI.
jge_included_no_doi = jge_included[jge_included['DOI'].isna()]

# Count number of results with no DOI
jge_included_no_doi_count = len(jge_included_no_doi)

# Output number of results with no DOI
print(f"There are {jge_included_no_doi_count} results without DOI's recorded.")

jge_included['id'] = jge_included['id'].astype(int)  # Ensure id column is an integer

# Search internet for missing DOIs manually using browser. Update them here manually.
jge_included.loc[jge_included['id'] == 428, "DOI"] = "10.52916/jmrs244144"
jge_included.loc[jge_included['id'] == 484, "DOI"] = "10.14744/less.2023.45403"

# Create dataframe where DOI is NaN (blank)
jge_included_no_doi = jge_included[jge_included['DOI'].isna()]

# Count incidence where DOI is NaN
jge_included_no_doi_count = len(jge_included_no_doi)

# Output DOI NaN count
print(f"There are now {jge_included_no_doi_count} results without DOI's recorded.")

# Create list of included DOIs
jge_included_doi_list = jge_included['DOI'].tolist()

# Lists to store valid and invalid DOIs
valid_dois = {}
invalid_dois = []

# Check each DOI is valid
for doi in jge_included_doi_list:
    is_valid, title = check_doi_valid(doi)
    if is_valid:
        valid_dois[doi] = title
    else:
        invalid_dois.append(doi)
    # Pause to be polite to the API and avoid rate-limiting
    time.sleep(1)

# Print results
print("Valid DOIs and their titles:")
for doi, title in valid_dois.items():
    print(f"{doi}: {title}")

print("\nInvalid DOIs:")
for doi in invalid_dois:
    print(doi)

# Write list of valud DOIs to CSV for reference later
with open("/".join([PROC_DATA_DIR, 'bibliography_search', 'jge_included_valid_dois.csv']), 'w', newline='') as f:
    writer = csv.writer(f)
    for item in valid_dois:
        writer.writerow([item])  # Writing all items in one row

# Create a new column 'valid DOI' based on whether the DOI is in the valid_dois list
jge_included['valid DOI'] = jge_included['DOI'].apply(lambda x: x in valid_dois)

There are 3 results without DOI's recorded.
There are now 1 results without DOI's recorded.
DOI nan returned status code: 429
DOI 10.1097/00019509-199802000-00016 returned status code: 404
DOI 10.12809/eaapl81732 returned status code: 404
Valid DOIs and their titles:
10.1056/NEJM188612161152403: A Case of Gastrotomy. Digital Exploration of Œsophagus, and Removal of Plate of Teeth; Recovery
10.1007/bf02943095: Deliberate ingestion of foreign bodies by institutionalised psychiatric hospital patients and prison inmates
10.1001/archsurg.1996.01430140056015: Gastrointestinal 'crosses'. A new shade from an old palette.
10.1136/emj.14.1.54: Oesophageal "cross"--a sinister foreign body.
10.1007/s003830050492: A bizarre bezoar: case report and review of the literature
10.1136/gut.2003.024810: Management of swallowed razor blades—retrieve or wait and see?
10.1016/j.jemermed.2004.03.013: A penny for your thoughts: small bowel obstruction secondary to coin ingestion.
10.4314/wajm.v25i3.28286: Fore

In [5]:
cr = Crossref()

def get_references(doi):
    """Fetch references (backward citation) for a given DOI."""
    try:
        result = cr.works(ids=doi)
        return result["message"].get("reference", [])
    except Exception as e:
        print(f"Error retrieving {doi}: {e}")
        return []

reference_list = []   # To accumulate reference dictionaries
no_reference_list = []  # To store DOIs that returned no references or errors

# Loop through each DOI in jge_included_doi_list
for doi in jge_included_doi_list:
    try:
        refs = get_references(doi)
        if refs:
            for ref in refs:
                # Extract the fields you need
                extracted_ref = {
                    "DOI": ref.get('DOI', 'No DOI provided'),
                    "author": ref.get('author', 'No Author provided'),
                    "year": ref.get('year', 'No Year provided'),
                    "journal-title": ref.get('journal-title', 'Blank'),
                    "first-page": ref.get('first-page', 'Blank'),
                    "volume": ref.get('volume', 'Blank'),
                    "unstructured": ref.get('unstructured', 'Blank')
                }
                reference_list.append(extracted_ref)
        else:
            no_reference_list.append(doi)
    except MemoryError:
        print(f"Memory error encountered while retrieving DOI: {doi}")
        no_reference_list.append(doi)
    except Exception as e:
        print(f"Error retrieving {doi}: {e}")
        no_reference_list.append(doi)
    
    # Pause briefly to reduce memory load and avoid overwhelming the API
    time.sleep(2)
    # Force garbage collection
    gc.collect()

# After processing, convert your reference list to a DataFrame
refs_df = pd.DataFrame(reference_list)

# Export to CSV.
refs_df.to_csv("/".join([RAW_DATA_DIR, 'bibliography_search', 'bibliography_search.csv']))

Error retrieving nan: object of type 'float' has no len()
Error retrieving 10.12809/eaapl81732: Client error '404 Not Found' for url 'https://api.crossref.org/works/10.12809/eaapl81732'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404


In [6]:
def enrich_doi_data(df, doi_column="DOI"):
    # OpenAlex API Base URL
    base_url = "https://api.openalex.org/works/"
    data = []

    for doi in df[doi_column]:
        response = requests.get(base_url + "https://doi.org/" + doi)

        if response.status_code == 200:
            json_data = response.json()
            print(json_data)
            data.append({
                    "DOI": doi,
                    "Title": json_data.get("title", "N/A"),
                    "Authors": ", ".join([auth["author"]["display_name"] for auth in json_data.get("authorships", [])]) if "authorships" in json_data else "N/A",
                    "Journal": json_data.get("host_venue", {}).get("display_name", "N/A"),
                    "Publication Year": json_data.get("publication_year", "N/A"),
                    "Publisher": json_data.get("primary_location", {}).get("source", {}).get("publisher", "N/A"),
                    "Citations Count": json_data.get("cited_by_count", 0),
                    "Abstract": json_data.get("abstract", "N/A"),
                    "URL": json_data.get("id", "N/A")
                })
        else:
            data.append({
                "DOI": doi,
                "Title": "Not Found",
                "Authors": "N/A",
                "Journal": "N/A",
                "Publication Year": "N/A",
                "Publisher": "N/A",
                "Citations Count": "N/A",
                "Abstract": "N/A",
                "URL": "N/A"
            })

        time.sleep(1)  # Avoid API rate limits

    # Convert to DataFrame
    metadata_df = pd.DataFrame(data)

    # Merge with the original DataFrame
    df = df.merge(metadata_df, on="DOI", how="left")

    return df

refs_df = enrich_doi_data(refs_df)

{'id': 'https://openalex.org/W2157325130', 'doi': 'https://doi.org/10.1542/peds.89.4.747', 'title': 'Ingestion of Cylindrical and Button Batteries: An Analysis of 2382 Cases', 'display_name': 'Ingestion of Cylindrical and Button Batteries: An Analysis of 2382 Cases', 'publication_year': 1992, 'publication_date': '1992-04-01', 'ids': {'openalex': 'https://openalex.org/W2157325130', 'doi': 'https://doi.org/10.1542/peds.89.4.747', 'mag': '2157325130', 'pmid': 'https://pubmed.ncbi.nlm.nih.gov/1557273'}, 'language': 'en', 'primary_location': {'is_oa': False, 'landing_page_url': 'https://doi.org/10.1542/peds.89.4.747', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S77494981', 'display_name': 'PEDIATRICS', 'issn_l': '0031-4005', 'issn': ['0031-4005', '1098-4275'], 'is_oa': False, 'is_in_doaj': False, 'is_indexed_in_scopus': True, 'is_core': True, 'host_organization': 'https://openalex.org/P4310315734', 'host_organization_name': 'American Academy of Pediatrics', 'host_organization_li

AttributeError: 'NoneType' object has no attribute 'get'