In [719]:
# Bibliography Search

In [720]:
# Setup notebook
from msc_code.scripts.notebook_setup import *

In [721]:
# Raw inputs and processed outputs for this stage live in identically named
# 'bibliography_search' sub-folders of the raw and processed data directories.
input_path, output_path = (
    os.path.join(base_dir, 'bibliography_search')
    for base_dir in (RAW_DATA_DIR, PROC_DATA_DIR)
)

In [722]:
# Load the final full-text screening decisions and keep the Study_IDs of the
# rows whose Exclude_FINAL flag is False (i.e. the included studies).
import_path = os.path.join(PROC_DATA_DIR, 'full_text_screen', 'full_text_screen_end_final.csv')
ft_data = pd.read_csv(import_path)
included_ids = ft_data.loc[ft_data['Exclude_FINAL'].eq(False), 'Study_ID'].tolist()

In [723]:
# Build the blank tracking sheet for the bibliography search: one row per
# included study, with an empty 'Bib_Search_Complete' column to be filled in
# by hand.
bib_search_df = pd.DataFrame(
    {'Study_ID': included_ids},
    columns=['Study_ID', 'Bib_Search_Complete'],
)

# Write the sheet out so the manual search can start from it.
file_path = os.path.join(output_path, 'bib_search_start.csv')
bib_search_df.to_csv(file_path, index=False)

In [724]:
# Read the completed tracking workbook back in; from here on bib_search_df
# holds the reviewed sheet rather than the blank template.
file_path = os.path.join(input_path, 'bib_search_end.xlsx')
bib_search_df = pd.read_excel(io=file_path)

In [725]:
# Filter out excluded papers: keep only rows whose Study_ID is in included_ids.
# .copy() makes the result an independent frame, so the column assignment in
# the following cell does not trigger pandas' SettingWithCopyWarning.
bib_search_df = bib_search_df[bib_search_df['Study_ID'].isin(included_ids)].copy()

In [726]:
# Translate the reviewer's raw status codes into readable labels.
# Blank spreadsheet cells are read as NaN, which astype(str) renders as 'nan'.
status_map = {
    'nan': 'Awaiting Search',
    'Y': 'Search Complete',
    'N': 'For second review'
}

# Trim stray whitespace and try an exact match first (existing behaviour),
# then fall back to an upper-cased match so codes such as 'y' or ' n' are
# labelled instead of silently becoming NaN.
status_codes = bib_search_df['Bib_Search_Complete'].astype(str).str.strip()
status_labels = status_codes.map(status_map)
status_labels = status_labels.fillna(status_codes.str.upper().map(status_map))

# Report any code that still has no label so it is not lost unnoticed.
unmapped_codes = sorted(status_codes[status_labels.isna()].unique())
if unmapped_codes:
    print(f'Unrecognised Bib_Search_Complete values left unmapped: {unmapped_codes}')

# assign() returns a new frame, so no filtered slice is mutated in place.
bib_search_df = bib_search_df.assign(Bib_Search_Complete=status_labels)

In [727]:
# Tally the search statuses. dropna=False keeps a NaN row in the tally so
# studies whose status could not be mapped are visible rather than omitted.
bib_search_df['Bib_Search_Complete'].value_counts(dropna=False)

Bib_Search_Complete
Search Complete    96
Name: count, dtype: int64

In [728]:
# Import bibliography search results
import_path = os.path.join(input_path, 'bib_search_results.csv')
bib_search_df = pd.read_csv(import_path)

# Sort results by Publication Year, Author, Title (ascending) and renumber the
# rows so the Study_IDs assigned below follow that order.
bib_search_df = bib_search_df.sort_values(
    by=['Publication Year', 'Author', 'Title'], ascending=True
).reset_index(drop=True)

# Import the initial title/abstract results; they are only used here to find
# the highest Study_ID already in use.
import_path = os.path.join(PROC_DATA_DIR, "title_abstract_review", "all_results_title_abstract_start.csv")
ta_start = pd.read_csv(import_path)

# Rename id column
ta_start = ta_start.rename(columns={
    'id': 'Study_ID'
})

# First free Study_ID. int() guards against max() coming back as a float
# (e.g. when the column contains missing values), which range() would reject.
starting_index = int(ta_start['Study_ID'].max()) + 1

# Create Study_ID for items in bibliography search
bib_search_df['Study_ID'] = range(starting_index, starting_index + len(bib_search_df))

# Export to CSV. index=False matches the notebook's other exports and avoids
# writing the meaningless positional index as an extra unnamed column.
file_path = os.path.join(output_path, 'bib_search_results.csv')
bib_search_df.to_csv(file_path, index=False)

In [729]:
import os
import pandas as pd

# --- Load the title/abstract review and the full-text screen ---
# Both CSVs sit under PROC_DATA_DIR, each in its own stage folder.
ta_path, ft_path = (
    os.path.join(PROC_DATA_DIR, stage_folder, file_name)
    for stage_folder, file_name in (
        ('title_abstract_review', 'title_abstract_review_FINAL.csv'),
        ('full_text_screen', 'full_text_screen_end_final.csv'),
    )
)
ta_review, ft_review = pd.read_csv(ta_path), pd.read_csv(ft_path)

# --- Text normalisation helper ---
def clean_text(s):
    """Return ``s`` as a whitespace-trimmed, lower-cased string.

    Null values (None / NaN) are returned as the empty string; any other
    value is converted with ``str()`` before trimming and lower-casing.
    """
    if not pd.notnull(s):
        return ''
    return str(s).strip().lower()

# --- Normalize Titles and DOIs ---
# Add trimmed, lower-cased helper columns to both frames so the comparisons
# below ignore case and surrounding whitespace. Missing values become ''.
bib_search_df['Title_clean'] = bib_search_df['Title'].apply(clean_text)
bib_search_df['DOI_clean'] = bib_search_df['DOI'].apply(clean_text)

ta_review['Title_clean'] = ta_review['Title'].apply(clean_text)
ta_review['DOI_clean'] = ta_review['DOI'].apply(clean_text)

# --- Step 1: Original count ---
print(f'{len(bib_search_df)} items in bib_search_results before duplicate removal')

# --- Step 2: Find duplicates
# Match by DOI (if DOI exists and isn't blank)
doi_matches = bib_search_df[
    (bib_search_df['DOI_clean'] != '') &
    bib_search_df['DOI_clean'].isin(
        ta_review['DOI_clean'].dropna().loc[lambda x: x != '']
    )
]

# Match by Title only where DOI is missing. Blank titles are excluded on both
# sides: clean_text turns a missing title into '', and without this guard a
# record with no DOI and no title would "match" any blank title in the review
# table and be dropped as a duplicate.
known_titles = ta_review.loc[ta_review['Title_clean'] != '', 'Title_clean']
title_matches = bib_search_df[
    (bib_search_df['DOI_clean'] == '') &
    (bib_search_df['Title_clean'] != '') &
    bib_search_df['Title_clean'].isin(known_titles)
]

duplicates = pd.concat([doi_matches, title_matches]).drop_duplicates()

# --- Step 3: Remove duplicates from bib_search_df ---
duplicate_ids = duplicates['Study_ID'].tolist()
bib_search_df = bib_search_df[~bib_search_df['Study_ID'].isin(duplicate_ids)]

print(f'{len(bib_search_df)} items in bib_search_results after duplicate removal')

# --- Step 4: Prepare full-text exclusions ---
# The merge below joins on 'Study_ID', so rename the review table's 'id'
# column if it is present (rename is a no-op when it is not).
ta_review = ta_review.rename(columns={'id': 'Study_ID'})

# Attach each full-text record's cleaned DOI from the title/abstract review.
ft_review = pd.merge(
    ft_review,
    ta_review[['Study_ID', 'DOI_clean']],
    how='left',
    on='Study_ID'
)

# Mark excluded full-text articles
ft_excluded = ft_review[ft_review['Exclude_FINAL'] == True]

# --- Step 5: Exclude based on clean DOI ---
# Only non-blank DOIs are compared; records without a DOI are never excluded
# by this step.
ft_excluded_dois = ft_excluded['DOI_clean'].dropna().loc[lambda x: x != ''].unique()

exclusions = bib_search_df[
    (bib_search_df['DOI_clean'] != '') &
    bib_search_df['DOI_clean'].isin(ft_excluded_dois)
]

print(f"{len(bib_search_df)} items in bib_search_results before full-text exclusions")
bib_search_df = bib_search_df[~bib_search_df['Study_ID'].isin(exclusions['Study_ID'])]
print(f"{len(bib_search_df)} items in bib_search_results after full-text exclusions")

# --- Step 6: Combine pre-screen exclusions ---
pre_screen_exclusions = pd.concat([duplicates, exclusions]).drop_duplicates()

# --- Step 7: Export results ---
# Debug output to audit dropped papers
duplicates_out = os.path.join(output_path, 'debug_duplicates_found.csv')
duplicates.to_csv(duplicates_out, index=False)

# Final pre-screen exclusion list
export_path = os.path.join(output_path, 'pre_screen_exclusions.csv')
pre_screen_exclusions.to_csv(export_path, index=False)

print("Pre-screen exclusions exported.")

204 items in bib_search_results before duplicate removal
192 items in bib_search_results after duplicate removal
192 items in bib_search_results before full-text exclusions
192 items in bib_search_results after full-text exclusions
Pre-screen exclusions exported.


In [730]:
# Align the column name with the one used on the screening sheet.
bib_search_df = bib_search_df.rename(columns={'Access Date': 'Accessed'})

# Columns of the screening sheet, in the order they should appear.
screening_columns = [
    'Study_ID', 'Publication Year', 'Author', 'Title',
    'Publication Title', 'Database', 'Exclude', 'Reason ID',
    'Paediatric', 'Intention Reported', 'Deliberate intention',
    'Unclear', 'Accessed', 'Comments'
]

# Start from an all-empty sheet that shares bib_search_df's index ...
bib_search_screening = pd.DataFrame(index=bib_search_df.index, columns=screening_columns)

# ... then copy across every screening column that the search results
# already provide; the remaining columns stay empty for the reviewer.
for col in screening_columns:
    if col not in bib_search_df.columns:
        continue
    bib_search_screening[col] = bib_search_df[col]

# Every row in this sheet originates from the bibliography search.
bib_search_screening['Database'] = "Bibliography Search"

# Store year and ID as nullable integers: unparseable entries are coerced to
# missing instead of raising, and 'Int64' can hold those missing values.
bib_search_screening = bib_search_screening.assign(**{
    numeric_col: pd.to_numeric(
        bib_search_screening[numeric_col], errors='coerce'
    ).astype('Int64')
    for numeric_col in ('Publication Year', 'Study_ID')
})

# Show the prepared screening sheet.
bib_search_screening

Unnamed: 0,Study_ID,Publication Year,Author,Title,Publication Title,Database,Exclude,Reason ID,Paediatric,Intention Reported,Deliberate intention,Unclear,Accessed,Comments
0,492,1941,"Macmanus, Joseph E.",Perforations of the intestine by ingested fore...,The American Journal of Surgery,Bibliography Search,,,,,,,2025-04-14 17:11:45,
1,493,1962,"Perelman, H.",Toothpick perforations of the gastrointestinal...,The Journal of Abdominal Surgery,Bibliography Search,,,,,,,,
2,494,1967,"Sloop, R. D.; Thompson, J. C.",Aorto-esophageal fistula: report of a case and...,Gastroenterology,Bibliography Search,,,,,,,,
3,495,1969,"Johnson, Wilbur E.",On Ingestion of Razor Blades,JAMA,Bibliography Search,,,,,,,2025-04-14 14:35:26,
4,496,1969,"Schechter, D. C.; Gilbert, L.",Injuries of the heart and great vessels due to...,Thorax,Bibliography Search,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,691,,"Song, Young S; Covarrubias, Diego A; Nardi, Pe...",Foreign Body Appendicitis,,Bibliography Search,,,,,,,,
200,692,,,Endoscopic Retrieval of an Intentionally Inges...,,Bibliography Search,,,,,,,2025-04-14 13:57:27,
201,693,,,Escalating Ingestion of Razor Blades in a Pati...,Psychiatrist.com,Bibliography Search,,,,,,,2025-04-14 14:34:37,
202,694,,,"Frequent Deliberate Self-Harm: Repetition, Sui...",,Bibliography Search,,,,,,,2025-03-12 15:31:06,


In [731]:
# Write the blank screening sheet (without the index column) to the processed
# bibliography-search folder.
export_path = os.path.join(output_path, 'bib_search_screen_start.csv')
bib_search_screening.to_csv(path_or_buf=export_path, index=False)

In [732]:
# Display the `exclusion_criteria` mapping (numeric ID -> criterion text, as
# shown in the output below).
# NOTE(review): this name is not defined anywhere in this notebook; presumably
# it is provided by the notebook_setup star import - confirm.
exclusion_criteria

{1: 'Full text not available in English.',
 2: 'Studies not focusing on intentional self-ingestion (into the gastrointestinal tract) of foreign object via the oral cavity (mouth) or where unclear if ingested.',
 3: 'Studies focussing solely on accidental ingestion.',
 4: 'Non-Human/ animal studies.',
 5: 'Reviews, editorials, commentaries, and opinion pieces without original empirical data.',
 6: 'Duplicate publications or studies with overlapping data sets (the most comprehensive or recent study will be included).',
 7: 'Studies focusing on ingestion or co-ingestion of substances (e.g. poisons, medications) rather than physical foreign objects.',
 8: 'Ingestions undertaken in controlled environment as part of voluntary study.',
 9: 'Ingestions not explicitly stated to be intentional and history not suggestive of deliberate ingestion (i.e. Age < 8, no history of previous ingestions, no psychiatric co-morbidities, not a prisoner/detainee/vulnerable group).',
 10: 'Does not meet inclusio