In [34]:
# Bibliography Search

In [35]:
# Setup notebook
from msc_code.scripts.notebook_setup import *

In [36]:
input_path = os.path.join(RAW_DATA_DIR, 'bibliography_search')
output_path = os.path.join(PROC_DATA_DIR, 'bibliography_search')

In [37]:
# import included studies
import_path = os.path.join(PROC_DATA_DIR, 'full_text_screen', 'full_text_screen_end_final.csv')
ft_data = pd.read_csv(import_path)
included_ids = list(ft_data[ft_data['Exclude_FINAL'] == False]['Study_ID'])

In [38]:
# Create dataframe for bibliography search
bib_search_df = pd.DataFrame(columns=['Study_ID', 'Bib_Search_Complete'])

bib_search_df['Study_ID'] = included_ids

file_path = os.path.join(output_path, 'bib_search_start.csv')
bib_search_df.to_csv(file_path, index=False)

In [39]:
# Reimport data from bibliography search
file_path = os.path.join(input_path, 'bib_search_end.xlsx')
bib_search_df = pd.read_excel(file_path)

In [40]:
# Filter out excluded papers
bib_search_df = bib_search_df[bib_search_df['Study_ID'].isin(included_ids)]

In [41]:
bib_search_df['Bib_Search_Complete'] = bib_search_df['Bib_Search_Complete'].astype(str)

status_map = {
    'nan': 'Awaiting Search',
    'Y': 'Search Complete',
    'N': 'For second review'
}

bib_search_df['Bib_Search_Complete'] = bib_search_df['Bib_Search_Complete'].map(status_map)

In [42]:
bib_search_df['Bib_Search_Complete'].value_counts()

Bib_Search_Complete
Search Complete    93
Name: count, dtype: int64

In [43]:
# Import bibliography search results
import_path = os.path.join(input_path, 'bib_search_results.csv')
bib_search_df = pd.read_csv(import_path)

# Sort Results by Publication Year, Author, Title ascending.
bib_search_df = bib_search_df.sort_values(by=['Publication Year', 'Author', 'Title'], ascending=True).reset_index(drop=True)

# Import initial results to
import_path = os.path.join(PROC_DATA_DIR, "title_abstract_review", "ta_review_start_jge.csv")
ta_start = pd.read_csv(import_path)

# Rename id column
ta_start = ta_start.rename(columns={
    'id': 'Study_ID'
})

# Find max study ID
starting_index = ta_start['Study_ID'].max() + 1

# Create Study_ID for items in bibliography search
bib_search_df['Study_ID'] = range(starting_index, starting_index + len(bib_search_df))

# Export to CSV
file_path = os.path.join(output_path, 'bib_search_results.csv')
bib_search_df.to_csv(file_path)

In [44]:
import os
import pandas as pd

# --- Load title/abstract review ---
ta_path = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_review_final.csv')
ta_review = pd.read_csv(ta_path)

# --- Load full-text screen ---
ft_path = os.path.join(PROC_DATA_DIR, 'full_text_screen', 'full_text_screen_end_final.csv')
ft_review = pd.read_csv(ft_path)

# --- Normalise helper ---
def clean_text(s):
    return str(s).strip().lower() if pd.notnull(s) else ''

# --- Normalize Titles and DOIs ---
bib_search_df['Title_clean'] = bib_search_df['Title'].apply(clean_text)
bib_search_df['DOI_clean'] = bib_search_df['DOI'].apply(clean_text)

ta_review['Title_clean'] = ta_review['Title'].apply(clean_text)
ta_review['DOI_clean'] = ta_review['DOI'].apply(clean_text)

# --- Step 1: Original count ---
print(f'{len(bib_search_df)} items in bib_search_results before duplicate removal')

# --- Step 2: Find duplicates
# Match by DOI (if DOI exists and isn't blank)
doi_matches = bib_search_df[
    (bib_search_df['DOI_clean'] != '') &
    bib_search_df['DOI_clean'].isin(
        ta_review['DOI_clean'].dropna().loc[lambda x: x != '']
    )
]

# Match by Title only where DOI is missing
title_matches = bib_search_df[
    (bib_search_df['DOI_clean'] == '') &
    bib_search_df['Title_clean'].isin(
        ta_review['Title_clean'].dropna().str.strip().str.lower()
    )
]

duplicates = pd.concat([doi_matches, title_matches]).drop_duplicates()

# --- Step 3: Remove duplicates from bib_search_df ---
duplicate_ids = duplicates['Study_ID'].tolist()
bib_search_df = bib_search_df[~bib_search_df['Study_ID'].isin(duplicate_ids)]

print(f'{len(bib_search_df)} items in bib_search_results after duplicate removal')

# --- Step 4: Prepare full-text exclusions ---
ta_review = ta_review.rename(columns={'id': 'Study_ID'})  # if needed

ft_review = pd.merge(
    ft_review,
    ta_review[['Study_ID', 'DOI_clean']],
    how='left',
    on='Study_ID'
)

# Mark excluded full-text articles
ft_excluded = ft_review[ft_review['Exclude_FINAL'] == True]

# --- Step 5: Exclude based on clean DOI ---
ft_excluded_dois = ft_excluded['DOI_clean'].dropna().loc[lambda x: x != ''].unique()

exclusions = bib_search_df[
    (bib_search_df['DOI_clean'] != '') &
    bib_search_df['DOI_clean'].isin(ft_excluded_dois)
]

print(f"{len(bib_search_df)} items in bib_search_results before full-text exclusions")
bib_search_df = bib_search_df[~bib_search_df['Study_ID'].isin(exclusions['Study_ID'])]
print(f"{len(bib_search_df)} items in bib_search_results after full-text exclusions")

# --- Step 6: Combine pre-screen exclusions ---
pre_screen_exclusions = pd.concat([duplicates, exclusions]).drop_duplicates()

# --- Step 7: Export results ---
# Debug output to audit dropped papers
duplicates_out = os.path.join(output_path, 'debug_duplicates_found.csv')
duplicates.to_csv(duplicates_out, index=False)

# Final pre-screen exclusion list
export_path = os.path.join(output_path, 'pre_screen_exclusions.csv')
pre_screen_exclusions.to_csv(export_path, index=False)

print("Pre-screen exclusions exported.")

204 items in bib_search_results before duplicate removal
192 items in bib_search_results after duplicate removal
192 items in bib_search_results before full-text exclusions
192 items in bib_search_results after full-text exclusions
Pre-screen exclusions exported.


In [45]:
# Rename to match screening format
bib_search_df = bib_search_df.rename(columns={
    'Access Date': 'Accessed'
})

# Define the screening columns
screening_columns = [
    'Study_ID', 'Publication Year', 'Author', 'Title',
    'Publication Title', 'Database', 'Exclude', 'Reason ID',
    'Paediatric', 'Intention Reported', 'Deliberate intention',
    'Unclear', 'Accessed', 'Comments'
]

# Create empty screening DataFrame with same index as bib_search_df
bib_search_screening = pd.DataFrame(index=bib_search_df.index, columns=screening_columns)

# Fill in matching columns from bib_search_df
for col in screening_columns:
    if col in bib_search_df.columns:
        bib_search_screening[col] = bib_search_df[col]

# Set the Database source
bib_search_screening['Database'] = "Bibliography Search"

# Safely convert numeric fields with missing values allowed
bib_search_screening['Publication Year'] = pd.to_numeric(
    bib_search_screening['Publication Year'], errors='coerce'
).astype('Int64')

bib_search_screening['Study_ID'] = pd.to_numeric(
    bib_search_screening['Study_ID'], errors='coerce'
).astype('Int64')

# Done — your DataFrame is now ready
bib_search_screening

Unnamed: 0,Study_ID,Publication Year,Author,Title,Publication Title,Database,Exclude,Reason ID,Paediatric,Intention Reported,Deliberate intention,Unclear,Accessed,Comments
0,492,1941,"Macmanus, Joseph E.",Perforations of the intestine by ingested fore...,The American Journal of Surgery,Bibliography Search,,,,,,,2025-04-14 17:11:45,
1,493,1962,"Perelman, H.",Toothpick perforations of the gastrointestinal...,The Journal of Abdominal Surgery,Bibliography Search,,,,,,,,
2,494,1967,"Sloop, R. D.; Thompson, J. C.",Aorto-esophageal fistula: report of a case and...,Gastroenterology,Bibliography Search,,,,,,,,
3,495,1969,"Johnson, Wilbur E.",On Ingestion of Razor Blades,JAMA,Bibliography Search,,,,,,,2025-04-14 14:35:26,
4,496,1969,"Schechter, D. C.; Gilbert, L.",Injuries of the heart and great vessels due to...,Thorax,Bibliography Search,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,691,,"Song, Young S; Covarrubias, Diego A; Nardi, Pe...",Foreign Body Appendicitis,,Bibliography Search,,,,,,,,
200,692,,,Endoscopic Retrieval of an Intentionally Inges...,,Bibliography Search,,,,,,,2025-04-14 13:57:27,
201,693,,,Escalating Ingestion of Razor Blades in a Pati...,Psychiatrist.com,Bibliography Search,,,,,,,2025-04-14 14:34:37,
202,694,,,"Frequent Deliberate Self-Harm: Repetition, Sui...",,Bibliography Search,,,,,,,2025-03-12 15:31:06,


In [46]:
# Export to CSV
export_path = os.path.join(output_path, 'bib_search_screen_start.csv')
bib_search_screening.to_csv(export_path, index=False)

In [47]:
# Import completed bib_search data extraction and screening
import_path = os.path.join(RAW_DATA_DIR, 'bibliography_search', 'bib_search_screen_end.xlsx')
bib_search_screen_end = pd.read_excel(import_path)

In [48]:
# Cleaning
bib_search_screen_end.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Study_ID              194 non-null    int64  
 1   Publication Year      194 non-null    int64  
 2   Author                190 non-null    object 
 3   Title                 194 non-null    object 
 4   Publication Title     192 non-null    object 
 5   Database              194 non-null    object 
 6   Exclude               194 non-null    bool   
 7   Reason ID             121 non-null    float64
 8   Paediatric            108 non-null    float64
 9   Intention Reported    139 non-null    float64
 10  Deliberate intention  137 non-null    float64
 11  Unclear               0 non-null      float64
 12  Accessed              177 non-null    object 
 13  Study_Design          156 non-null    object 
 14  Population_Type       99 non-null     object 
 15  Study_Setting         9

In [49]:
df = bib_search_screen_end.copy()

# Replace " " in column headings with "_"
df.columns = df.columns.str.strip(" ").str.replace(" ", "_")

df = df.rename(columns={
    'Deliberate_intention': 'Deliberate_Intention'
})

# Convert boolean columns to boolean
bool_cols = ['Exclude', 'Paediatric', 'Intention_Reported', 'Deliberate_Intention', 'Unclear']
df[bool_cols] = df[bool_cols].astype(bool)

# Convert integer columns to type integer
int_cols = ['Reason_ID']
df[int_cols] = df[int_cols].apply(pd.to_numeric, errors='coerce').astype('Int64')

# Convert category cols to type category
cat_cols = ['Data_Extracted', 'Bib_Search_Complete']
df[cat_cols] = df[cat_cols].astype('category')

df.head()

Unnamed: 0,Study_ID,Publication_Year,Author,Title,Publication_Title,Database,Exclude,Reason_ID,Paediatric,Intention_Reported,...,Unclear,Accessed,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Data_Extracted,Bib_Search_Complete,Comments
0,492,1941,"Macmanus, Joseph E.",Perforations of the intestine by ingested fore...,The American Journal of Surgery,Bibliography Search,True,9.0,False,False,...,True,2025-04-14 17:11:45,,,,,10.1016/S0002-9610(41)90652-9,N,N,Historical review and case report. Does not re...
1,493,1962,"Perelman, H.",Toothpick perforations of the gastrointestinal...,The Journal of Abdominal Surgery,Bibliography Search,True,1.0,True,True,...,True,,,,,,,N,N,Full text not available
2,494,1967,"Sloop, R. D.; Thompson, J. C.",Aorto-esophageal fistula: report of a case and...,Gastroenterology,Bibliography Search,True,1.0,True,True,...,True,,,,,,,N,N,Full text not available
3,495,1969,"Johnson, Wilbur E.",On Ingestion of Razor Blades,JAMA,Bibliography Search,False,,True,True,...,True,2025-04-14 14:35:26,Case Report,Single prisoner,"New Jersey State Prison Hospital, Trenton","Trenton, New Jersey, USA",10.1001/jama.1969.03160110135030,Y,N,
4,496,1969,"Schechter, D. C.; Gilbert, L.",Injuries of the heart and great vessels due to...,Thorax,Bibliography Search,True,2.0,False,True,...,True,2025-04-15 10:26:00,,,,,10.1136/thx.24.2.246,N,N,Insertion of pins into thoracic wall


In [50]:
df['Exclude'].value_counts()

Exclude
True     121
False     73
Name: count, dtype: int64

In [51]:
# Ensure each exclusion has a reason
df[(df['Exclude'] == True) & (df['Reason_ID'].isna())]


Unnamed: 0,Study_ID,Publication_Year,Author,Title,Publication_Title,Database,Exclude,Reason_ID,Paediatric,Intention_Reported,...,Unclear,Accessed,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Data_Extracted,Bib_Search_Complete,Comments


In [52]:
# Ensure each inclusion has no reason
df[(df['Exclude'] == False) & (~df['Reason_ID'].isna())]


Unnamed: 0,Study_ID,Publication_Year,Author,Title,Publication_Title,Database,Exclude,Reason_ID,Paediatric,Intention_Reported,...,Unclear,Accessed,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Data_Extracted,Bib_Search_Complete,Comments


In [53]:
# Export to CSV
export_path = os.path.join(PROC_DATA_DIR, 'bibliography_search', 'bib_search_final.csv')
bib_search_screen_clean = df.copy()
bib_search_screen_clean.to_csv(export_path)

In [54]:
# Export included items
bib_search_included = bib_search_screen_clean[bib_search_screen_clean['Exclude'] == False]
output_path = os.path.join(PROC_DATA_DIR, 'bibliography_search', 'bib_search_included_final.csv')
bib_search_included.to_csv(output_path, index=False)

In [55]:
bib_search_included

Unnamed: 0,Study_ID,Publication_Year,Author,Title,Publication_Title,Database,Exclude,Reason_ID,Paediatric,Intention_Reported,...,Unclear,Accessed,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Data_Extracted,Bib_Search_Complete,Comments
3,495,1969,"Johnson, Wilbur E.",On Ingestion of Razor Blades,JAMA,Bibliography Search,False,,True,True,...,True,2025-04-14 14:35:26,Case Report,Single prisoner,"New Jersey State Prison Hospital, Trenton","Trenton, New Jersey, USA",10.1001/jama.1969.03160110135030,Y,N,
6,498,1974,"Witzel, L.; Scheurer, U.; M√ºhlemann, A.; Halt...",Removal of razor blades from stomach with fibr...,British Medical Journal,Bibliography Search,False,,False,True,...,True,2025-04-15 10:39:00,Case Report,Single prisoner,Department of Surgery,"Berne, Switzerland",10.1136/bmj.2.5918.539,Y,N,
7,499,1977,"Devanesan, J.; Pisani, A.; Sharma, P.; Kazaria...",Metallic foreign bodies in the stomach,"Archives of Surgery (Chicago, Ill.: 1960)",Bibliography Search,False,,False,True,...,True,2025-04-15 10:45:00,Case Report,Single psychiatric inpatient,New York Medical College-Metropolitan Hospital...,"New York, USA",10.1001/archsurg.1977.01370050124025,Y,N,
12,504,1982,"James, A. H.; Allen-Mersh, T. G.",Recognition and management of patients who rep...,Journal of the Royal Society of Medicine,Bibliography Search,False,,True,True,...,True,2025-04-15 11:35:00,Retrospective Case Series,Mixed psychiatric and non-psychiatric,"Department of Surgery, Charing Cross Hospital,","London, UK",10.1177/014107688207500207,Y,N,
15,507,1983,"Roark, G D; Subramanyam, K; Patterson, M",Ingested foreign material in mentally disturbe...,Southern medical journal,Bibliography Search,False,,True,True,...,True,2025-04-10 09:01:14,Retrospective Case Series,Psychiatric patients,University of Texas,"Galveston, Texas, USA",10.1097/00007611-198309000-00015,Y,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,683,2022,"Yan, Tyler D.; Leung, Philemon H. Y.; Zwirewic...",An unusual cause of pericardial effusion: A ca...,International Journal of Surgery Case Reports,Bibliography Search,False,,False,True,...,True,2025-04-21 12:00:00,Case Report,A 59-year-old female living in an intensive te...,"Department of Surgery, University of British C...","Vancouver, British Colubia, Canada",10.1016/j.ijscr.2022.106931,Y,N,
183,684,2023,"Anand, Madhur; Topno, Noor; Lynrah, Kyrshanlan...",Doormatobezoar: first case report of a bezoar ...,International Surgery Journal,Bibliography Search,False,,True,True,...,True,2025-04-21 12:08:00,Case Report,A 12 years old male child presented to us with...,King George’s Medical University,"Lucknow, India",10.18203/2349-2902.isj20230996,Y,N,
184,686,2023,"Jin, Shengjian; Horiguchi, Taigo; Ma, Xiaolong...",Metallic foreign bodies ingestion by schizophr...,Annals of Medicine and Surgery,Bibliography Search,False,,False,True,...,True,2025-04-21 12:20:00,Case Report,a rare case of a 39-year-old man with schizoph...,"Department of Gastroenterological Surgery, Tan...","Tangshan, China",10.1097/MS9.0000000000000497,Y,N,
185,687,2023,"Ngu, Natalie Lee Yee; Karp, Jadon; Taylor, Kir...","Patient characteristics, outcomes and hospital...",BMJ Open Gastroenterology,Bibliography Search,False,,True,True,...,True,2025-03-30 10:21:11,Retrospective Chart Review,A retrospective cohort study of patients with ...,Non-prison referral centre,"Melbourne, Australia",10.1136/ bmjgast-2022-001087,Y,N,
