In [19]:
# Import setup
from msc_code.scripts.notebook_setup import *

In [20]:
# Import data 
import_path = os.path.join(RAW_DATA_DIR, 'data_extraction', 'data_extraction_end.xlsx')
primary_data_extraction = pd.read_excel(import_path)

In [21]:
# Import case_data
import_path = os.path.join(PROC_DATA_DIR, 'data_extraction', 'case_data_clean.csv')
case_data = pd.read_csv(import_path)

# Import series_data
import_path = os.path.join(PROC_DATA_DIR, 'data_extraction', 'series_data_clean.csv')
series_data = pd.read_csv(import_path)

In [22]:
# Ensure all papers have had data extracted

# Create list of Study_IDs which are present in series_data
ids_from_series_data = series_data['Study_ID'].dropna().unique().tolist()

# Create list of Study_IDs which are present in case_data
ids_from_case_data = case_data['Study_ID'].dropna().unique().tolist()

# Create a list of all Study_IDs present in data extraction
ids_from_data_extraction = ids_from_series_data + ids_from_case_data

# Import additional paper data from JGE full text screening
import_path = os.path.join(PROC_DATA_DIR, 'full_text_screen', 'full_text_screen_included_final.csv')
ft_included = pd.read_csv(import_path)

# Make sure all included papers from full text screening are present in data extraction
ft_included_ids = ft_included['Study_ID'].dropna().unique().tolist()

# Create list of missing IDs, where data not extracted
missing_ids = list(set(ft_included_ids) - set(ids_from_data_extraction))
missing_ids

[386,
 261,
 138,
 399,
 273,
 402,
 405,
 416,
 39,
 300,
 431,
 439,
 195,
 451,
 327,
 460,
 333,
 465,
 475,
 476,
 353,
 99,
 356,
 483,
 231,
 359,
 369,
 370,
 373,
 377]

In [23]:
# Make sure all bibliography search results have had data extracted

# Import bib_search data
import_path = os.path.join(PROC_DATA_DIR, 'bibliography_search', 'bib_search_included_final.csv')
bib_search_data = pd.read_csv(import_path)

# Create list of ids from bib_search
bib_search_included_ids = bib_search_data['Study_ID'].dropna().unique().tolist()

# Create list of IDs from bibliography search that aren't
missing_ids = list(set(bib_search_included_ids) - set(ids_from_data_extraction))
missing_ids

[644,
 651,
 652,
 532,
 539,
 540,
 675,
 683,
 556,
 684,
 559,
 566,
 572,
 574,
 576,
 579,
 583,
 589,
 592,
 593,
 597,
 604,
 620,
 621,
 495,
 625,
 498,
 499,
 507]

In [24]:
# Define desried columns
desired_cols = ['Study_ID', 'Study_Design', 'Population_Type', 'Study_Setting', 'Study_Location', 'DOI']

# Create df with desired columns
bib_search_data = bib_search_data[desired_cols]

paper_data = primary_data_extraction[desired_cols]

paper_data = pd.concat([paper_data, bib_search_data])

paper_data = paper_data.sort_values(by='Study_ID', ascending=True).reset_index(drop=True)

In [25]:

# Import data from full text screen jge for additional paper data

# Import
import_path = os.path.join(RAW_DATA_DIR, 'full_text_screen', 'full_text_screen_end_jge.csv')
additional_paper_data = pd.read_csv(import_path)

additional_paper_data.columns = additional_paper_data.columns.str.strip(" ").str.replace(" ", "_")

# rename id column to Study_ID
additional_paper_data = additional_paper_data.rename(columns={
    'id': 'Study_ID',
    'Deliberate_intention': 'Deliberate_Intention'
})

additional_paper_data.columns

bool_cols = ['Exclude', 'Paediatric', 'Intention_Reported', 'Deliberate_Intention', 'Unclear']
additional_paper_data[bool_cols] = additional_paper_data[bool_cols].fillna(False).astype(bool)

# Define desired columns
print(additional_paper_data.columns)
desired_cols = ['Study_ID', 'Publication_Year', 'Authors', 'Title', 'Publication_Title', 'Database',
                'Paediatric', 'Intention_Reported', 'Deliberate_Intention', 'Unclear', 'Accessed', 'Comments']

# Extract 
additional_paper_data = additional_paper_data[desired_cols]

cols_to_drop = [col for col in additional_paper_data.columns if col in paper_data.columns and col != 'Study_ID']
paper_data = paper_data.drop(columns=cols_to_drop)

# Merge with paper_data
paper_data = pd.merge(
    paper_data,
    additional_paper_data,
    how='left',
    on='Study_ID'
)

# Make sure all bibliography search results have had data extracted
import_path = os.path.join(PROC_DATA_DIR, 'bibliography_search', 'bib_search_included_final.csv')
additional_paper_data = pd.read_csv(import_path)

additional_paper_data = additional_paper_data.rename(columns={
    'Author': 'Authors'
})

# Set Study_ID as index for alignment
paper_data = paper_data.set_index('Study_ID')
additional_paper_data = additional_paper_data.set_index('Study_ID')
additional_paper_data[bool_cols] = additional_paper_data[bool_cols].fillna(False).astype(bool)

# Get shared columns (excluding any that might not exist in paper_data)
shared_columns = [col for col in additional_paper_data.columns if col in paper_data.columns]

# Fill missing values in paper_data using values from additional_paper_data
paper_data.update(additional_paper_data[shared_columns])

# Reset index if you want Study_ID back as a column
paper_data = paper_data.reset_index()

paper_data

Index(['Study_ID', 'Publication_Year', 'Authors', 'Title', 'Publication_Title',
       'Database', 'Exclude', 'Reason_ID', 'Paediatric', 'Intention_Reported',
       'Deliberate_Intention', 'Unclear', 'Accessed', 'Comments'],
      dtype='object')


  additional_paper_data[bool_cols] = additional_paper_data[bool_cols].fillna(False).astype(bool)


Unnamed: 0,Study_ID,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Publication_Year,Authors,Title,Publication_Title,Database,Paediatric,Intention_Reported,Deliberate_Intention,Unclear,Accessed,Comments
0,3,Case Series,,Historical,USA,10.1056/NEJM188612161152403,1886.0,Mh Richardson,A Case Of Gastrotomy. Digital Exploration Of S...,The Boston Medical And Surgical Journal,Google Scholar,False,True,True,False,29/01/2025,Record of first gastrotomy for intention inges...
1,39,Case Report,,,Japan,,1993.0,"T Ken, Y Sunichi, U Toshiro, T Tomoo...",Endoscopic Removal Of Foreign Bodies In The Me...,Chinese Medical ...,Google Scholar,False,True,True,False,29/01/2025,Case report of endoscopic management of multip...
2,46,Case Series,Mixed Prisoner Psychiatric Inpatient Population,Department of Plastic & Reconstructive Surgery...,"Wilton, Cork, Ireland",10.1007/bf02943095,1996.0,"O'Sullivan, S. T.; Reardon, C. M.; Mcgreal, G....",Deliberate Ingestion Of Foreign Bodies By Inst...,Irish Journal Of Medical Science,PubMed,False,True,True,False,29/01/2025,Case series or 36 prisoners who intentionally ...
3,51,Case Series,,University Hospital in Bulgaria,Bulgaria,10.1001/archsurg.1996.01430140056015,1996.0,"Losanoff, J. E.; Kjossev, K. T.",Gastrointestinal 'Crosses'. A New Shade From A...,"Archives Of Surgery (Chicago, Ill. : 1960)",PubMed,False,True,True,False,29/01/2025,Case series. Gastrointestinal crosses causing ...
4,54,Case Report,,University Hospital in Bulgaria,Bulgaria,10.1136/emj.14.1.54,1997.0,"Losanoff, J. E.; Kjossev, K. T.; Losanoff, H. E.","Oesophageal ""Cross""--A Sinister Foreign Body",Journal Of Accident & Emergency Medicine,PubMed,False,True,True,False,29/01/2025,Single case of gastrointestinal cross.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,683,Case Report,A 59-year-old female living in an intensive te...,"Department of Surgery, University of British C...","Vancouver, British Colubia, Canada",10.1016/j.ijscr.2022.106931,2022.0,"Yan, Tyler D.; Leung, Philemon H. Y.; Zwirewic...",An unusual cause of pericardial effusion: A ca...,International Journal of Surgery Case Reports,Bibliography Search,False,True,True,True,2025-04-21 12:00:00,
163,684,Case Report,A 12 years old male child presented to us with...,King George’s Medical University,"Lucknow, India",10.18203/2349-2902.isj20230996,2023.0,"Anand, Madhur; Topno, Noor; Lynrah, Kyrshanlan...",Doormatobezoar: first case report of a bezoar ...,International Surgery Journal,Bibliography Search,True,True,True,True,2025-04-21 12:08:00,
164,686,Case Report,a rare case of a 39-year-old man with schizoph...,"Department of Gastroenterological Surgery, Tan...","Tangshan, China",10.1097/MS9.0000000000000497,2023.0,"Jin, Shengjian; Horiguchi, Taigo; Ma, Xiaolong...",Metallic foreign bodies ingestion by schizophr...,Annals of Medicine and Surgery,Bibliography Search,False,True,True,True,2025-04-21 12:20:00,
165,687,Retrospective Chart Review,A retrospective cohort study of patients with ...,Non-prison referral centre,"Melbourne, Australia",10.1136/ bmjgast-2022-001087,2023.0,"Ngu, Natalie Lee Yee; Karp, Jadon; Taylor, Kir...","Patient characteristics, outcomes and hospital...",BMJ Open Gastroenterology,Bibliography Search,True,True,True,True,2025-03-30 10:21:11,


In [26]:
paper_data

Unnamed: 0,Study_ID,Study_Design,Population_Type,Study_Setting,Study_Location,DOI,Publication_Year,Authors,Title,Publication_Title,Database,Paediatric,Intention_Reported,Deliberate_Intention,Unclear,Accessed,Comments
0,3,Case Series,,Historical,USA,10.1056/NEJM188612161152403,1886.0,Mh Richardson,A Case Of Gastrotomy. Digital Exploration Of S...,The Boston Medical And Surgical Journal,Google Scholar,False,True,True,False,29/01/2025,Record of first gastrotomy for intention inges...
1,39,Case Report,,,Japan,,1993.0,"T Ken, Y Sunichi, U Toshiro, T Tomoo...",Endoscopic Removal Of Foreign Bodies In The Me...,Chinese Medical ...,Google Scholar,False,True,True,False,29/01/2025,Case report of endoscopic management of multip...
2,46,Case Series,Mixed Prisoner Psychiatric Inpatient Population,Department of Plastic & Reconstructive Surgery...,"Wilton, Cork, Ireland",10.1007/bf02943095,1996.0,"O'Sullivan, S. T.; Reardon, C. M.; Mcgreal, G....",Deliberate Ingestion Of Foreign Bodies By Inst...,Irish Journal Of Medical Science,PubMed,False,True,True,False,29/01/2025,Case series or 36 prisoners who intentionally ...
3,51,Case Series,,University Hospital in Bulgaria,Bulgaria,10.1001/archsurg.1996.01430140056015,1996.0,"Losanoff, J. E.; Kjossev, K. T.",Gastrointestinal 'Crosses'. A New Shade From A...,"Archives Of Surgery (Chicago, Ill. : 1960)",PubMed,False,True,True,False,29/01/2025,Case series. Gastrointestinal crosses causing ...
4,54,Case Report,,University Hospital in Bulgaria,Bulgaria,10.1136/emj.14.1.54,1997.0,"Losanoff, J. E.; Kjossev, K. T.; Losanoff, H. E.","Oesophageal ""Cross""--A Sinister Foreign Body",Journal Of Accident & Emergency Medicine,PubMed,False,True,True,False,29/01/2025,Single case of gastrointestinal cross.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,683,Case Report,A 59-year-old female living in an intensive te...,"Department of Surgery, University of British C...","Vancouver, British Colubia, Canada",10.1016/j.ijscr.2022.106931,2022.0,"Yan, Tyler D.; Leung, Philemon H. Y.; Zwirewic...",An unusual cause of pericardial effusion: A ca...,International Journal of Surgery Case Reports,Bibliography Search,False,True,True,True,2025-04-21 12:00:00,
163,684,Case Report,A 12 years old male child presented to us with...,King George’s Medical University,"Lucknow, India",10.18203/2349-2902.isj20230996,2023.0,"Anand, Madhur; Topno, Noor; Lynrah, Kyrshanlan...",Doormatobezoar: first case report of a bezoar ...,International Surgery Journal,Bibliography Search,True,True,True,True,2025-04-21 12:08:00,
164,686,Case Report,a rare case of a 39-year-old man with schizoph...,"Department of Gastroenterological Surgery, Tan...","Tangshan, China",10.1097/MS9.0000000000000497,2023.0,"Jin, Shengjian; Horiguchi, Taigo; Ma, Xiaolong...",Metallic foreign bodies ingestion by schizophr...,Annals of Medicine and Surgery,Bibliography Search,False,True,True,True,2025-04-21 12:20:00,
165,687,Retrospective Chart Review,A retrospective cohort study of patients with ...,Non-prison referral centre,"Melbourne, Australia",10.1136/ bmjgast-2022-001087,2023.0,"Ngu, Natalie Lee Yee; Karp, Jadon; Taylor, Kir...","Patient characteristics, outcomes and hospital...",BMJ Open Gastroenterology,Bibliography Search,True,True,True,True,2025-03-30 10:21:11,


In [27]:
(paper_data['Unclear'] == True).value_counts()

Unclear
False    92
True     75
Name: count, dtype: int64