# Data Extraction

In [1567]:
# Import notebook setup
from msc_code.scripts.notebook_setup import *

Specify the methods used to collect data from reports, including how many reviewers collected data from each report, whether they worked independently, any processes for obtaining or confirming data from study investigators, and if applicable, details of automation tools used in the process.

In [1568]:
# Import first author (JGE) screened results as pandas dataframe.
import_path = os.path.join(PROC_DATA_DIR, "full_text_screen", "jge_included.csv")
jge_included_df = pd.read_csv(import_path)

In [1569]:
# Create dataframe of results to proceed with data extraction whilst second author reviews subset of results.
jge_data_extraction_df = jge_included_df[['id', 'Publication Year', 'Authors', 'Title', 'Publication Title']].set_index('id')

jge_data_extraction_df.to_csv("/".join([PROC_DATA_DIR, 'data_extraction', 'data_extraction_list.csv']))

In [1570]:
# Set import path for data_extraction.xlsx
import_path = os.path.join(RAW_DATA_DIR, "data_extraction", "data_extraction.xlsx")

# Import study_data
study_data = pd.read_excel(import_path, sheet_name="study_data") # Import study_data
patient_data = pd.read_excel(import_path, sheet_name="patient_data") # Import patient_data
time_data = pd.read_excel(import_path, sheet_name="time_data") # Import time_data
demographic_data = pd.read_excel(import_path, sheet_name="demographic_data") # Import demographic_data
object_raw_data = pd.read_excel(import_path, sheet_name="object_raw_data") # Import object_raw_data
object_gross_data = pd.read_excel(import_path, sheet_name="object_gross_data") # Import object_raw_data
intention_data = pd.read_excel(import_path, sheet_name="intention_data")
motivation_data = pd.read_excel(import_path, sheet_name="motivation_data") # Import motivation_data
intervention_data = pd.read_excel(import_path, sheet_name="intervention_data") # Import motivation_data
outcome_data = pd.read_excel(import_path, sheet_name="outcome_data") # Import outcome_data
symptom_data = pd.read_excel(import_path, sheet_name="symptom_data") # Import symptom_data
complication_data = pd.read_excel(import_path, sheet_name="complication_data") # Import complication_data
incidental_findings_data = pd.read_excel(import_path, sheet_name="incidental_findings_data") # Import incidental_findings_data


### Calculate Object Diameters

In [1571]:
# Check for duplicates
def calculate_object_diameters(object_raw_data):
    if object_raw_data['Object_ID'].duplicated().any():
        raise ValueError("Duplicate Object_IDs found!")

    object_dimensions = object_raw_data[['Object_ID', 'Object_Length_cm', 'Object_Width_cm', 'Object_Height_cm']].set_index('Object_ID')
    object_dimensions.replace("Unknown", np.nan, inplace=True)
    object_dimensions = object_dimensions.astype(float)

    # Compute Object Diameter
    def compute_diameter(row):
        available_values = row.dropna()  # Get non-null values in each row
        
        if len(available_values) == 1:
            return available_values.iloc[0]  # If only one dimension value exists, use it directly to compute object diameter
        
        elif len(available_values) == 2:
            return np.sqrt(sum(available_values**2))  # Use Pythagorean theorem if two values exist
        
        elif len(available_values) == 3:
            return max(available_values)  # If all three exist, take the max
        
        return np.nan  # If no values exist, return NaN

    # Apply function row-wise
    object_dimensions['Object_Diameter_cm'] = object_dimensions.apply(compute_diameter, axis=1)

    object_raw_data["Object_Diameter_cm"] = object_raw_data["Object_Diameter_cm"].fillna(object_raw_data["Object_ID"].map(object_dimensions["Object_Diameter_cm"]))

    return object_raw_data

# study_data["Age_Low_Years"].fillna(study_data["Study_ID"].map(age_min_map))
object_raw_data = calculate_object_diameters(object_raw_data=object_raw_data)


  object_dimensions.replace("Unknown", np.nan, inplace=True)


### Populate study_data from patient_data

#### Calculate Sample Size

In [1572]:
# Define function to calculate sample size
def calculate_sample_size(study_data, patient_data):

    # Count occurrences of Study_ID in patient_data
    patient_count = patient_data['Study_ID'].value_counts()

    # Map counts to study_data where Sample_Size is NaN
    study_data['Sample_Size'] = study_data.apply(
        lambda row: patient_count.get(row['Study_ID'], row['Sample_Size']) if pd.isna(row['Sample_Size']) else row['Sample_Size'],
        axis=1
    )

    # Ensure integer values
    study_data['Sample_Size'] = study_data['Sample_Size'].astype(int)

    return study_data

study_data = calculate_sample_size(study_data=study_data, patient_data=patient_data)


In [1573]:
# Where gross data is unavailable for populations included in a study, data is generated from individual patient data.

# Define Function to calculate Age_Min_Years in study_data with data from patient_data where currently blank
def calculate_age_low(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the minimum Age_Years for each Study_ID
    age_min_map = patient_data.groupby("Study_ID")["Age_Years"].min()

    # Only fill NaN values in Age_Low_Years
    study_data["Age_Low_Years"] = study_data["Age_Low_Years"].fillna(study_data["Study_ID"].map(age_min_map))
        
    return study_data

study_data = calculate_age_low(study_data=study_data, patient_data=patient_data)

  patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)


In [1574]:
# Define function to calculate Age_Max_Years in study_data with data from patient_data where currently blank
def calculate_age_high(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the maximum Age_Years for each Study_ID
    age_max_map = patient_data.groupby("Study_ID")["Age_Years"].max()

    # Only fill NaN values in Age_High_Years
    study_data["Age_High_Years"] = study_data["Age_High_Years"].fillna(study_data["Study_ID"].map(age_max_map))

    return study_data

study_data = calculate_age_high(study_data=study_data, patient_data=patient_data)

In [1575]:
# Define function to calculate Age_Mean_Years in study_data with data from patient_data where currently blank 
def calculate_age_mean(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the maximum Age_Years for each Study_ID
    age_mean_map = patient_data.groupby("Study_ID")["Age_Years"].mean()

    # Only fill NaN values in Age_High_Years
    study_data["Age_Mean_Years"] = study_data["Age_Mean_Years"].fillna(study_data["Study_ID"].map(age_mean_map))

    return study_data

study_data = calculate_age_mean(study_data=study_data, patient_data=patient_data)

In [1576]:
# Define function to calculate N_Female
def calculate_gender_counts(study_data, patient_data):
    # Ensure Gender values are consistent
    valid_genders = ["Male", "Female", "Unknown"]
    patient_data = patient_data[patient_data["Gender"].isin(valid_genders)]

    # Compute gender counts per Study_ID
    gender_counts = patient_data.groupby("Study_ID")["Gender"].value_counts().unstack(fill_value=0)

    # Ensure all expected columns exist
    for gender in valid_genders:
        if gender not in gender_counts.columns:
            gender_counts[gender] = 0  # Add missing gender columns

    # Map gender counts directly to the corresponding columns in study_data
    study_data["N_Gender_Male"] = study_data["Study_ID"].map(gender_counts.get("Male", {})).fillna(0).astype(int)
    study_data["N_Gender_Female"] = study_data["Study_ID"].map(gender_counts.get("Female", {})).fillna(0).astype(int)
    study_data["N_Gender_Unknown"] = study_data["Study_ID"].map(gender_counts.get("Unknown", {})).fillna(0).astype(int)

    return study_data

# Apply function
study_data = calculate_gender_counts(study_data=study_data, patient_data=patient_data)


In [1577]:
# Define function to calculate mortality rate in study_data from outcome_data
def calculate_mortality_count(study_data, outcome_data):
    # Ensure mortality recording is consistent
    valid_mortality_outcomes = ["Yes", "No", "Unknown"]
    outcome_data = outcome_data[outcome_data["Mortality"].isin(valid_mortality_outcomes)]

    # Drop duplicate Patient_ID to ensure each patient is counted only once
    unique_mortality = outcome_data.drop_duplicates(subset="Patient_ID")

    # Compute mortality counts per Study_ID
    mortality_counts = unique_mortality[unique_mortality["Mortality"] == "Yes"].groupby("Study_ID")["Patient_ID"].count()

    # Map mortality counts to study_data (fill NaN with 0)
    study_data["Mortality_Count"] = study_data["Study_ID"].map(mortality_counts).fillna(0).astype(int)

    # Replace zeros with NaN to avoid division by zero errors, then compute Mortality Rate
    study_data["Mortality_Rate"] = np.where(
        study_data["Sample_Size"] > 0, 
        study_data["Mortality_Count"] / study_data["Sample_Size"], 
        np.nan  # Set to NaN if division by zero would occur
    )

    return study_data

# Apply function
study_data = calculate_mortality_count(study_data=study_data, outcome_data=outcome_data)

In [1578]:
# Calculate complication rate
def calculate_complication_rate(study_data, complication_data):

    pass

# study_data = calculate_complication_rate(study_data)

In [1579]:
# Calculate intentional counts
def calculate_intention_counts(intention_data, motivation_data):

    intention_map = motivation_data.groupby("Study_ID")["Intentional"].value_counts().unstack(fill_value=0)

    # Create a mapping series for each category while ensuring missing keys return 0
    intentional_map = intention_map.get("Yes", pd.Series(0, index=intention_data["Study_ID"]))
    non_intentional_map = intention_map.get("No", pd.Series(0, index=intention_data["Study_ID"]))
    unknown_map = intention_map.get("Unknown", pd.Series(0, index=intention_data["Study_ID"]))

    # Only overwrite NaN values
    intention_data["N_Intentional_Ingestion"] = intention_data["N_Intentional_Ingestion"].fillna(intention_data["Study_ID"].map(intentional_map))
    intention_data["N_Non_Intentional_Ingestion"] = intention_data["N_Non_Intentional_Ingestion"].fillna(intention_data["Study_ID"].map(non_intentional_map))
    intention_data["N_Unknown_Intention"] = intention_data["N_Unknown_Intention"].fillna(intention_data["Study_ID"].map(unknown_map))

    return intention_data

intention_data = calculate_intention_counts(intention_data=intention_data, motivation_data=motivation_data)

In [1580]:
# Calculate Psychiatric History count
def calculate_n_psych_history(demographic_data, patient_data):

    psych_history_data = patient_data.groupby("Study_ID")["Psychiatric_History"].value_counts().unstack(fill_value=0)

    psych_history_map = psych_history_data.get("Yes", pd.Series(0, index=demographic_data["Study_ID"]))

    demographic_data["N_Psych_History"] = demographic_data["N_Psych_History"].fillna(demographic_data["Study_ID"].map(psych_history_map))

    return demographic_data

demographic_data = calculate_n_psych_history(demographic_data=demographic_data, patient_data=patient_data)

In [1581]:
# calculate_time_ingestion_to_presentation
def calculate_time_ingestion_to_presentation(object_raw_data):

    # Conversion factors
    time_multipliers = {
        'hours': 1,
        'days': 24,
        'weeks': 7 * 24,
        'months': 30.44 * 24,  # Average month length in hours
        'years': 365.25 * 24   # Account for leap years
    }

    # Function to convert time expressions to hours
    def convert_to_hours(time_str):
        if isinstance(time_str, str) and time_str.lower() == "unknown":
            return np.nan  # Keep unknowns as NaN

        match = re.match(r"([\d\.]+)(\w+)", str(time_str))
        if match:
            value, unit = match.groups()
            value = float(value)  # Convert number part to float
            if unit in time_multipliers:
                return value * time_multipliers[unit]  # Convert to hours
        return np.nan  # If it doesn't match expected format

    # Apply conversion, only overwriting NaN values
    object_raw_data["Time_Ingestion_To_Presentation_Hrs"] = object_raw_data["Time_Ingestion_To_Presentation_Hrs"].fillna(
        object_raw_data["Time_In_Situ_At_Presentation_Long"].apply(convert_to_hours)
    )

    return object_raw_data

object_raw_data = calculate_time_ingestion_to_presentation(object_raw_data=object_raw_data)

In [1582]:
# Calculate Time to Ingestion Categories
def calculate_time_to_presentation_type(object_raw_data):
    """
    Categorises ingestion time into different time types.
    
    Parameters:
    - object_data (pd.DataFrame): DataFrame containing 'Time_Ingestion_To_Presentation_Hrs'
    
    Returns:
    - object_data (pd.DataFrame): Updated DataFrame with 'Time_Ingestion_To_Presentation_Type'
    """

    # Define bins and labels
    bins = [0, 12, 48, np.inf]
    labels = ["<12hrs", "12-48hrs", ">48hrs"]

    # Categorize using pd.cut
    object_raw_data["Time_Ingestion_To_Presentation_Type"] = pd.cut(
        object_raw_data["Time_Ingestion_To_Presentation_Hrs"], bins=bins, labels=labels, include_lowest=True
    ).astype(str)  # Convert to string to allow "Unknown"

    # Assign "Unknown" where ingestion time is NaN
    object_raw_data.loc[object_raw_data["Time_Ingestion_To_Presentation_Hrs"].isna(), "Time_Ingestion_To_Presentation_Type"] = "Unknown"

    return object_raw_data

# Apply function to object_data
object_raw_data = calculate_time_to_presentation_type(object_raw_data)

In [1583]:
# Calculate time ingestion to intervention
def calculate_time_ingestion_to_intervention(intervention_data):

    # Conversion factors
    time_multipliers = {
        'hours': 1,
        'days': 24,
        'weeks': 7 * 24,
        'months': 30.44 * 24,  # Average month length in hours
        'years': 365.25 * 24   # Account for leap years
    }

    # Function to convert time expressions to hours
    def convert_to_hours(time_str):
        if isinstance(time_str, str) and time_str.lower() == "unknown":
            return np.nan  # Keep unknowns as NaN

        match = re.match(r"([\d\.]+)(\w+)", str(time_str))
        if match:
            value, unit = match.groups()
            value = float(value)  # Convert number part to float
            if unit in time_multipliers:
                return value * time_multipliers[unit]  # Convert to hours
        return np.nan  # If it doesn't match expected format

    # Apply conversion, only overwriting NaN values
    intervention_data["Time_Ingestion_To_Intervention_Hrs"] = intervention_data["Time_Ingestion_To_Intervention_Hrs"].fillna(
        intervention_data["Time_Ingestion_To_Intervention_Long"].apply(convert_to_hours)
    )

    return intervention_data

intervention_data = calculate_time_ingestion_to_intervention(intervention_data=intervention_data)


In [1584]:
def calculate_n_time_ingestion_to_presentation(object_raw_data, time_data):
    # valid_time_bins = ["<12hrs", "12-48hrs", ">48hrs"]

    time_window_map = object_raw_data.groupby("Study_ID")["Time_Ingestion_To_Presentation_Type"].value_counts().unstack(fill_value=0)
    
    time_data["N_Time_Ingestion_To_Presentation_<12hrs"] = time_data["N_Time_Ingestion_To_Presentation_<12hrs"].fillna(time_data["Study_ID"].map(time_window_map["<12hrs"]))
    time_data["N_Time_Ingestion_To_Presentation_12-48hrs"] = time_data["N_Time_Ingestion_To_Presentation_12-48hrs"].fillna(time_data["Study_ID"].map(time_window_map["12-48hrs"]))
    time_data["N_Time_Ingestion_To_Presentation_48hrs+"] = time_data["N_Time_Ingestion_To_Presentation_48hrs+"].fillna(time_data["Study_ID"].map(time_window_map[">48hrs"]))
    time_data["N_Time_Ingestion_To_Presentation_Unknown"] = time_data["N_Time_Ingestion_To_Presentation_Unknown"].fillna(time_data["Study_ID"].map(time_window_map["Unknown"]))
    return time_data

time_data = calculate_n_time_ingestion_to_presentation(object_raw_data=object_raw_data, time_data=time_data)
    

In [1585]:
# Categorise object initial location

In [1586]:
# Categorise object final location

In [1587]:
# calculate_n_successful_endoscopy()

    # Extraction success with endoscopy was defined as complete removal of the foreign bodies without complications or surgical intervention

In [1588]:
# Export all data
study_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "study_data.csv"]), index=False)
patient_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "patient_data.csv"]), index=False)
time_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "time_data.csv"]), index=False)
demographic_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "demographic_data.csv"]), index=False)
object_raw_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "object_raw_data.csv"]), index=False)
object_gross_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "object_gross_data.csv"]), index=False)
intention_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "intention_data.csv"]), index=False)
motivation_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "motivation_data.csv"]), index=False)
intervention_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "intervention_data.csv"]), index=False)
outcome_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "outcome_data.csv"]), index=False)
symptom_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "symptom_data.csv"]), index=False)
complication_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "complication_data.csv"]), index=False)
incidental_findings_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "incidental_findings_data.csv"]), index=False)