# Data Extraction

In [306]:
# Import notebook setup
from msc_code.scripts.notebook_setup import *

Specify the methods used to collect data from reports, including how many reviewers collected data from each report, whether they worked independently, any processes for obtaining or confirming data from study investigators, and if applicable, details of automation tools used in the process.

In [307]:
# Import first author (JGE) screened results as pandas dataframe.
import_path = os.path.join(PROC_DATA_DIR, "full_text_screen", "jge_included.csv")
jge_included_df = pd.read_csv(import_path)

In [308]:
jge_included_df.columns

Index(['id', 'Publication Year', 'Authors', 'Title', 'Publication Title',
       'Database', 'Exclude', 'Reason ID', 'Paediatric', 'Intention Reported',
       'Deliberate intention', 'Unclear', 'Accessed', 'Comments'],
      dtype='object')

In [309]:
# Create dataframe of results to proceed with data extraction whilst second author reviews subset of results.
jge_data_extraction_df = jge_included_df[['id', 'Publication Year', 'Authors', 'Title', 'Publication Title']].set_index('id')

jge_data_extraction_df.to_csv("/".join([PROC_DATA_DIR, 'data_extraction', 'data_extraction_list.csv']))

In [310]:
# Set import path for data_extraction.xlsx
import_path = os.path.join(RAW_DATA_DIR, "data_extraction", "data_extraction.xlsx")

# Import study_data
study_data = pd.read_excel(import_path, sheet_name="study_data") # Import study_data
patient_data = pd.read_excel(import_path, sheet_name="patient_data") # Import patient_data
object_data = pd.read_excel(import_path, sheet_name="object_data") # Import object_data
motivation_data = pd.read_excel(import_path, sheet_name="motivation_data") # Import motivation_data
intervention_data = pd.read_excel(import_path, sheet_name="intervention_data") # Import motivation_data
outcome_data = pd.read_excel(import_path, sheet_name="outcome_data") # Import outcome_data
symptom_data = pd.read_excel(import_path, sheet_name="symptom_data") # Import symptom_data
complication_data = pd.read_excel(import_path, sheet_name="complication_data") # Import complication_data
incidental_findings_data = pd.read_excel(import_path, sheet_name="incidental_findings_data") # Import incidental_findings_data


### Calculate Object Diameters

In [311]:
# Check for duplicates
if object_data['Object_ID'].duplicated().any():
    raise ValueError("Duplicate Object_IDs found!")

object_dimensions = object_data[['Object_ID', 'Object_Length_cm', 'Object_Width_cm', 'Object_Height_cm']].set_index('Object_ID')

object_dimensions.replace("Unknown", np.nan, inplace=True)
object_dimensions = object_dimensions.astype(float)

# Compute Object Diameter
def compute_diameter(row):
    available_values = row.dropna()  # Get non-null values in each row
    
    if len(available_values) == 1:
        return available_values.iloc[0]  # If only one dimension value exists, use it directly to compute object diameter
    
    elif len(available_values) == 2:
        return np.sqrt(sum(available_values**2))  # Use Pythagorean theorem if two values exist
    
    elif len(available_values) == 3:
        return max(available_values)  # If all three exist, take the max
    
    return np.nan  # If no values exist, return NaN

# Apply function row-wise
object_dimensions['Object_Diameter_cm'] = object_dimensions.apply(compute_diameter, axis=1)

# Create Object Diameter
object_dimensions.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "object_dimensions.csv"]))

  object_dimensions.replace("Unknown", np.nan, inplace=True)


### Populate study_data from patient_data

#### Calculate Sample Size

In [312]:
# Define function to calculate sample size
def calculate_sample_size(study_data, patient_data):

    # Count occurrences of Study_ID in patient_data
    patient_count = patient_data['Study_ID'].value_counts()

    # Map counts to study_data where Sample_Size is NaN
    study_data['Sample_Size'] = study_data.apply(
        lambda row: patient_count.get(row['Study_ID'], row['Sample_Size']) if pd.isna(row['Sample_Size']) else row['Sample_Size'],
        axis=1
    )

    # Ensure integer values
    study_data['Sample_Size'] = study_data['Sample_Size'].astype(int)

    return study_data

study_data = calculate_sample_size(study_data=study_data, patient_data=patient_data)


In [313]:
# Define Function to calculate Age_Min_Years in study_data with data from patient_data where currently blank
def calculate_age_low(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the minimum Age_Years for each Study_ID
    age_min_map = patient_data.groupby("Study_ID")["Age_Years"].min()

    # Only fill NaN values in Age_Low_Years
    study_data["Age_Low_Years"] = study_data["Age_Low_Years"].fillna(study_data["Study_ID"].map(age_min_map))
        
    return study_data

study_data = calculate_age_low(study_data=study_data, patient_data=patient_data)

  patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)


In [314]:
# Define function to calculate Age_Max_Years in study_data with data from patient_data where currently blank
def calculate_age_high(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the maximum Age_Years for each Study_ID
    age_max_map = patient_data.groupby("Study_ID")["Age_Years"].max()

    # Only fill NaN values in Age_High_Years
    study_data["Age_High_Years"] = study_data["Age_High_Years"].fillna(study_data["Study_ID"].map(age_max_map))

    return study_data

study_data = calculate_age_high(study_data=study_data, patient_data=patient_data)

In [315]:
# Define function to calculate Age_Mean_Years in study_data with data from patient_data where currently blank 
def calculate_age_mean(study_data, patient_data):

    # Replace "Unknown" values with NaN
    patient_data['Age_Years'] = patient_data['Age_Years'].replace("Unknown", np.nan)

    # Compute the maximum Age_Years for each Study_ID
    age_mean_map = patient_data.groupby("Study_ID")["Age_Years"].mean()

    # Only fill NaN values in Age_High_Years
    study_data["Age_Mean_Years"] = study_data["Age_Mean_Years"].fillna(study_data["Study_ID"].map(age_mean_map))

    return study_data

study_data = calculate_age_mean(study_data=study_data, patient_data=patient_data)

In [316]:
# Define function to calculate N_Female
def calculate_gender_counts(study_data, patient_data):
    # Ensure Gender values are consistent
    valid_genders = ["Male", "Female", "Unknown"]
    patient_data = patient_data[patient_data["Gender"].isin(valid_genders)]

    # Compute gender counts per Study_ID
    gender_counts = patient_data.groupby("Study_ID")["Gender"].value_counts().unstack(fill_value=0)

    # Ensure all expected columns exist
    for gender in valid_genders:
        if gender not in gender_counts.columns:
            gender_counts[gender] = 0  # Add missing gender columns

    # Map gender counts directly to the corresponding columns in study_data
    study_data["N_Gender_Male"] = study_data["Study_ID"].map(gender_counts.get("Male", {})).fillna(0).astype(int)
    study_data["N_Gender_Female"] = study_data["Study_ID"].map(gender_counts.get("Female", {})).fillna(0).astype(int)
    study_data["N_Gender_Unknown"] = study_data["Study_ID"].map(gender_counts.get("Unknown", {})).fillna(0).astype(int)

    return study_data

# Apply function
study_data = calculate_gender_counts(study_data=study_data, patient_data=patient_data)


In [317]:
# Define function to calculate mortality rate in study_data from outcome_data
def calculate_mortality_count(study_data, outcome_data):
    # Ensure mortality recording is consistent
    valid_mortality_outcomes = ["Yes", "No", "Unknown"]
    outcome_data = outcome_data[outcome_data["Mortality"].isin(valid_mortality_outcomes)]

    # Drop duplicate Patient_ID to ensure each patient is counted only once
    unique_mortality = outcome_data.drop_duplicates(subset="Patient_ID")

    # Compute mortality counts per Study_ID
    mortality_counts = unique_mortality[unique_mortality["Mortality"] == "Yes"].groupby("Study_ID")["Patient_ID"].count()

    # Map mortality counts to study_data (fill NaN with 0)
    study_data["Mortality_Count"] = study_data["Study_ID"].map(mortality_counts).fillna(0).astype(int)

    # Replace zeros with NaN to avoid division by zero errors, then compute Mortality Rate
    study_data["Mortality_Rate"] = np.where(
        study_data["Sample_Size"] > 0, 
        study_data["Mortality_Count"] / study_data["Sample_Size"], 
        np.nan  # Set to NaN if division by zero would occur
    )

    return study_data

# Apply function
study_data = calculate_mortality_count(study_data=study_data, outcome_data=outcome_data)

In [318]:
# Calculate complication rate
def calculate_complication_rate(study_data, complication_data):

    pass

# study_data = calculate_complication_rate(study_data)

In [319]:


    



# Export study_Data to CSV
study_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "study_data.csv"]))


#### Calculate Gender Distribution

In [None]:
# Export all data
study_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "study_data.csv"]))
patient_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "patient_data.csv"]))
object_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "object_data.csv"]))
motivation_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "motivation_data.csv"]))
intervention_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "intervention_data.csv"]))
outcome_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "outcome_data.csv"]))
symptom_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "symptom_data.csv"]))
complication_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "motivation_data.csv"]))
incidental_findings_data.to_csv("/".join([PROC_DATA_DIR, "data_extraction", "incidental_findings_data.csv"]))