In [1]:
# Creating a baseline metadata
import pandas as pd
import os
import numpy as np

# Project root; every other path in this notebook is built from it (note the trailing '/').
parent = '/home/jkim/research/peds_cxr/'
# Source table for the baseline: the seed-9 aggregate metadata CSV.
metadata = pd.read_csv(
    os.path.join(parent, '25variation/metadata/final/aggregate/aggregate_metadata_seed_9.csv')
)

def create_correct_patient_id(row):
    """Derive the patient identifier for one metadata row.

    Args:
        row: Mapping-like row exposing 'Dataset' and 'Image Index'.

    Returns:
        For 'VinDR' rows, the image index with every '.png' removed.
        For 'NIH' rows, the part of the image index before its single '_'.
        ``None`` for any other dataset.

    Raises:
        ValueError: For an 'NIH' row whose image index does not split into
            exactly two parts on '_'.
    """
    dataset = row['Dataset']
    if dataset == 'VinDR':
        return row['Image Index'].replace('.png', '')
    if dataset == 'NIH':
        # Strict two-way unpack: the index must contain exactly one underscore.
        prefix, _suffix = row['Image Index'].split('_')
        return prefix
    return None

def categorize_age_group(row):
    """Bucket a row's 'Patient Age' into an age group label.

    Args:
        row: Mapping-like row exposing 'Patient Age'.

    Returns:
        'young' for ages in [0, 5], 'old' for ages in [6, 17], and
        'not applicable' otherwise. Both bounds are inclusive, so values
        strictly between 5 and 6, negative values and NaN all fall through
        to 'not applicable'.
    """
    age = row['Patient Age']
    # (inclusive lower bound, inclusive upper bound, label), checked in order.
    age_bands = ((0, 5, 'young'), (6, 17, 'old'))
    for lower, upper, group in age_bands:
        if lower <= age <= upper:
            return group
    return 'not applicable'

# Derive the age bucket and the dataset-specific patient identifier for every image row.
metadata['Age Group'] = metadata.apply(categorize_age_group, axis=1)
metadata['Patient_id'] = metadata.apply(create_correct_patient_id, axis=1)

# Keep only the columns written to the base metadata file.
column = ['Image Index','Patient_id','Dataset','Patient Age','Age Group','Patient Gender','No Finding','Cardiomegaly','Consolidation','Infiltration','Mass/Nodule','Pneumonia']
# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning on a derived selection.
metadata = metadata[column].copy()

# Create the Positive_Label and Positive_Patient columns
positive_conditions = ['Pneumonia', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule']
# Image-level flag: 1 when at least one of the listed findings is truthy for the row.
metadata['Positive_Label'] = metadata[positive_conditions].any(axis=1).astype(int)
# Patient-level flag: 1 for every row of a patient that has at least one positive image.
positive_patients = metadata.loc[metadata['Positive_Label'] == 1, 'Patient_id'].unique()
metadata['Positive_Patient'] = metadata['Patient_id'].isin(positive_patients).astype(int)


metadata.to_csv(parent + 'no_finding/metadata/base_metadata.csv', index=False)

In [2]:
import os 
# Seeds for which split files are generated; each seed gets its own output directories.
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]

# Create the per-seed output directories up front (exist_ok makes re-running the cell safe).
for seed in predefined_seeds:
    os.makedirs(parent + f'no_finding/metadata/aggregate_sex_nf/aggregate_sex_nf_seed_{seed}', exist_ok=True)
    os.makedirs(parent + f'no_finding/metadata/aggregate_age_nf/aggregate_age_nf_seed_{seed}', exist_ok=True)
    # The metadata_count directory was previously created twice by a duplicated line.
    os.makedirs(parent + f'no_finding/experiment/metadata_count/seed{seed}', exist_ok=True)


In [4]:
import pandas as pd
import random
import os

# Read back the base metadata table.
# `parent` already ends with '/', so this path contains a doubled '/'.
file_path = parent + '/no_finding/metadata/base_metadata.csv'
df = pd.read_csv(file_path)

# (male %, female %) pairs used for the validation and training sets.
ratios = [(male_pct, 100 - male_pct) for male_pct in (0, 25, 50, 75, 100)]
# Finding columns that are checked for age balance.
diagnostic_labels = ['Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']
# Columns summed when printing label counts: 'No Finding' plus the findings above.
dl = ['No Finding'] + diagnostic_labels

# Function to calculate and print diagnostic label counts for each set
def print_label_counts(df, set_name):
    """Print per-gender column sums of the label columns for one split.

    Args:
        df: Frame with 'Set', 'Patient Gender' and the columns listed in the
            module-level ``dl``.
        set_name: Split name; rows are selected when their 'Set' value
            contains it (``str.contains`` match, not equality).

    Returns:
        None. The counts are written to stdout only.
    """
    print(f"\nDiagnostic Label Counts for {set_name} Set:")
    for gender_group in ('M', 'F'):
        print(f"Gender Group: {gender_group}")
        in_split = df['Set'].str.contains(set_name)
        is_gender = df['Patient Gender'] == gender_group
        selected_rows = df[in_split & is_gender]
        print(selected_rows[dl].sum())

# Helper function to assign patients to a given set
def assign_patients_to_set(df, gender_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient=None):
    """Assign whole patients of one gender to ``set_name`` until an image budget is met.

    Candidate rows are the still-unassigned rows (``Set == ''``) of
    ``gender_group``. When ``positive_patient == 1`` every row of a positive
    patient is a candidate; otherwise only rows matching both
    ``positive_label`` and ``positive_patient`` are. Patients are shuffled
    with ``seed`` and taken greedily: a patient is assigned only if its label
    count still fits into ``total_count``; patients that do not fit are
    skipped and later ones may still be taken.

    Args:
        df: Frame with 'Patient Gender', 'Set', 'Patient_id', 'Positive_Label'
            and 'Positive_Patient'. It is modified in place.
        gender_group: Value matched against 'Patient Gender'.
        positive_patient: Value matched against 'Positive_Patient'.
        positive_label: Value matched against 'Positive_Label' (used for
            selection and counting only when ``positive_patient != 1``).
        total_count: Maximum number of counted images to assign.
        set_name: Value written to 'Set' for assigned patients.
        seed: Seed for the shuffle of candidate patients. Note that this
            reseeds the global ``random`` module.
        max_images_per_patient: When truthy, patients with more candidate
            rows than this are excluded. ``None`` (or 0) disables the limit.

    Returns:
        Tuple ``(df, assigned_patients)``: the same frame object and the set
        of patient ids assigned by this call.
    """
    candidates = df[(df['Patient Gender'] == gender_group) & (df['Set'] == '')]

    # Narrow candidates to the requested positive / negative population.
    if positive_patient == 1:
        candidates = candidates[candidates['Positive_Patient'] == positive_patient]
    else:
        candidates = candidates[
            (candidates['Positive_Label'] == positive_label)
            & (candidates['Positive_Patient'] == positive_patient)
        ]

    def _within_image_limit(patient_rows):
        # Honour the caller's limit; it used to be hard-coded to 3 regardless
        # of the value passed in.
        if not max_images_per_patient:
            return True
        return len(patient_rows) <= max_images_per_patient

    patients = candidates.groupby('Patient_id').filter(_within_image_limit)

    # random.shuffle expects a mutable sequence, so shuffle a plain list
    # instead of the array returned by unique().
    unique_patients = list(patients['Patient_id'].unique())

    # Shuffle patients reproducibly.
    random.seed(seed)
    random.shuffle(unique_patients)

    count = 0  # Number of counted images assigned so far
    assigned_patients = set()

    for patient_id in unique_patients:
        # All rows of the patient in df, not only the candidate rows.
        is_patient = df['Patient_id'] == patient_id
        patient_rows = df[is_patient]
        if positive_patient == 1:
            label_count = patient_rows['Positive_Label'].sum()  # Positive images
        else:
            label_count = len(patient_rows[patient_rows['Positive_Label'] == positive_label])  # Matching images

        # Take the patient only if the budget is not exceeded; otherwise skip it.
        if count + label_count <= total_count:
            df.loc[is_patient, 'Set'] = set_name
            count += label_count
            assigned_patients.add(patient_id)

        if count >= total_count:
            break

    return df, assigned_patients

# Function to check if each diagnostic label in each set contains at least 2 images for old and young groups
def check_age_balance_per_label(df, set_name, min_count=2):
    """Check that every diagnostic label has enough 'old' and 'young' positives in a split.

    For each label in the module-level ``diagnostic_labels``, rows are
    selected whose 'Set' contains ``set_name``, whose 'Positive_Label' is 1
    and whose label column is 1. The check stops at the first label that
    fails and prints its counts.

    Args:
        df: Frame with 'Set', 'Positive_Label', 'Age Group' and the label columns.
        set_name: Split name matched with ``str.contains``.
        min_count: Minimum number of rows required in each age group.

    Returns:
        True when every label has at least ``min_count`` 'old' and
        ``min_count`` 'young' rows, otherwise False.
    """
    for label in diagnostic_labels:
        selected = (df['Set'].str.contains(set_name)) & (df['Positive_Label'] == 1) & (df[label] == 1)
        age_groups = df.loc[selected, 'Age Group']
        old_count = int((age_groups == 'old').sum())
        young_count = int((age_groups == 'young').sum())
        enough_of_both = old_count >= min_count and young_count >= min_count
        if not enough_of_both:
            print(f"{set_name} set for label {label} does not meet age balance. Old: {old_count}, Young: {young_count}")
            return False
    return True

# Function to perform assignments with checking conditions and changing seeds if conditions aren't met
def assign_with_conditions(df, gender_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient=None, min_count=1, retry_limit=2000):
    """Assign patients to a split, retrying with new seeds until the age balance holds.

    Each attempt works on a copy of the currently unassigned rows
    (``Set == ''``), runs ``assign_patients_to_set`` on it, and accepts the
    result only if ``check_age_balance_per_label`` passes. On success the copy
    is written back into ``df`` with ``DataFrame.update``. On failure the seed
    is incremented by one and the attempt is repeated.

    Args:
        df: Full metadata frame; updated in place on success.
        gender_group: Gender value to assign ('M' or 'F').
        positive_patient: Forwarded to ``assign_patients_to_set``.
        positive_label: Forwarded to ``assign_patients_to_set``.
        total_count: Image budget. A value of 0 for 'M' or 'F' skips the
            assignment entirely.
        set_name: Split name written to 'Set'.
        seed: First seed to try.
        max_images_per_patient: Forwarded to ``assign_patients_to_set``.
        min_count: Minimum rows per age group and label required by the check.
        retry_limit: Maximum number of attempts.

    Returns:
        ``df`` (the same object), either untouched when skipped or updated
        with the accepted assignment.

    Raises:
        RuntimeError: When no attempt within ``retry_limit`` satisfies the
            age-balance condition.
    """
    # Nothing to assign when this sex group's share is 0.
    if gender_group in ('M', 'F') and total_count == 0:
        print(f"Skipping {set_name} assignment for {gender_group} as the ratio is 0.")
        return df

    df_temp = None      # Result of the most recent attempt, kept for diagnostics
    last_seed = seed    # Seed actually used by the most recent attempt
    retry_count = 0
    while retry_count < retry_limit:  # Retry until conditions are met or the limit is reached
        last_seed = seed
        # Work on a copy of the unassigned rows so a rejected attempt leaves df untouched.
        df_temp = df[df['Set'] == ''].copy()

        # Perform the assignment
        df_temp, _ = assign_patients_to_set(df_temp, gender_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient)

        # df_temp only holds rows that were unassigned before this attempt, so
        # the check covers exactly what this attempt assigned.
        if check_age_balance_per_label(df_temp, set_name, min_count):
            # NOTE(review): update() may upcast integer columns to float — confirm if dtypes matter.
            df.update(df_temp)
            print(f"Conditions met for {set_name} with seed {seed}.")
            return df

        retry_count += 1
        # Move on to the next seed
        seed = seed + 1
        print(f"Conditions not met for {set_name}. Changing seed to {seed} and retrying... (Attempt {retry_count}/{retry_limit})")

    # Retry limit reached: report the seed that was actually tried last
    # (``seed`` has already been advanced past it).
    error_message = f"Retry limit of {retry_limit} reached for {set_name}. The last seed {last_seed} did not meet the requirement."
    print(error_message)

    # Print details of the unmet requirements before raising; this used to sit
    # after the raise and never ran.
    if df_temp is not None:
        for label in diagnostic_labels:
            df_filtered = df_temp[(df_temp['Set'].str.contains(set_name)) & (df_temp['Positive_Label'] == 1) & (df_temp[label] == 1)]
            old_count = df_filtered[df_filtered['Age Group'] == 'old'].shape[0]
            young_count = df_filtered[df_filtered['Age Group'] == 'young'].shape[0]
            print(f"Label {label} in {set_name} set: Old: {old_count}, Young: {young_count}")

    raise RuntimeError(error_message)

# Create the datasets using the new function
# Generated split frames, keyed by 'df_ratio_<male>_<female>_seed_<seed>'.
dfs = {}

# The test set is assigned once and shared by every ratio/seed combination.
df['Set'] = ''  # Start with every row unassigned
male_ratio = 1
female_ratio = 1
# Positives first (with the age-balance check), then negatives, males before females.
for sex in ('M', 'F'):
    df = assign_with_conditions(df, sex, 1, 1, 157, 'test', seed=10, max_images_per_patient=3)
for sex in ('M', 'F'):
    df = assign_patients_to_set(df, sex, 0, 0, 378, 'test', seed=10, max_images_per_patient=3)[0]

print_label_counts(df, 'test')

# One dataset per (seed, ratio): validation and training sets drawn from the non-test rows.
for seed_idx, seed in enumerate(predefined_seeds):
    for ratio_idx, (male_ratio, female_ratio) in enumerate(ratios):
        dataset_key = f'df_ratio_{male_ratio}_{female_ratio}_seed_{seed}'

        # Start from a copy that already carries the test assignment and register it
        # right away; the helpers below modify and return this same object.
        split_df = df.copy()
        dfs[dataset_key] = split_df

        male_share = male_ratio / 100
        female_share = female_ratio / 100

        # Validation set (157 positive, 378 negative), split by the sex ratio.
        validation_set_name = 'val'
        split_df = assign_with_conditions(split_df, 'M', 1, 1, int(157 * male_share), validation_set_name, seed, max_images_per_patient=3)
        split_df = assign_with_conditions(split_df, 'F', 1, 1, int(157 * female_share), validation_set_name, seed, max_images_per_patient=3)
        split_df = assign_patients_to_set(split_df, 'M', 0, 0, int(378 * male_share), validation_set_name, seed, max_images_per_patient=3)[0]
        split_df = assign_patients_to_set(split_df, 'F', 0, 0, int(378 * female_share), validation_set_name, seed, max_images_per_patient=3)[0]

        # Training set (1255 positive, 3023 negative), no per-patient image limit.
        training_set_name = 'train'
        split_df = assign_with_conditions(split_df, 'M', 1, 1, int(1255 * male_share), training_set_name, seed)
        split_df = assign_with_conditions(split_df, 'F', 1, 1, int(1255 * female_share), training_set_name, seed)
        split_df = assign_patients_to_set(split_df, 'M', 0, 0, int(3023 * male_share), training_set_name, seed)[0]
        split_df = assign_patients_to_set(split_df, 'F', 0, 0, int(3023 * female_share), training_set_name, seed)[0]
        dfs[dataset_key] = split_df

        # Persist the finished split; the file is named after the male percentage.
        output_path = os.path.join(parent, f'no_finding/metadata/aggregate_sex_nf/aggregate_sex_nf_seed_{seed}/sex_{male_ratio}_seed_{seed}.csv')
        split_df.to_csv(output_path, index=False)


test set for label Cardiomegaly does not meet age balance. Old: 0, Young: 9
Conditions not met for test. Changing seed to 11 and retrying... (Attempt 1/2000)
Conditions met for test with seed 11.
test set for label Cardiomegaly does not meet age balance. Old: 0, Young: 15
Conditions not met for test. Changing seed to 11 and retrying... (Attempt 1/2000)
test set for label Cardiomegaly does not meet age balance. Old: 0, Young: 12
Conditions not met for test. Changing seed to 12 and retrying... (Attempt 2/2000)
Conditions met for test with seed 12.

Diagnostic Label Counts for test Set:
Gender Group: M
Cardiomegaly      14.0
Consolidation     17.0
Infiltration     102.0
Mass/Nodule       42.0
Pneumonia         53.0
dtype: float64
Gender Group: F
Cardiomegaly       8.0
Consolidation     18.0
Infiltration     121.0
Mass/Nodule       29.0
Pneumonia         59.0
dtype: float64
Skipping val assignment for M as the ratio is 0.
Conditions met for val with seed 2358.
Skipping train assignment for

In [5]:


# For every seed/ratio combination, print the per-gender label counts of each split.
for seed in predefined_seeds:
    for male_ratio, female_ratio in ratios:
        dataset_key = f'df_ratio_{male_ratio}_{female_ratio}_seed_{seed}'
        print(f"\nRatio {male_ratio}:{female_ratio}, Seed {seed}\n")

        # Same reporting order as the splits were built: test, then val, then train.
        for split_name in ('test', 'val', 'train'):
            print_label_counts(dfs[dataset_key], split_name)


Ratio 0:100, Seed 2358


Diagnostic Label Counts for test Set:
Gender Group: M
Cardiomegaly      14.0
Consolidation     17.0
Infiltration     102.0
Mass/Nodule       42.0
Pneumonia         53.0
dtype: float64
Gender Group: F
Cardiomegaly       8.0
Consolidation     18.0
Infiltration     121.0
Mass/Nodule       29.0
Pneumonia         59.0
dtype: float64

Diagnostic Label Counts for val Set:
Gender Group: M
Cardiomegaly     0.0
Consolidation    0.0
Infiltration     0.0
Mass/Nodule      0.0
Pneumonia        0.0
dtype: float64
Gender Group: F
Cardiomegaly      12.0
Consolidation     13.0
Infiltration     115.0
Mass/Nodule       43.0
Pneumonia         63.0
dtype: float64

Diagnostic Label Counts for train Set:
Gender Group: M
Cardiomegaly     0.0
Consolidation    0.0
Infiltration     0.0
Mass/Nodule      0.0
Pneumonia        0.0
dtype: float64
Gender Group: F
Cardiomegaly     105.0
Consolidation    166.0
Infiltration     872.0
Mass/Nodule      281.0
Pneumonia        301.0
dtype: float64

R

In [6]:
import pandas as pd
# Male percentages used in the split file names.
fives = ['0', '25', '50', '75', '100']
# Label columns whose values are summed per group.
disease_labels = ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']

for seed in predefined_seeds:
    for five in fives:
        # Load the split file for this seed and male percentage
        file_path = parent + f'no_finding/metadata/aggregate_sex_nf/aggregate_sex_nf_seed_{seed}/sex_{five}_seed_{seed}.csv'
        data = pd.read_csv(file_path)

        # Per ('Patient Gender', 'Set') group: sum every label column and count the images.
        # NOTE(review): rows left unassigned (empty 'Set') are presumably read back as NaN
        # and dropped by groupby — confirm that is intended.
        aggregations = {label: 'sum' for label in disease_labels}
        aggregations['Image Index'] = 'count'
        grouped_counts_diseases = (
            data.groupby(['Patient Gender', 'Set'])
            .agg(aggregations)
            .reset_index()
            .rename(columns={'Image Index': 'Total Images'})
        )

        # Export to CSV
        # NOTE(review): the file name ends in '.csv.csv'; kept as-is because downstream
        # readers may depend on it — confirm before renaming.
        output_file_path = parent + f'no_finding/experiment/metadata_count/seed{seed}/sex_{five}_seed_{seed}.csv.csv'
        grouped_counts_diseases.to_csv(output_file_path, index=False)

        print(f"Data exported successfully to {output_file_path}")

Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/sex_0_seed_2358.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/sex_25_seed_2358.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/sex_50_seed_2358.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/sex_75_seed_2358.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/sex_100_seed_2358.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed4563/sex_0_seed_4563.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed4563/sex_25_seed_4563.csv.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_c