In [48]:
import pandas as pd
parent = '/home/jkim/research/peds_cxr/'

# Raw metadata table that every later step derives from.
base = pd.read_csv(f'{parent}no_finding/metadata/base_metadata.csv')

# Collect the patient ids seen under each 'Age Group' value.
is_young = base['Age Group'] == 'young'
is_old = base['Age Group'] == 'old'
young_patient_ids = set(base.loc[is_young, 'Patient_id'])
old_patient_ids = set(base.loc[is_old, 'Patient_id'])

# Patients with rows in BOTH age groups, sorted so the printout is stable.
common_patient_ids = young_patient_ids & old_patient_ids
patient_ids_to_check = sorted(common_patient_ids)
in_both_groups = base['Patient_id'].isin(patient_ids_to_check)

# Show the overlapping ids, how many there are, and their rows per age group.
print(patient_ids_to_check)
print(len(patient_ids_to_check))
print(base.loc[in_both_groups, 'Age Group'].value_counts())

# Archive the 'young' rows of the overlapping patients, then drop exactly those
# rows so each remaining patient belongs to a single age group.
df1 = base[in_both_groups & is_young]
df1.to_csv(f'{parent}no_finding/experiment/young_intersect.csv', index=False)
df = base.drop(index=df1.index)

# Positive_Label: 1 when the row has any of the positive conditions.
# Positive_Patient: 1 when the patient has at least one row with Positive_Label == 1.
positive_conditions = ['Pneumonia', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule']
has_positive_condition = df[positive_conditions].any(axis=1)
df['Positive_Label'] = has_positive_condition.astype(int)
positive_patients = df.loc[df['Positive_Label'] == 1, 'Patient_id'].unique()
df['Positive_Patient'] = df['Patient_id'].isin(positive_patients).astype(int)

# Persist the filtered, labelled metadata for the later cells.
df.to_csv(f'{parent}no_finding/metadata/base_metadata_age.csv', index=False)

# Compare shapes to see how many rows were dropped.
print("Original DataFrame shape:", base.shape)
print("Filtered DataFrame shape:", df.shape)

['00002555', '00003074', '00004028', '00004472', '00005065', '00005146', '00005724', '00005834', '00005864', '00006180', '00006294', '00007087', '00010152', '00010360', '00010431', '00010460', '00011689', '00012654', '00014323', '00014352', '00014474', '00015553', '00015996', '00016054', '00016484', '00017538', '00018251', '00018445', '00018458', '00018778', '00019045', '00021571', '00029174']
33
old      85
young    68
Name: Age Group, dtype: int64
Original DataFrame shape: (12924, 14)
Filtered DataFrame shape: (12856, 14)


In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Every summary below is broken down by the same two demographic columns.
group_columns = ['Age Group', 'Patient Gender']

# One row per positive patient, then count patients per (age group, gender).
positive_patients_df = df.loc[df['Positive_Patient'] == 1].drop_duplicates(subset='Patient_id')
result = (
    positive_patients_df
    .groupby(group_columns)
    .size()
    .reset_index(name='Number of Positive Patients')
)
print(result)

# Same count for patients without any positive row.
negative_patients_df = df.loc[df['Positive_Patient'] == 0].drop_duplicates(subset='Patient_id')
negative_result = (
    negative_patients_df
    .groupby(group_columns)
    .size()
    .reset_index(name='Number of Negative Patients')
)
print(negative_result)

# Row-level totals of each diagnostic label per (age group, gender).
diagnostic_labels = [
    'No Finding',
    'Cardiomegaly',
    'Consolidation',
    'Infiltration',
    'Mass/Nodule',
    'Pneumonia',
]
grouped_counts = df.groupby(group_columns)[diagnostic_labels].sum()
print(grouped_counts)

# Number of rows flagged Positive_Label per (age group, gender).
positive_label_counts = df.groupby(group_columns)['Positive_Label'].sum()
print(positive_label_counts)


  Age Group Patient Gender  Number of Positive Patients
0       old              F                          213
1       old              M                          309
2     young              F                          947
3     young              M                         1450
  Age Group Patient Gender  Number of Negative Patients
0       old              F                          656
1       old              M                          856
2     young              F                         2252
3     young              M                         3139
                          No Finding  Cardiomegaly  Consolidation  \
Age Group Patient Gender                                            
old       F                     1491            45            105   
          M                     1921            98            165   
young     F                     2288            80             90   
          M                     3177           145            142   

                         

In [51]:
import pandas as pd
import random
import os

# Reload the age-filtered metadata written by the first cell.
file_path = f'{parent}no_finding/metadata/base_metadata_age.csv'
df = pd.read_csv(file_path)

# One seed per repetition; each seed is combined with every age ratio below.
predefined_seeds = [
    2358, 4563, 7894, 13289, 15892,
    19756, 23890, 27456, 30890, 34905,
    38764, 42136, 46578, 50349, 54892,
    59871, 63912, 68354, 72109, 76845,
    80492, 84910, 89357, 93450, 97910,
]

# (old %, young %) mixes, always summing to 100.
ratios = [(old_pct, 100 - old_pct) for old_pct in (0, 25, 50, 75, 100)]

# Positive diagnostic labels used by the counting and gender-balance helpers.
diagnostic_labels = [
    'Cardiomegaly',
    'Consolidation',
    'Infiltration',
    'Mass/Nodule',
    'Pneumonia',
]

# Function to calculate and print diagnostic label counts for each set
def print_label_counts(df, set_name, labels=None):
    """Print per-age-group sums of the diagnostic label columns for one set.

    Args:
        df: DataFrame with 'Set', 'Age Group' and one column per label.
        set_name: Pattern matched against the 'Set' column with
            ``Series.str.contains`` (so it matches as a substring/regex,
            not by strict equality).
        labels: Label columns to sum. Defaults to the module-level
            ``diagnostic_labels`` as it is bound at call time.

    Returns:
        None. The counts are printed for 'old' first, then 'young'.
    """
    if labels is None:
        labels = diagnostic_labels
    labels = list(labels)

    # na=False keeps the mask strictly boolean when 'Set' has missing values
    # (e.g. a frame read back from CSV, where '' becomes NaN).
    in_set = df['Set'].str.contains(set_name, na=False)

    print(f"\nDiagnostic Label Counts for {set_name} Set:")
    for age_group in ['old', 'young']:
        print(f"Age Group: {age_group}")
        df_set = df[in_set & (df['Age Group'] == age_group)]
        print(df_set[labels].sum())

# Helper function to assign patients to a given set
def assign_patients_to_set(df, age_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient=None):
    """Greedily tag whole patients of one age group with ``set_name``.

    Candidates are rows of ``age_group`` whose 'Set' is still ''. Patients are
    shuffled deterministically from ``seed`` and added one at a time as long
    as the running image count stays within ``total_count``; a patient that
    would overshoot is skipped and later patients are still tried.

    Args:
        df: DataFrame with 'Patient_id', 'Age Group', 'Set', 'Positive_Label'
            and 'Positive_Patient' columns. Modified in place.
        age_group: Value of 'Age Group' to draw patients from.
        positive_patient: When 1, candidates are rows with
            Positive_Patient == 1 and a patient counts as the sum of its
            'Positive_Label' values. Otherwise candidates are rows matching
            both ``positive_label`` and ``positive_patient`` and a patient
            counts as its number of rows equal to ``positive_label``.
        positive_label: 'Positive_Label' value used in the non-positive case.
        total_count: Maximum number of counted images to assign.
        set_name: Value written to 'Set' for every row of a chosen patient.
        seed: Seed for the patient shuffle.
        max_images_per_patient: If truthy, patients with more candidate rows
            than this are excluded. A falsy value disables the cap.

    Returns:
        (df, assigned_patients): the same DataFrame object and the set of
        patient ids that were assigned.
    """
    # Only unassigned rows of the requested age group can supply patients.
    df_age_group = df[(df['Age Group'] == age_group) & (df['Set'] == '')]

    if positive_patient == 1:
        candidates = df_age_group[df_age_group['Positive_Patient'] == positive_patient]
    else:
        candidates = df_age_group[
            (df_age_group['Positive_Label'] == positive_label)
            & (df_age_group['Positive_Patient'] == positive_patient)
        ]

    def _within_image_cap(patient_rows):
        # Honour the caller's cap; it used to be hard-coded to 3 here.
        if not max_images_per_patient:
            return True
        return len(patient_rows) <= max_images_per_patient

    patients = candidates.groupby('Patient_id').filter(_within_image_cap)
    unique_patients = patients['Patient_id'].unique().tolist()

    # A private generator yields the same permutation as seeding the global
    # one, without disturbing the process-wide random state.
    random.Random(seed).shuffle(unique_patients)

    # Per-patient image counts over ALL rows of df for that patient (not just
    # the candidate rows), computed once instead of rescanning df per patient.
    relevant = df[df['Patient_id'].isin(unique_patients)]
    if positive_patient == 1:
        per_patient = relevant.groupby('Patient_id')['Positive_Label'].sum()
    else:
        matches = (relevant['Positive_Label'] == positive_label).astype(int)
        per_patient = matches.groupby(relevant['Patient_id']).sum()
    label_counts = per_patient.to_dict()

    count = 0  # Number of counted images assigned so far
    assigned_patients = set()

    for patient_id in unique_patients:
        label_count = label_counts[patient_id]

        # Take the patient only if the quota is not exceeded.
        if count + label_count <= total_count:
            count += label_count
            assigned_patients.add(patient_id)

        if count >= total_count:
            break

    # A single write tags every row of every chosen patient, including rows
    # that did not count toward the quota.
    if assigned_patients:
        df.loc[df['Patient_id'].isin(list(assigned_patients)), 'Set'] = set_name

    return df, assigned_patients

# Function to check that each diagnostic label in a set has at least `min_count` male and `min_count` female positive images
def check_gender_balance_per_label(df, set_name, min_count, labels=None):
    """Check that every label has enough male and female positive rows.

    Args:
        df: DataFrame with 'Set', 'Positive_Label', 'Patient Gender' and one
            column per label.
        set_name: Pattern matched against 'Set' with ``Series.str.contains``.
        min_count: Minimum number of 'M' rows and of 'F' rows required for
            each label.
        labels: Label columns to check. Defaults to the module-level
            ``diagnostic_labels`` as it is bound at call time.

    Returns:
        True when every label meets the minimum for both genders. Otherwise
        prints the first failing label and returns False.
    """
    if labels is None:
        labels = diagnostic_labels

    # na=False keeps the mask strictly boolean when 'Set' has missing values.
    positive_in_set = df['Set'].str.contains(set_name, na=False) & (df['Positive_Label'] == 1)

    for label in labels:
        genders = df.loc[positive_in_set & (df[label] == 1), 'Patient Gender']
        male_count = int((genders == 'M').sum())
        female_count = int((genders == 'F').sum())
        if male_count < min_count or female_count < min_count:
            print(f"{set_name} set for label {label} does not meet gender balance. M: {male_count}, F: {female_count}")
            return False
    return True

# Function to perform assignments with checking conditions and changing seeds if conditions aren't met
def assign_with_conditions(df, age_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient=None, min_count=2, retry_limit=1000):
    """Assign patients to ``set_name``, retrying with new seeds until balanced.

    Each attempt runs ``assign_patients_to_set`` on a copy of the rows whose
    'Set' is still '' and accepts the result only if
    ``check_gender_balance_per_label`` passes on that copy. After a failed
    attempt the seed is incremented by one and the attempt is repeated.

    Args:
        df: DataFrame to update. Modified in place on success.
        age_group, positive_patient, positive_label, total_count, set_name,
            max_images_per_patient: Forwarded to ``assign_patients_to_set``.
        seed: Seed of the first attempt.
        min_count: Minimum per-gender count forwarded to the balance check.
        retry_limit: Maximum number of attempts.

    Returns:
        The same DataFrame object. It is returned unchanged when
        ``total_count`` is 0.

    Raises:
        RuntimeError: If no attempt within ``retry_limit`` passes the check.
    """
    # A zero quota means there is nothing to assign, whatever the age group.
    # Without this, the balance check could never pass and every retry would
    # be burned before raising.
    if total_count == 0:
        print(f"Skipping {set_name} assignment for {age_group} as the ratio is 0.")
        return df

    # df is not modified between attempts, so select the pool once.
    unassigned = df[df['Set'] == '']
    last_seed_tried = seed

    for attempt in range(1, retry_limit + 1):
        last_seed_tried = seed
        # Work on a copy so a rejected attempt leaves df untouched.
        df_temp = unassigned.copy()
        df_temp, _ = assign_patients_to_set(df_temp, age_group, positive_patient, positive_label, total_count, set_name, seed, max_images_per_patient)

        # df_temp started fully unassigned, so the check sees only this attempt's rows.
        if check_gender_balance_per_label(df_temp, set_name, min_count):
            df.loc[df_temp.index[df_temp['Set'] == set_name], 'Set'] = set_name
            print(f"Conditions met for {set_name} with seed {seed}, {positive_label}")
            return df

        seed = seed + 1
        print(f"Conditions not met for {set_name}. Changing seed to {seed} and retrying... (Attempt {attempt}/{retry_limit})")

    # Report the seed that was actually tried last, not the next untried one.
    error_message = f"Retry limit of {retry_limit} reached for {set_name}. The last seed {last_seed_tried} did not meet the requirement."
    print(error_message)
    raise RuntimeError(error_message)

# Create the datasets using the new function
dfs = {}  # Dictionary to hold the generated DataFrames, keyed by ratio and seed

# Image quotas per split: (positive images, negative images).
TEST_QUOTA = (155, 341)    # applied to EACH age group in the shared test set
VAL_QUOTA = (155, 341)     # shared between age groups according to the ratio
TRAIN_QUOTA = (1240, 2729)  # shared between age groups according to the ratio
SMALL_SPLIT_IMAGE_CAP = 3  # per-patient image cap used for the test and val sets


def _fill_split(frame, set_name, quota, age_shares, seed, max_images_per_patient=None):
    """Assign positives (with the gender-balance check) and then negatives for each age group."""
    n_positive, n_negative = quota
    for age_group, pct in age_shares:
        frame = assign_with_conditions(frame, age_group, 1, 1, int(n_positive * (pct / 100)), set_name, seed, max_images_per_patient=max_images_per_patient)
    for age_group, pct in age_shares:
        frame = assign_patients_to_set(frame, age_group, 0, 0, int(n_negative * (pct / 100)), set_name, seed, max_images_per_patient=max_images_per_patient)[0]
    return frame


# First, assign the test set (this will remain the same across all splits)
df['Set'] = ''  # Reset the 'Set' column
df = _fill_split(df, 'test', TEST_QUOTA, (('old', 100), ('young', 100)), seed=10, max_images_per_patient=SMALL_SPLIT_IMAGE_CAP)

print_label_counts(df, 'test')

# Now, iterate over each ratio and seed to create 125 datasets
for seed in predefined_seeds:
    for old_ratio, young_ratio in ratios:
        print(f'{seed}, old ratio {old_ratio}')

        # Start from a copy that already carries the shared test assignment,
        # and register it right away so a partial result is kept on failure.
        dataset_key = f'df_ratio_{old_ratio}_{young_ratio}_seed_{seed}'
        split_df = df.copy()
        dfs[dataset_key] = split_df

        age_shares = (('old', old_ratio), ('young', young_ratio))

        # Validation set (155 positive, 341 negative), then training set
        # (1240 positive, 2729 negative; no per-patient image cap).
        split_df = _fill_split(split_df, 'val', VAL_QUOTA, age_shares, seed, max_images_per_patient=SMALL_SPLIT_IMAGE_CAP)
        split_df = _fill_split(split_df, 'train', TRAIN_QUOTA, age_shares, seed)
        dfs[dataset_key] = split_df

        # Save the generated DataFrame, creating the per-seed folder if it is
        # missing so to_csv cannot fail on a missing directory.
        output_path = os.path.join(parent, f'no_finding/metadata/aggregate_age_nf/aggregate_age_nf_seed_{seed}/age_{old_ratio}_seed_{seed}.csv')
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        split_df.to_csv(output_path, index=False)


Conditions met for test with seed 10, 1
Conditions met for test with seed 10, 1

Diagnostic Label Counts for test Set:
Age Group: old
Cardiomegaly      11
Consolidation     19
Infiltration     105
Mass/Nodule       29
Pneumonia         20
dtype: int64
Age Group: young
Cardiomegaly      14
Consolidation     11
Infiltration     109
Mass/Nodule       45
Pneumonia         53
dtype: int64
2358, old ratio 0
Skipping val assignment for old as the ratio is 0.
Conditions met for val with seed 2358, 1
Skipping train assignment for old as the ratio is 0.
Conditions met for train with seed 2358, 1
2358, old ratio 25
val set for label Cardiomegaly does not meet gender balance. M: 0, F: 1
Conditions not met for val. Changing seed to 2359 and retrying... (Attempt 1/1000)
val set for label Cardiomegaly does not meet gender balance. M: 1, F: 3
Conditions not met for val. Changing seed to 2360 and retrying... (Attempt 2/1000)
val set for label Cardiomegaly does not meet gender balance. M: 0, F: 2
Condit

In [52]:
# From here on the reports also include the 'No Finding' column.
diagnostic_labels = ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']

# Function to calculate and print diagnostic label counts for each set
def print_label_counts(df, set_name, labels=None):
    """Print per-age-group sums of the diagnostic label columns for one set.

    Args:
        df: DataFrame with 'Set', 'Age Group' and one column per label.
        set_name: Pattern matched against the 'Set' column with
            ``Series.str.contains`` (so it matches as a substring/regex,
            not by strict equality).
        labels: Label columns to sum. Defaults to the module-level
            ``diagnostic_labels`` as it is bound at call time.

    Returns:
        None. The counts are printed for 'old' first, then 'young'.
    """
    if labels is None:
        labels = diagnostic_labels
    labels = list(labels)

    # na=False keeps the mask strictly boolean when 'Set' has missing values
    # (e.g. a frame read back from CSV, where '' becomes NaN).
    in_set = df['Set'].str.contains(set_name, na=False)

    print(f"\nDiagnostic Label Counts for {set_name} Set:")
    for age_group in ['old', 'young']:
        print(f"Age Group: {age_group}")
        df_set = df[in_set & (df['Age Group'] == age_group)]
        print(df_set[labels].sum())

# For every (seed, ratio) dataset, print the label counts of its test,
# validation and training sets, in that order.
for seed_idx, seed in enumerate(predefined_seeds):
    for ratio_idx, (old_ratio, young_ratio) in enumerate(ratios):
        dataset_key = f'df_ratio_{old_ratio}_{young_ratio}_seed_{seed}'
        print(f"\nRatio {old_ratio}:{young_ratio}, Seed {seed}\n")

        split_frame = dfs[dataset_key]
        for split_name in ('test', 'val', 'train'):
            print_label_counts(split_frame, split_name)


Ratio 0:100, Seed 2358


Diagnostic Label Counts for test Set:
Age Group: old
No Finding       403
Cardiomegaly      11
Consolidation     19
Infiltration     105
Mass/Nodule       29
Pneumonia         20
dtype: int64
Age Group: young
No Finding       341
Cardiomegaly      14
Consolidation     11
Infiltration     109
Mass/Nodule       45
Pneumonia         53
dtype: int64

Diagnostic Label Counts for val Set:
Age Group: old
No Finding       0.0
Cardiomegaly     0.0
Consolidation    0.0
Infiltration     0.0
Mass/Nodule      0.0
Pneumonia        0.0
dtype: float64
Age Group: young
No Finding       341
Cardiomegaly      14
Consolidation      8
Infiltration     113
Mass/Nodule       37
Pneumonia         68
dtype: int64

Diagnostic Label Counts for train Set:
Age Group: old
No Finding       0.0
Cardiomegaly     0.0
Consolidation    0.0
Infiltration     0.0
Mass/Nodule      0.0
Pneumonia        0.0
dtype: float64
Age Group: young
No Finding       2761
Cardiomegaly      122
Consolidation     1

In [53]:
import pandas as pd
import os
fives = ['0', '25', '50', '75', '100']

# Define the disease labels for summing up their counts (same for every file)
disease_labels = ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']

# Sum every disease label and count images per group; built once, reused for all files.
count_aggregations = {label: 'sum' for label in disease_labels}
count_aggregations['Image Index'] = 'count'

for seed in predefined_seeds:
    # exist_ok=True already covers the "directory exists" case, so no separate
    # (race-prone) os.path.exists check is needed.
    count_dir = parent + f'no_finding/experiment/metadata_count/seed{seed}/'
    os.makedirs(count_dir, exist_ok=True)

    for five in fives:
        # Load the data
        file_path = parent + f'no_finding/metadata/aggregate_age_nf/aggregate_age_nf_seed_{seed}/age_{five}_seed_{seed}.csv'
        data = pd.read_csv(file_path)

        # Group by 'Age Group' and 'Set'; rows with a missing 'Set' are left
        # out by groupby's default handling of missing keys.
        grouped_counts_diseases = (
            data.groupby(['Age Group', 'Set'])
            .agg(count_aggregations)
            .reset_index()
            .rename(columns={'Image Index': 'Total Images'})
        )

        # Export to CSV
        output_file_path = count_dir + f'age_{five}_seed_{seed}.csv'
        grouped_counts_diseases.to_csv(output_file_path, index=False)

        print(f"Data exported successfully to {output_file_path}")


Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/age_0_seed_2358.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/age_25_seed_2358.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/age_50_seed_2358.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/age_75_seed_2358.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed2358/age_100_seed_2358.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed4563/age_0_seed_4563.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed4563/age_25_seed_4563.csv
Data exported successfully to /home/jkim/research/peds_cxr/no_finding/experiment/metadata_count/seed4563/age_50_seed_45