In [1]:
# aggregate sex count
import pandas as pd

def calculate_gender_distribution_by_dataset(file_path, parent, output_name='aggregate_sex_count.csv'):
    """Count positive findings per dataset and label, split by patient sex.

    Reads the metadata CSV at ``file_path`` and, for every value of the
    'Dataset' column plus a pooled 'All' entry, counts the rows whose label
    column equals 1, broken down into 'M', 'F' and 'O'. The table is written
    to ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Gender' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    # read_csv turns empty cells into NaN rather than '', so a plain
    # replace('' -> 'O') never fires. Everything that is not 'M' or 'F'
    # (NaN, blanks, 'O', unexpected codes) is counted as 'O' so that
    # M + F + O always equals Total.
    raw_sex = data['Patient Gender'].astype(object)
    sex = raw_sex.where(raw_sex.isin(['M', 'F']), 'O')

    labels = [
        'No Finding', 'Cardiomegaly', 'Consolidation',
        'Infiltration', 'Mass/Nodule', 'Pneumonia'
    ]

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    rows = []
    for dataset, in_scope in scopes:
        for label in labels:
            positive = in_scope & (data[label] == 1)
            tallies = sex[positive].value_counts()
            rows.append({
                'Dataset': dataset,
                'Label': label,
                'M': int(tallies.get('M', 0)),
                'F': int(tallies.get('F', 0)),
                'O': int(tallies.get('O', 0)),
                'Total': int(positive.sum()),
            })
    counts_df = pd.DataFrame(rows, columns=['Dataset', 'Label', 'M', 'F', 'O', 'Total'])

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    counts_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: sex breakdown for the seed-42 aggregate metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/aggregate/aggregate_metadata_seed_42.csv'))
summary_file_path = calculate_gender_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/aggregate_sex_count.csv


In [None]:

# Build one train/val split per (seed, old-ratio) combination and save each as a CSV.
# Relies on names defined outside this cell: predefined_seeds, ratios_old,
# ratio_names, train_ratio, data, reset_dataset_for_new_seed and allocate_data.
for seed in predefined_seeds:
    np.random.seed(seed)

    # Set the number of images for train and validation sets
    train_size = 3970
    val_size = 496
    conditions_labels = ['Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']
    no_finding_label = 'No Finding'

    # Split data into old and young groups
    # NOTE(review): these subsets are taken once per seed, before the per-ratio
    # reset below — confirm that reset_dataset_for_new_seed does not change
    # anything allocate_data depends on.
    old_data = data[data['Age Group'] == 'old']
    young_data = data[data['Age Group'] == 'young']

    for ratio_old, ratio_name in zip(ratios_old, ratio_names):
        # Reset dataset excluding test set
        data = reset_dataset_for_new_seed(data)
        # Allocate data for specified conditions and 'No Finding'
        train_old, val_old, used_patients = allocate_data(old_data, conditions_labels, no_finding_label, int(train_size * ratio_old), int(val_size * ratio_old), train_ratio)
        # The young group fills whatever is left of the train/val budgets.
        train_young, val_young, used_patients = allocate_data(young_data, conditions_labels, no_finding_label, train_size - len(train_old), val_size - len(val_old), train_ratio)

        # Combine old and young allocations
        train_data = pd.concat([train_old, train_young])
        val_data = pd.concat([val_old, val_young])

        # Set 'Set' column for allocated data
        # (assumes the allocated frames keep the index labels of `data` — TODO confirm)
        data.loc[train_data.index, 'Set'] = 'train'
        data.loc[val_data.index, 'Set'] = 'val'

        # Save the dataset
        file_name = f'aggregate_age_nf_seed_{seed}_ratio_{ratio_name}.csv'
        data.to_csv(file_name, index=False)
        print(f"Dataset for seed {seed}, ratio {ratio_name}% old saved to {file_name}")

In [2]:
# Aggregate age count
import pandas as pd

def calculate_age_distribution_by_dataset(file_path, parent, output_name='aggregate_age_count.csv'):
    """Count positive findings per dataset and label, split by age group.

    Reads the metadata CSV at ``file_path``, assigns every row to the age
    group '0-5', '6-18' or 'Unspecified' based on 'Patient Age', and counts
    for every value of the 'Dataset' column plus a pooled 'All' entry the
    rows whose label column equals 1. The table has one row per
    (Dataset, Label) and one column per age group, and is written to
    ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Age' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    def categorize_age(age):
        """Map one raw age value to '0-5', '6-18' or 'Unspecified'."""
        if pd.isnull(age):
            return 'Unspecified'
        try:
            years = float(age)
        except (TypeError, ValueError):
            # Non-numeric markers such as 'mis'.
            return 'Unspecified'
        # A blank cell makes pandas load the whole column as float, so whole
        # ages arrive as 3.0 rather than '3'; accept those, but still reject
        # negative, fractional, NaN and infinite values.
        if years < 0 or not years.is_integer():
            return 'Unspecified'
        if years <= 5:
            return '0-5'
        if years <= 18:
            return '6-18'
        return 'Unspecified'

    age_group = data['Patient Age'].apply(categorize_age)

    labels = [
        'No Finding', 'Cardiomegaly', 'Consolidation',
        'Infiltration', 'Mass/Nodule', 'Pneumonia'
    ]
    age_groups = ['0-5', '6-18', 'Unspecified']

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    records = []
    for dataset, in_scope in scopes:
        for label in labels:
            tallies = age_group[in_scope & (data[label] == 1)].value_counts()
            # Emit every age group, including zero counts, so the pivot
            # always has all three columns.
            for group in age_groups:
                records.append({
                    'Dataset': dataset,
                    'Age Group': group,
                    'Label': label,
                    'Count': int(tallies.get(group, 0)),
                })
    counts_df = pd.DataFrame(records)

    # Pivot to one row per (Dataset, Label) with one column per age group
    pivot_df = (
        counts_df
        .pivot_table(index=['Dataset', 'Label'], columns='Age Group', values='Count')
        .fillna(0)
        .astype(int)
        .reset_index()
    )

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    pivot_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: age breakdown for the seed-42 aggregate metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/aggregate/aggregate_metadata_seed_42.csv'))
summary_file_path = calculate_age_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/aggregate_age_count.csv


In [3]:
# NIH SEX COUNT
import pandas as pd

def calculate_gender_distribution_by_dataset(file_path, parent, output_name='nih_sex_count.csv'):
    """Count positive findings per dataset and label, split by patient sex.

    Reads the metadata CSV at ``file_path`` and, for every value of the
    'Dataset' column plus a pooled 'All' entry, counts the rows whose label
    column equals 1, broken down into 'M', 'F' and 'O'. The table is written
    to ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Gender' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    # read_csv turns empty cells into NaN rather than '', so a plain
    # replace('' -> 'O') never fires. Everything that is not 'M' or 'F'
    # (NaN, blanks, 'O', unexpected codes) is counted as 'O' so that
    # M + F + O always equals Total.
    raw_sex = data['Patient Gender'].astype(object)
    sex = raw_sex.where(raw_sex.isin(['M', 'F']), 'O')

    labels = [
        'No Finding', 'Cardiomegaly', 'Consolidation',
        'Infiltration', 'Mass/Nodule', 'Pneumonia'
    ]

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    rows = []
    for dataset, in_scope in scopes:
        for label in labels:
            positive = in_scope & (data[label] == 1)
            tallies = sex[positive].value_counts()
            rows.append({
                'Dataset': dataset,
                'Label': label,
                'M': int(tallies.get('M', 0)),
                'F': int(tallies.get('F', 0)),
                'O': int(tallies.get('O', 0)),
                'Total': int(positive.sum()),
            })
    counts_df = pd.DataFrame(rows, columns=['Dataset', 'Label', 'M', 'F', 'O', 'Total'])

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    counts_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: sex breakdown for the seed-42 NIH metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/nih/nih_metadata_seed_42.csv'))
summary_file_path = calculate_gender_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/nih_sex_count.csv


In [4]:
# NIH Age Count
import pandas as pd

def calculate_age_distribution_by_dataset(file_path, parent, output_name='nih_age_count.csv'):
    """Count positive findings per dataset and label, split by age group.

    Reads the metadata CSV at ``file_path``, assigns every row to the age
    group '0-5', '6-18' or 'Unspecified' based on 'Patient Age', and counts
    for every value of the 'Dataset' column plus a pooled 'All' entry the
    rows whose label column equals 1. The table has one row per
    (Dataset, Label) and one column per age group, and is written to
    ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Age' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    def categorize_age(age):
        """Map one raw age value to '0-5', '6-18' or 'Unspecified'."""
        if pd.isnull(age):
            return 'Unspecified'
        try:
            years = float(age)
        except (TypeError, ValueError):
            # Non-numeric markers such as 'mis'.
            return 'Unspecified'
        # A blank cell makes pandas load the whole column as float, so whole
        # ages arrive as 3.0 rather than '3'; accept those, but still reject
        # negative, fractional, NaN and infinite values.
        if years < 0 or not years.is_integer():
            return 'Unspecified'
        if years <= 5:
            return '0-5'
        if years <= 18:
            return '6-18'
        return 'Unspecified'

    age_group = data['Patient Age'].apply(categorize_age)

    labels = [
        'No Finding', 'Cardiomegaly', 'Consolidation',
        'Infiltration', 'Mass/Nodule', 'Pneumonia'
    ]
    age_groups = ['0-5', '6-18', 'Unspecified']

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    records = []
    for dataset, in_scope in scopes:
        for label in labels:
            tallies = age_group[in_scope & (data[label] == 1)].value_counts()
            # Emit every age group, including zero counts, so the pivot
            # always has all three columns.
            for group in age_groups:
                records.append({
                    'Dataset': dataset,
                    'Age Group': group,
                    'Label': label,
                    'Count': int(tallies.get(group, 0)),
                })
    counts_df = pd.DataFrame(records)

    # Pivot to one row per (Dataset, Label) with one column per age group
    pivot_df = (
        counts_df
        .pivot_table(index=['Dataset', 'Label'], columns='Age Group', values='Count')
        .fillna(0)
        .astype(int)
        .reset_index()
    )

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    pivot_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: age breakdown for the seed-42 NIH metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/nih/nih_metadata_seed_42.csv'))
summary_file_path = calculate_age_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/nih_age_count.csv


In [5]:
# vindr SEX COUNT
import pandas as pd

def calculate_gender_distribution_by_dataset(file_path, parent, output_name='vindr_sex_count.csv'):
    """Count positive findings per dataset and label, split by patient sex.

    Reads the metadata CSV at ``file_path`` and, for every value of the
    'Dataset' column plus a pooled 'All' entry, counts the rows whose label
    column equals 1, broken down into 'M', 'F' and 'O'. The table is written
    to ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Gender' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    # read_csv turns empty cells into NaN rather than '', so a plain
    # replace('' -> 'O') never fires. Everything that is not 'M' or 'F'
    # (NaN, blanks, 'O', unexpected codes) is counted as 'O' so that
    # M + F + O always equals Total.
    raw_sex = data['Patient Gender'].astype(object)
    sex = raw_sex.where(raw_sex.isin(['M', 'F']), 'O')

    labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    rows = []
    for dataset, in_scope in scopes:
        for label in labels:
            positive = in_scope & (data[label] == 1)
            tallies = sex[positive].value_counts()
            rows.append({
                'Dataset': dataset,
                'Label': label,
                'M': int(tallies.get('M', 0)),
                'F': int(tallies.get('F', 0)),
                'O': int(tallies.get('O', 0)),
                'Total': int(positive.sum()),
            })
    counts_df = pd.DataFrame(rows, columns=['Dataset', 'Label', 'M', 'F', 'O', 'Total'])

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    counts_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: sex breakdown for the seed-42 VinDr metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/vindr/vindr_metadata_seed_42.csv'))
summary_file_path = calculate_gender_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/vindr_sex_count.csv


In [6]:
# vindr Age Count
import pandas as pd

def calculate_age_distribution_by_dataset(file_path, parent, output_name='vindr_age_count.csv'):
    """Count positive findings per dataset and label, split by age group.

    Reads the metadata CSV at ``file_path``, assigns every row to the age
    group '0-5', '6-18' or 'Unspecified' based on 'Patient Age', and counts
    for every value of the 'Dataset' column plus a pooled 'All' entry the
    rows whose label column equals 1. The table has one row per
    (Dataset, Label) and one column per age group, and is written to
    ``parent + '25variation/manuscript/' + output_name``.

    Args:
        file_path: Path of the metadata CSV. It must contain the columns
            'Dataset', 'Patient Age' and one column per label.
        parent: Path prefix of the project. It is concatenated as-is, so it
            must end with a path separator.
        output_name: File name of the summary CSV inside
            '25variation/manuscript/'.

    Returns:
        The path of the summary CSV that was written.
    """
    data = pd.read_csv(file_path)

    def categorize_age(age):
        """Map one raw age value to '0-5', '6-18' or 'Unspecified'."""
        if pd.isnull(age):
            return 'Unspecified'
        try:
            years = float(age)
        except (TypeError, ValueError):
            # Non-numeric markers such as 'mis'.
            return 'Unspecified'
        # A blank cell makes pandas load the whole column as float, so whole
        # ages arrive as 3.0 rather than '3'; accept those, but still reject
        # negative, fractional, NaN and infinite values.
        if years < 0 or not years.is_integer():
            return 'Unspecified'
        if years <= 5:
            return '0-5'
        if years <= 18:
            return '6-18'
        return 'Unspecified'

    age_group = data['Patient Age'].apply(categorize_age)

    labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]
    age_groups = ['0-5', '6-18', 'Unspecified']

    # One row mask per dataset source, followed by the pooled 'All' scope.
    scopes = [(name, data['Dataset'] == name) for name in data['Dataset'].unique().tolist()]
    scopes.append(('All', pd.Series(True, index=data.index, dtype=bool)))

    records = []
    for dataset, in_scope in scopes:
        for label in labels:
            tallies = age_group[in_scope & (data[label] == 1)].value_counts()
            # Emit every age group, including zero counts, so the pivot
            # always has all three columns.
            for group in age_groups:
                records.append({
                    'Dataset': dataset,
                    'Age Group': group,
                    'Label': label,
                    'Count': int(tallies.get(group, 0)),
                })
    counts_df = pd.DataFrame(records)

    # Pivot to one row per (Dataset, Label) with one column per age group
    pivot_df = (
        counts_df
        .pivot_table(index=['Dataset', 'Label'], columns='Age Group', values='Count')
        .fillna(0)
        .astype(int)
        .reset_index()
    )

    # Save the summary DataFrame to a new CSV file
    summary_file_path = parent + '25variation/manuscript/' + output_name
    pivot_df.to_csv(summary_file_path, index=False)

    return summary_file_path

# Example run: age breakdown for the seed-42 VinDr metadata file
parent = '/home/jkim/research/peds_cxr/'
file_path = ''.join((parent, '25variation/metadata/final/vindr/vindr_metadata_seed_42.csv'))
summary_file_path = calculate_age_distribution_by_dataset(file_path=file_path, parent=parent)
# Report where the summary CSV ended up
print(f"The summary has been saved to {summary_file_path}")


The summary has been saved to /home/jkim/research/peds_cxr/25variation/manuscript/vindr_age_count.csv


In [7]:
# Intersectional Analysis 
