In [None]:
# Prepare the file lists (image_paths / train / val / test) and labels.csv used to train the 25 aggregate models
import pandas as pd 
import os 
import numpy as np
import subprocess

# File paths and setup
parent = '/home/jkim/research/peds_cxr/'
# Predefined 25 random seeds
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]

### set source
metadata_source = parent + 'no_finding/metadata/aggregate_age_nf/' 

# Build the split file lists and the label table for every (seed, ratio) dataset.
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        image_source = parent + f'no_finding/images/age/{seed}/age_{five}%_seed{seed}'
        # The dataset directory is otherwise only created by the later JSON-config
        # cell, so create it here to keep the notebook runnable top to bottom.
        os.makedirs(image_source, exist_ok=True)

        # Load the data from the CSV file into a pandas DataFrame
        aggregate_metadata_path = metadata_source + f'aggregate_age_nf_seed_{seed}/age_{five}_seed_{seed}.csv'
        aggregate_metadata = pd.read_csv(aggregate_metadata_path)

        # image_paths.txt: every image whose 'Set' is anything but "not assigned"
        filtered_data = aggregate_metadata[aggregate_metadata['Set'] != "not assigned"]
        image_path = filtered_data['Image Index']
        image_path.to_csv(image_source + '/image_paths.txt', sep=',', index=False, header=False)

        # One headerless file list per split; the test list is named aggregate_test.txt.
        split_files = {'train': 'train.txt', 'val': 'val.txt', 'test': 'aggregate_test.txt'}
        split_images = {}
        for split_name, file_name in split_files.items():
            in_split = aggregate_metadata['Set'] == split_name
            split_images[split_name] = aggregate_metadata.loc[in_split, 'Image Index']
            split_images[split_name].to_csv(image_source + '/' + file_name, sep=',', index=False, header=False)

        # labels.csv: label columns of the assigned images, with 'Image Index' renamed to 'Image'
        label_columns = ['Image Index', 'No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']
        aggregate_labels = filtered_data[label_columns].rename(columns={'Image Index': 'Image'})
        aggregate_labels.to_csv(image_source + '/labels.csv', sep=',', index=False)

        # confirming the number of train val and test.
        print(f'age, seed: {seed}, {five}%')
        print("# of train:", len(split_images['train']))
        print('# of val:', len(split_images['val']))
        print('# of test:', len(split_images['test']))
        print('# of image path:', len(image_path))
        print('# of labels:', len(aggregate_labels))


In [None]:
# Create json file. 

import json
import os

# Loop through each seed to generate JSON files
# Write one config JSON into every (seed, ratio) dataset directory.
for seed in predefined_seeds:
    # The config content depends only on the seed, so build it once per seed.
    json_template = {
        "data_dir": f"/home/jkim/research/peds_cxr/no_finding/images/age/{seed}/",
        "labels": ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule' , 'Pneumonia'],
        "dataset_nicenames": {
            "nihcxr14_test": "NIH CXR14",
            "chexpert_test": "CheXpert",
            "mimic_test": "MIMIC",
            "padchest_test": "PadChest",
            "rsna_pneumonia_test_100": "RSNA Pneumonia",
            "rsna_pneumonia_DeiT-B_test_100": "RSNA Pneumonia DeiT-B",
            # This key used to appear twice in the literal; only the last value
            # ("Aggregate DenseNet121") ever reached the file, so keep just that one.
            "Aggregate_DenseNet121_aggregate_test": "Aggregate DenseNet121"
        }
    }

    for five in ['0', '25', '50', '75', '100']:
        # Construct the new directory path
        new_data_dir = f"/home/jkim/research/peds_cxr/no_finding/images/age/{seed}/age_{five}%_seed{seed}/"

        # Create the new directory if it doesn't exist
        if not os.path.exists(new_data_dir):
            print(f"The directory {new_data_dir} does not exist. Creating it.")
            os.makedirs(new_data_dir)

        # Specify the path for the new JSON file
        new_json_path = os.path.join(new_data_dir, f"cfg_aggregate_{seed}.json")

        # Write the JSON file
        with open(new_json_path, 'w') as f:
            json.dump(json_template, f, indent=4)

        print(f"Created new JSON file at {new_json_path}")

# Batch-create the training result directories
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        # Construct the new directory path for results
        new_result_dir = f"/home/jkim/research/peds_cxr/no_finding/results/age/train/{seed}/age_{five}%_seed{seed}/"

        # exist_ok makes this a no-op for directories that are already there,
        # without a separate (racy) existence check.
        os.makedirs(new_result_dir, exist_ok=True)

In [None]:

# NOTE(review): this rebinds predefined_seeds to the last four entries of the
# 25-seed list defined at the top of the notebook, so this cell only trains
# those four seeds. Restore the full list to train all 25 models.
predefined_seeds = [84910, 89357, 93450, 97910]
#training 25 aggregate models
optimizers = ['SGD']
initial_lrs = [0.01]
weight_decays = [1e-5]
dropouts = [0.5]
gpu='0' 

parent = '/home/jkim/research/peds_cxr/'
import subprocess
from itertools import product

training_ratios = ['0', '25', '50', '75', '100']

# Iterate over every (seed, ratio, hyperparameter) combination; product() visits
# them in the same order as the equivalent nested for-loops.
for seed, five, optimizer, initial_lr, weight_decay, dropout in product(
        predefined_seeds, training_ratios, optimizers, initial_lrs, weight_decays, dropouts):
    dataset_name = f'age_{five}%_seed{seed}'

    # Construct the command with updated hyperparameters
    command = [
        'python3',
        parent + 'transformer-radiographs/train_cxr.py',
        '--cfg-dir',
        parent + f'no_finding/images/age/{seed}/{dataset_name}/cfg_aggregate_{seed}.json',
        '--dataset',
        dataset_name,
        '--labels-set',
        'labels',
        '--architecture',
        'DenseNet121',
        '--results-dir',
        parent + f'no_finding/results/age/train/{seed}/{dataset_name}/',
        '--optimizer-family',
        optimizer,
        '--dropout',
        str(dropout),
        '--weight-decay',
        str(weight_decay),
        '--initial-lr',
        str(initial_lr),
        '--drop-factor',
        '0.1',
        '--plateau-patience',
        '3',
        '--plateau-threshold',
        '1e-4',
        '--break-patience',
        '5',
        '--train-transform',
        'peds',
        '--train-file',
        'train.txt',
        '--val-file',
        'val.txt',
        '--use-gpus',
        gpu
    ]

    # Execute the command. A failed run is reported (as the testing cell does)
    # instead of being silently ignored, and the remaining runs still execute.
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed for seed {seed}, {five}%: {e}")

In [8]:
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]


# testing 25 aggregate models - DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_1705144996.txt
import subprocess
import pandas as pd 
import os
import glob
# Batch-create the test result directories
for seed in predefined_seeds:
    # `test_group` instead of `dir`: a notebook-level loop variable named `dir`
    # would shadow the builtin for every cell executed afterwards.
    for test_group in ['all', 'young', 'old']:
        for five in ['0', '25', '50', '75', '100']:
            # Construct the new directory path for results
            new_result_dir = parent + f"no_finding/results/age/test/{test_group}/{seed}/age_{five}_{seed}_{test_group}/"

            # Create the directory; a no-op when it already exists
            os.makedirs(new_result_dir, exist_ok=True)

# For each (seed, ratio) dataset, write the test images of each age group to
# aggregate_test_young.txt / aggregate_test_old.txt in the dataset directory.
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        aggregate_metadata_path = parent + f'no_finding/metadata/aggregate_age_nf/aggregate_age_nf_seed_{seed}/age_{five}_seed_{seed}.csv'
        aggregate_metadata = pd.read_csv(aggregate_metadata_path)

        ages = ['young', 'old']
        for age in ages:
            # Rows that are both in the test split and in this age group
            in_test_split = aggregate_metadata['Set'] == 'test'
            in_age_group = aggregate_metadata['Age Group'] == age
            test_index = aggregate_metadata[in_test_split & in_age_group]

            image_source = parent + f'no_finding/images/age/{seed}/age_{five}%_seed{seed}/'
            aggregate_test_output = f'{image_source}aggregate_test_{age}.txt'

            # One image name per line, no header
            aggregate_test_txt = test_index['Image Index']
            aggregate_test_txt.to_csv(aggregate_test_output, sep=',', index=False, header=False)

In [None]:
# testing 25 aggregate models - DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_1705144996.txt
import os
import subprocess
import pandas as pd 
import glob


ages = ['old', 'young']
# Run test_cxr.py for every trained model on the old and on the young test list.
for age in ages:
    for seed in predefined_seeds:
        for five in ['0', '25', '50', '75', '100']:
            # Glob pattern for the trained model; '*' stands for the run suffix
            train_result_dir = f"no_finding/results/age/train/{seed}/age_{five}%_seed{seed}/"
            model_file_glob = "DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_*_model.pt"
            search_pattern = os.path.join(parent, train_result_dir + model_file_glob)

            matching_files = glob.glob(search_pattern)

            # Nothing trained for this dataset: report and move on
            if len(matching_files) == 0:
                print(f"No matching model files found for seed {seed}.")
                continue

            # Only one match is expected; the first one returned by glob is used
            model_file_path = matching_files[0]

            image_source = parent + f"no_finding/images/age/{seed}/age_{five}%_seed{seed}/"
            options = {
                "--cfg-dir": image_source + f"cfg_aggregate_{seed}.json",
                "--dataset": f'age_{five}%_seed{seed}',
                "--labels-set": "labels",
                "--model-state": model_file_path,
                "--model-type": "DenseNet121",
                "--results-dir": parent + f"no_finding/results/age/test/{age}/{seed}/age_{five}_{seed}_{age}/",
                "--test-file": image_source + "aggregate_test_" + age + ".txt",
                "--use-gpus": gpu,
            }
            command = ["python3", parent + "transformer-radiographs/test_cxr.py"]
            for flag, value in options.items():
                command.extend([flag, value])

            # Run the command; a failing run is reported and the loop continues
            try:
                subprocess.run(command, check=True)
            except subprocess.CalledProcessError as e:
                print(f"An error occurred while executing the command: {e}")

In [10]:
import pickle
import pandas as pd
import os 
import glob

# setting directories
parent = '/home/jkim/research/peds_cxr/'

ages = ['old','young']
age_file_name = ['_old', '_young']
# Convert each test-result pickle into a CSV with columns y / yhat / file.
for age, age_name in zip(ages, age_file_name):
    for seed in predefined_seeds:
        for five in ['0', '25', '50', '75', '100']:
            # Build the pattern to search for the result pickle of this run
            search_pattern = os.path.join(parent, f"no_finding/results/age/test/{age}/{seed}/age_{five}_{seed}_{age}/DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_*.pkl")

            # Use glob to find the file
            matching_files = glob.glob(search_pattern)

            # Check if any matching files are found
            if not matching_files:
                print(f"No matching result pickle found for seed {seed}, {five}% ({age}).")
                continue  # Skip to the next iteration

            # The first match returned by glob is used
            pkl_directory = matching_files[0]

            directory = parent + f'no_finding/results/age/unpickle/{age}/'
            os.makedirs(directory, exist_ok=True)

            # Open the pickle file and load the data.
            # NOTE: pickle.load can execute arbitrary code; only load pickles
            # produced by our own test runs.
            with open(pkl_directory, "rb") as f:
                data = pickle.load(f)

            # Extract the nested dictionary. Fail with the missing key's name
            # rather than an opaque KeyError: 'y' on an empty fallback dict.
            result_key = f'age_{five}%_seed{seed}_DenseNet121_aggregate_test_{age}'
            if result_key not in data:
                raise KeyError(f"{result_key!r} not found in {pkl_directory}; available keys: {list(data)}")
            nested_dict = data[result_key]

            # Create a DataFrame from the nested dictionary with keys 'y', 'yhat', and 'file'
            df_nested = pd.DataFrame({
                "y": nested_dict["y"],
                "yhat": nested_dict["yhat"],
                "file": nested_dict["file"]
            })

            # Define the file path
            nested_file_path = directory + f'aggregate_{seed}_{age}_{five}_pkl.csv'

            # Save the DataFrame to a CSV file
            df_nested.to_csv(nested_file_path, index=False)


In [11]:
# Saving Pkl file as csv file 

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
import ast
import os
import matplotlib.pyplot as plt

# Changing pickle file from dictionary format to list format 
ages = ['old', 'young']

# Make sure the analysis output directories exist
analysis_root = parent + 'no_finding/results/age/analysis/'
for sub_dir in ('', 'plot/age', 'csv'):
    if not os.path.exists(analysis_root + sub_dir):
        os.makedirs(analysis_root + sub_dir)

for age in ages:
    os.makedirs(parent + f'no_finding/results/age/analysis/unpickledcsv/{age}', exist_ok=True)

# Column names for the per-label true values; predictions get a '_pred' suffix
aggregate_label = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

for seed in predefined_seeds:
    for age in ages:
        for five in ['0', '25', '50', '75', '100']:
            # CSV written by the unpickling cell
            pkl_dir = parent + f'no_finding/results/age/unpickle/{age}/aggregate_{seed}_{age}_{five}_pkl.csv'
            aggregate_pkl = pd.read_csv(pkl_dir)

            # 'y' and 'yhat' are stored as string representations of lists; parse them back
            for list_column in ('y', 'yhat'):
                aggregate_pkl[list_column] = aggregate_pkl[list_column].apply(ast.literal_eval)

            # One column per label for the true values and for the predictions
            y_true_df = pd.DataFrame(aggregate_pkl['y'].tolist(), columns=aggregate_label)
            y_pred_df = pd.DataFrame(aggregate_pkl['yhat'].tolist(), columns=aggregate_label)

            # Side-by-side frame: true labels followed by '<label>_pred' columns
            result_df = pd.concat([y_true_df, y_pred_df.add_suffix('_pred')], axis=1)

            output_csv = parent + f'no_finding/results/age/analysis/unpickledcsv/{age}/aggregate_{seed}_{age}_{five}_pkl.csv'
            result_df.to_csv(output_csv, index=False)


In [None]:
# stats and plotting for all aggregate data 
import os
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, auc
from scipy.stats import sem

# Initialize variables
labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

def calculate_metrics_and_auroc(df, label):
    """Compute AUROC and threshold-based metrics for one label.

    Parameters
    ----------
    df : DataFrame with a column ``label`` (true values) and a column
        ``f"{label}_pred"`` (predicted scores).
    label : name of the label column to evaluate.

    Returns
    -------
    list
        ``[auroc, sensitivity, specificity, ppv, npv, f1_score, fpr, fnr,
        optimal_threshold]``, where the threshold maximises Youden's index
        (tpr - fpr) on the ROC curve and all other metrics come from the
        confusion matrix at that threshold.
    """
    y_true = df[label]
    y_score = df[f"{label}_pred"]

    auroc = roc_auc_score(y_true, y_score)

    # Youden's index: pick the ROC point with the largest tpr - fpr
    roc_fpr, roc_tpr, roc_thresholds = roc_curve(y_true, y_score)
    best_idx = np.argmax(roc_tpr - roc_fpr)
    optimal_threshold = roc_thresholds[best_idx]

    # Binarise the scores at that threshold and count outcomes
    y_pred = (y_score >= optimal_threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    # PPV, NPV and F1 fall back to 0 when their denominator is 0
    ppv = 0 if tp + fp == 0 else tp / (tp + fp)
    npv = 0 if tn + fn == 0 else tn / (tn + fn)
    if ppv + sensitivity != 0:
        f1_score = 2 * (ppv * sensitivity) / (ppv + sensitivity)
    else:
        f1_score = 0
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)

    return [auroc, sensitivity, specificity, ppv, npv, f1_score,
            false_positive_rate, false_negative_rate, optimal_threshold]

# Initialize empty DataFrames for storing metrics
# Column layout of every metrics table
cols = ['Seed', 'Label', 'AUROC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1 Score', 'FPR', 'FNR', 'Youden_Threshold'] 

metric_age_groups = ['young', 'old']
metric_ratios = ['0', '25', '50', '75', '100']

# Rows are accumulated in plain lists and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
metric_rows = {(age_group, ratio): [] for age_group in metric_age_groups for ratio in metric_ratios}

# Loop through all seeds
for seed in predefined_seeds:
    for (age_group, ratio), rows in metric_rows.items():
        scores_df = pd.read_csv(parent + f"no_finding/results/age/analysis/unpickledcsv/{age_group}/aggregate_{seed}_{age_group}_{ratio}_pkl.csv")
        for label in labels:
            rows.append([seed, label] + calculate_metrics_and_auroc(scores_df, label))

# One table per (age group, ratio), rows ordered by seed and then by label
metrics_frames = {key: pd.DataFrame(rows, columns=cols) for key, rows in metric_rows.items()}

# Save the metrics DataFrames to CSV
for (age_group, ratio), metrics_frame in metrics_frames.items():
    metrics_frame.to_csv(parent + f"no_finding/results/age/analysis/csv/metrics_{age_group}_{ratio}.csv", index=False)

# Keep the individual variable names that this cell has always defined.
(metrics_young_0_df, metrics_young_25_df, metrics_young_50_df,
 metrics_young_75_df, metrics_young_100_df) = (metrics_frames[('young', ratio)] for ratio in metric_ratios)
(metrics_old_0_df, metrics_old_25_df, metrics_old_50_df,
 metrics_old_75_df, metrics_old_100_df) = (metrics_frames[('old', ratio)] for ratio in metric_ratios)

In [None]:
## Statistical analysis old vs young -> average, CI, and T-test, creation of aggregate_age_stats.csv
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel, t
from statsmodels.stats.multitest import multipletests

# Define paths and labels
parent = '/home/jkim/research/peds_cxr/'
labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

# Load the per-ratio metrics tables, young group first and then old group
metrics_csv_dir = parent + "no_finding/results/age/analysis/csv/"
metrics_csv_ratios = ('0', '25', '50', '75', '100')

(df_metrics_young_0, df_metrics_young_25, df_metrics_young_50,
 df_metrics_young_75, df_metrics_young_100) = [
    pd.read_csv(f"{metrics_csv_dir}metrics_young_{ratio}.csv") for ratio in metrics_csv_ratios
]
(df_metrics_old_0, df_metrics_old_25, df_metrics_old_50,
 df_metrics_old_75, df_metrics_old_100) = [
    pd.read_csv(f"{metrics_csv_dir}metrics_old_{ratio}.csv") for ratio in metrics_csv_ratios
]

# Function to calculate the mean and 95% CI for a given metric and label
def calc_mean_and_ci(metric_values):
    """Return ``(mean, (ci_lower, ci_upper))`` for a sample of metric values.

    The interval is the two-sided 95% Student-t confidence interval of the
    mean, using the sample standard deviation (ddof=1) and ``n - 1`` degrees
    of freedom.
    """
    n_values = len(metric_values)
    mean_value = np.mean(metric_values)

    # Standard error of the mean
    stderr = np.std(metric_values, ddof=1) / np.sqrt(n_values)

    # Two-sided 95% critical value of the t distribution
    t_critical = t.ppf((1 + 0.95) / 2., n_values - 1)
    ci = stderr * t_critical

    lower_bound = mean_value - ci
    upper_bound = mean_value + ci
    return mean_value, (lower_bound, upper_bound)

# Define a function to perform the comparisons and return a DataFrame with the results
def analyze_metrics(df_list, group_label, ratios=None):
    """Pairwise paired t-tests between metrics tables, with FDR correction.

    Parameters
    ----------
    df_list : list of metrics DataFrames (columns 'Label' plus one column per
        metric), one per training ratio.
    group_label : prefix used in the 'Comparison' text, e.g. 'young'.
    ratios : optional sequence with one ratio name per entry of ``df_list``,
        used to label the comparisons. Defaults to ``0, 50, 100, ...``
        (``index * 50``), which matches the original three-table usage.

    Returns
    -------
    DataFrame with columns 'Metric', 'Label', 'Comparison', 'p_value' and
    'p_value_corrected' (Benjamini-Hochberg over all rows of this call).

    Raises
    ------
    ValueError
        If ``ratios`` is given but its length differs from ``df_list``.

    Notes
    -----
    Reads the notebook-level ``labels`` list. Values are paired by row
    position after ``dropna()``; the tables are assumed to list seeds in the
    same order — TODO confirm if rows can be missing.
    """
    metrics_to_analyze = ['AUROC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1 Score', 'FPR', 'FNR']
    summary_cols = ['Metric', 'Label', 'Comparison', 'p_value']

    if ratios is None:
        ratios = [index * 50 for index in range(len(df_list))]
    elif len(ratios) != len(df_list):
        raise ValueError("ratios must have one entry per DataFrame in df_list")

    # Collect rows in a list and build the frame once (DataFrame.append was
    # removed in pandas 2.0).
    records = []
    for metric in metrics_to_analyze:
        for label in labels:
            for i in range(len(df_list) - 1):
                for j in range(i + 1, len(df_list)):
                    metric_values_i = df_list[i][df_list[i]['Label'] == label][metric].dropna()
                    metric_values_j = df_list[j][df_list[j]['Label'] == label][metric].dropna()

                    if len(metric_values_i) > 0 and len(metric_values_j) > 0:
                        _, p_value = ttest_rel(metric_values_i, metric_values_j)
                        records.append({
                            'Metric': metric,
                            'Label': label,
                            'Comparison': f'{group_label}_{ratios[i]} vs {group_label}_{ratios[j]}',
                            'p_value': p_value
                        })

    stats_df = pd.DataFrame(records, columns=summary_cols)

    # Nothing to correct when no comparison could be made
    if stats_df.empty:
        stats_df['p_value_corrected'] = pd.Series(dtype=float)
        return stats_df

    # Apply the Benjamini-Hochberg correction for multiple comparisons
    p_values = stats_df['p_value'].astype(float).to_numpy()
    _, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
    stats_df['p_value_corrected'] = pvals_corrected

    return stats_df

# Perform analyses for young and old
# Compare the 0%, 50% and 100% tables within each age group
young_tables = [df_metrics_young_0, df_metrics_young_50, df_metrics_young_100]
old_tables = [df_metrics_old_0, df_metrics_old_50, df_metrics_old_100]
stats_df_young = analyze_metrics(young_tables, 'young')
stats_df_old = analyze_metrics(old_tables, 'old')

# Write the p-value tables next to the metrics CSVs
pvalue_csv_dir = parent + "no_finding/results/age/analysis/csv/"
stats_df_young.to_csv(pvalue_csv_dir + "aggregate_young_pvalue.csv", index=False)
stats_df_old.to_csv(pvalue_csv_dir + "aggregate_old_pvalue.csv", index=False)

# Load CSV files and calculate statistics
def process_datasets(parent_path, age_prefix, ratios):
    """Summarise each metric as mean and 95% CI per ratio and label.

    Parameters
    ----------
    parent_path : base directory; the metrics tables are read from
        ``parent_path + "no_finding/results/age/analysis/csv/metrics_<age_prefix>_<ratio>.csv"``.
    age_prefix : age group used in the file names and in the 'Age' column.
    ratios : iterable of ratio names used in the file names.

    Returns
    -------
    DataFrame with columns 'Age', 'ratio', 'Label', 'Metric', 'Mean',
    'CI_Lower', 'CI_Upper'; combinations with no non-null values are omitted.
    """
    summary_cols = ['Age', 'ratio', 'Label', 'Metric', 'Mean', 'CI_Lower', 'CI_Upper']
    labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]
    metrics_to_analyze = ['AUROC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1 Score', 'FPR', 'FNR']

    # Collect rows in a list and build the frame once (DataFrame.append was
    # removed in pandas 2.0).
    records = []
    for ratio in ratios:
        # Read from the directory that was passed in; the parameter used to be
        # ignored in favour of the notebook-level `parent`.
        df_metrics = pd.read_csv(parent_path + f"no_finding/results/age/analysis/csv/metrics_{age_prefix}_{ratio}.csv")
        for label in labels:
            for metric in metrics_to_analyze:
                metric_values = df_metrics[df_metrics['Label'] == label][metric].dropna()
                if not metric_values.empty:
                    mean_value, (ci_lower, ci_upper) = calc_mean_and_ci(metric_values)
                    records.append({
                        'Age': age_prefix,
                        'ratio': ratio,
                        'Label': label,
                        'Metric': metric,
                        'Mean': mean_value,
                        'CI_Lower': ci_lower,
                        'CI_Upper': ci_upper
                    })

    return pd.DataFrame(records, columns=summary_cols)

# Define paths and ratios
# Define paths and ratios
parent_path = '/home/jkim/research/peds_cxr/'
ratios = ['0', '25', '50', '75', '100']

# Process datasets for Young and Old.
# parent_path (defined above) is now what gets passed; it holds the same
# directory as `parent`, which the calls used before.
# NOTE: these names were bound to the p-value tables earlier in this cell and
# are rebound to the mean/CI summaries here.
stats_df_young = process_datasets(parent_path, 'young', ratios)
stats_df_old = process_datasets(parent_path, 'old', ratios)

# Save the summary DataFrames to CSV
stats_df_young.to_csv(parent_path + "no_finding/results/age/analysis/csv/aggregate_young_stats.csv", index=False)
stats_df_old.to_csv(parent_path + "no_finding/results/age/analysis/csv/aggregate_old_stats.csv", index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import pandas as pd
import numpy as np
from matplotlib.ticker import FormatStrFormatter
# Assuming 'parent' is defined as your base path
parent = '/home/jkim/research/peds_cxr/'

# Load your datasets for the old and young age groups
# Load the 0% and 100% metrics tables and the p-value tables for both age groups
metrics_old_0 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_0.csv') 
metrics_old_100 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_100.csv')
aggregate_old_pvalue = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/aggregate_old_pvalue.csv')

metrics_young_0 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_0.csv') 
metrics_young_100 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_100.csv')
aggregate_young_pvalue = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/aggregate_young_pvalue.csv')

# Combine metrics into two DataFrames with a condition column
metrics_old_0['Condition'] = 'old_0'
metrics_old_100['Condition'] = 'old_100'
combined_metrics_old = pd.concat([metrics_old_0, metrics_old_100])

metrics_young_0['Condition'] = 'young_0'
metrics_young_100['Condition'] = 'young_100'
combined_metrics_young = pd.concat([metrics_young_0, metrics_young_100])

# Metrics to plot, label order on the x axis, and the two box colours
unique_metrics = ['AUROC', 'Sensitivity', 'Specificity', 'FPR', 'FNR', 'PPV', 'NPV']
unique_labels = metrics_old_0['Label'].unique() 
neutral_palette = ["#b5d1ae", "#122740"]


def _corrected_pvalues(pvalue_df, metric, plot_labels, comparison):
    """Return, per label, the `p_value_corrected` of `comparison` for `metric`.

    The first matching row is used; labels without a matching row get NaN.
    """
    pvalues = []
    for plot_label in plot_labels:
        matching_rows = pvalue_df[(pvalue_df['Metric'] == metric) &
                                  (pvalue_df['Label'] == plot_label) &
                                  (pvalue_df['Comparison'].str.contains(comparison))]
        if matching_rows.empty:
            pvalues.append(np.nan)
        else:
            pvalues.append(matching_rows['p_value_corrected'].values[0])
    return pvalues


for metric in unique_metrics:
    # Create a subplot with 2 rows and 1 column for each metric
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 16), sharex=True, sharey=True)

    ######################### Plot for the younger age group
    sns.boxplot(x='Label', y=metric, hue='Condition', data=combined_metrics_young, ax=ax1, palette=neutral_palette, order=unique_labels, width=0.4)

    # Corrected p-values of the 0% vs 100% comparison, one per label
    p_values_young = _corrected_pvalues(aggregate_young_pvalue, metric, unique_labels, 'young_0 vs young_100')

    # Annotate each label's (0%, 100%) pair of boxes with the precomputed p-value
    box_pairs_young = [((label, 'young_0'), (label, 'young_100')) for label in unique_labels]
    add_stat_annotation(ax1, data=combined_metrics_young, x='Label', y=metric, hue='Condition',
                        box_pairs=box_pairs_young, perform_stat_test=False, pvalues=p_values_young, 
                        test_short_name='Custom', loc='inside', verbose=2, 
                        pvalue_thresholds=[(0.001, '***'), (0.01, '**'), (0.05, '*'), (1, 'ns')])

    # Adjustments to ax1
    ax1.set_title(f'{metric} Testing in 100% Nonschool-Age (Age 0-5) Images', fontsize=20, fontweight='bold')
    ax1.set_ylabel(metric, fontsize=16, fontweight='bold')
    ax1.grid(True, linestyle='--', linewidth=0.5, color='gray', axis='y')
    ax1.tick_params(axis='y', labelsize=14)
    ax1.get_legend().remove()
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    ####################################### Plot for the older age group
    sns.boxplot(x='Label', y=metric, hue='Condition', data=combined_metrics_old, ax=ax2, palette=neutral_palette, order=unique_labels, width=0.4)

    old_box_pairs = [((label, 'old_0'), (label, 'old_100')) for label in unique_labels]    
    p_values_old = _corrected_pvalues(aggregate_old_pvalue, metric, unique_labels, 'old_0 vs old_100')

    add_stat_annotation(ax2, data=combined_metrics_old, x='Label', y=metric, hue='Condition',
                        box_pairs=old_box_pairs, perform_stat_test=False, pvalues=p_values_old, 
                        test_short_name='Custom', loc='inside', verbose=2, 
                        pvalue_thresholds=[(0.001, '***'), (0.01, '**'), (0.05, '*'), (1, 'ns')])

    # Adjustments to ax2
    ax2.set_title(f'{metric} Testing in 100% School-Age (Age 6-17) Images', fontsize=20, fontweight='bold')
    ax2.set_xlabel('Labels', fontsize=16, fontweight='bold')
    ax2.set_ylabel(metric, fontsize=16, fontweight='bold')
    ax2.tick_params(axis='x', labelsize=14, rotation=45)
    ax2.tick_params(axis='y', labelsize=14)
    ax2.grid(True, linestyle='--', linewidth=0.5, color='gray', axis='y')
    ax2.get_legend().remove()
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    # Rebuild a single legend on ax1. The legend entries get their own names so
    # the notebook-level `labels` list (the pathology names read by other
    # cells) is not overwritten.
    legend_handles, legend_labels = ax1.get_legend_handles_labels()  
    new_labels = ['0% School-age' if legend_label == 'young_0' else '100% School-age' for legend_label in legend_labels]
    ax1.legend(legend_handles, new_labels, title="Legend", loc='upper left', bbox_to_anchor=(1, 1))

    # Adjust layout
    plt.tight_layout()
    plt.subplots_adjust(hspace=0.15)  # Adjust the space between the subplots

    # Save one JPEG per metric, then show and release the figure
    plot_file_path = parent + f'no_finding/results/age/analysis/plot/age/{metric}_combined_age_analysis.jpeg'
    plt.savefig(plot_file_path, format='jpeg')
    plt.show()  # Show the plot
    plt.close(fig)  # Close the plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import pandas as pd
import numpy as np
from matplotlib.ticker import FormatStrFormatter

# Per-label box plots for the metrics_old_* tables: one figure per
# (label, metric) pair, with one box per training condition and significance
# brackets taken from the precomputed corrected p-values.

# Output directory for the figures saved below
os.makedirs(parent + 'no_finding/results/age/analysis/plot/old/', exist_ok=True)

# Load datasets: one metrics table per condition plus the p-value table
metrics_old_0 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_0.csv')
metrics_old_25 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_25.csv')
metrics_old_50 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_50.csv')
metrics_old_75 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_75.csv')
metrics_old_100 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_old_100.csv')
aggregate_old_pvalue = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/aggregate_old_pvalue.csv')

# Combine all metrics into a single DataFrame, tagging each row with the
# condition of the table it came from
datasets = [metrics_old_0, metrics_old_25, metrics_old_50, metrics_old_75, metrics_old_100]
conditions = ['0', '25', '50', '75', '100']
for dataset, condition in zip(datasets, conditions):
    dataset['Condition'] = condition
combined_metrics = pd.concat(datasets)

# Unique metrics and labels
unique_metrics = ['AUROC', 'Sensitivity', 'Specificity', 'FPR', 'FNR', 'PPV', 'NPV']
unique_labels = metrics_old_0['Label'].unique()
neutral_palette = ["#b5d1ae", "#80ae9a", "#568b87", "#1b485e", "#122740"]

# Comparisons to annotate. `box_pairs` names the boxes on the x axis; `pairs`
# holds, in the same order, the group names used to build the
# '<a> vs <b>' strings matched against the 'Comparison' column.
box_pairs = [('0', '100'), ('50', '100'), ('0', '50')]
pairs = [('old_0', 'old_100'), ('old_50', 'old_100'), ('old_0', 'old_50')]
star_thresholds = [(0.001, '***'), (0.01, '**'), (0.05, '*'), (1, 'ns')]

# Create one plot per label and metric
for label in unique_labels:
    # These selections do not depend on the metric, so compute them once per label
    label_metrics = combined_metrics[combined_metrics['Label'] == label]
    label_pvalues = aggregate_old_pvalue[aggregate_old_pvalue['Label'] == label]
    safe_label = label.replace('/', '_')  # '/' would otherwise split the file name into directories

    for metric in unique_metrics:
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.boxplot(x='Condition', y=metric, data=label_metrics, ax=ax,
                    palette=neutral_palette, order=conditions, width=0.8)

        # Title and labels (the y label is intentionally left blank, as before)
        ax.set_title(f'{label}', fontsize=22, fontweight='bold')
        ax.set_xlabel('Percent School-Age in Training Data', fontsize=18, fontweight='bold')
        ax.set_ylabel('', fontsize=18, fontweight='bold')
        ax.tick_params(axis='y', labelsize=18)
        ax.tick_params(axis='x', labelsize=18)
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

        # Look up the corrected p-value of every comparison; NaN marks a
        # comparison that has no row in the p-value table.
        metric_pvalues = label_pvalues[label_pvalues['Metric'] == metric]
        p_values = []
        for first, second in pairs:
            matches = metric_pvalues[metric_pvalues['Comparison'] == f"{first} vs {second}"]
            p_values.append(matches['p_value_corrected'].values[0] if not matches.empty else np.nan)

        # Only annotate comparisons that actually have a p-value: a NaN would
        # otherwise be drawn as a bracket with no significance label.
        known = [(box_pair, p) for box_pair, p in zip(box_pairs, p_values) if not pd.isna(p)]
        if len(known) < len(box_pairs):
            print(f'{label} / {metric}: no corrected p-value for '
                  f'{len(box_pairs) - len(known)} comparison(s); not annotated')

        # statannot cannot be called with an empty list of pairs, so skip it
        # entirely when nothing is left to annotate.
        if known:
            # `order` mirrors the seaborn call so the brackets are placed on
            # the same box positions seaborn used.
            add_stat_annotation(ax, data=label_metrics, x='Condition', y=metric, order=conditions,
                                box_pairs=[box_pair for box_pair, _ in known],
                                perform_stat_test=False, pvalues=[p for _, p in known],
                                test_short_name='Custom', loc='inside', verbose=2,
                                pvalue_thresholds=star_thresholds, fontsize=20)

        # Adjust layout, save, display, then release the figure
        plt.tight_layout()
        plt.savefig(parent + f'no_finding/results/age/analysis/plot/old/{metric}_{safe_label}_old_age.jpeg', format='jpeg')
        plt.show()
        plt.close(fig)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import pandas as pd
import numpy as np
from matplotlib.ticker import FormatStrFormatter

# Per-label box plots for the metrics_young_* tables: one figure per
# (label, metric) pair, with one box per training condition and significance
# brackets taken from the precomputed corrected p-values.

# Load datasets: one metrics table per condition plus the p-value table
metrics_young_0 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_0.csv')
metrics_young_25 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_25.csv')
metrics_young_50 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_50.csv')
metrics_young_75 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_75.csv')
metrics_young_100 = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/metrics_young_100.csv')
aggregate_young_pvalue = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/aggregate_young_pvalue.csv')

# Output directory for the figures saved below
os.makedirs(parent + 'no_finding/results/age/analysis/plot/young/', exist_ok=True)

# Combine all metrics into a single DataFrame, tagging each row with the
# condition of the table it came from
datasets = [metrics_young_0, metrics_young_25, metrics_young_50, metrics_young_75, metrics_young_100]
conditions = ['0', '25', '50', '75', '100']
for dataset, condition in zip(datasets, conditions):
    dataset['Condition'] = condition
combined_metrics_young = pd.concat(datasets)

# Unique metrics and labels
unique_metrics = ['AUROC', 'Sensitivity', 'Specificity', 'FPR', 'FNR', 'PPV', 'NPV']
unique_labels = metrics_young_0['Label'].unique()
neutral_palette = ["#b5d1ae", "#80ae9a", "#568b87", "#1b485e", "#122740"]

# Comparisons to annotate. `box_pairs` names the boxes on the x axis; `pairs`
# holds, in the same order, the group names used to build the
# '<a> vs <b>' strings matched against the 'Comparison' column.
box_pairs = [('0', '100'), ('50', '100'), ('0', '50')]
pairs = [('young_0', 'young_100'), ('young_50', 'young_100'), ('young_0', 'young_50')]
star_thresholds = [(0.001, '***'), (0.01, '**'), (0.05, '*'), (1, 'ns')]

# Create one plot per label and metric
for label in unique_labels:
    # These selections do not depend on the metric, so compute them once per label
    label_metrics = combined_metrics_young[combined_metrics_young['Label'] == label]
    label_pvalues = aggregate_young_pvalue[aggregate_young_pvalue['Label'] == label]
    safe_label = label.replace('/', '_')  # '/' would otherwise split the file name into directories

    for metric in unique_metrics:
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.boxplot(x='Condition', y=metric, data=label_metrics, ax=ax,
                    palette=neutral_palette, order=conditions, width=0.8)

        # Title and labels (the y label is intentionally left blank, as before)
        ax.set_title(f'{label}', fontsize=22, fontweight='bold')
        # NOTE(review): the combined-plot cell earlier in this notebook maps
        # 'young_0' to '0% School-age'. If the condition numbers here also
        # count school-age images, this axis label is inverted -- TODO confirm.
        ax.set_xlabel('Percent Nonschool-Age in Training Data', fontsize=16, fontweight='bold')
        ax.set_ylabel('', fontsize=18, fontweight='bold')
        ax.tick_params(axis='y', labelsize=18)
        ax.tick_params(axis='x', labelsize=18)
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

        # Look up the corrected p-value of every comparison; NaN marks a
        # comparison that has no row in the p-value table.
        metric_pvalues = label_pvalues[label_pvalues['Metric'] == metric]
        p_values = []
        for first, second in pairs:
            matches = metric_pvalues[metric_pvalues['Comparison'] == f"{first} vs {second}"]
            p_values.append(matches['p_value_corrected'].values[0] if not matches.empty else np.nan)

        # Only annotate comparisons that actually have a p-value: a NaN would
        # otherwise be drawn as a bracket with no significance label.
        known = [(box_pair, p) for box_pair, p in zip(box_pairs, p_values) if not pd.isna(p)]
        if len(known) < len(box_pairs):
            print(f'{label} / {metric}: no corrected p-value for '
                  f'{len(box_pairs) - len(known)} comparison(s); not annotated')

        # statannot cannot be called with an empty list of pairs, so skip it
        # entirely when nothing is left to annotate.
        if known:
            # `order` mirrors the seaborn call so the brackets are placed on
            # the same box positions seaborn used.
            add_stat_annotation(ax, data=label_metrics, x='Condition', y=metric, order=conditions,
                                box_pairs=[box_pair for box_pair, _ in known],
                                perform_stat_test=False, pvalues=[p for _, p in known],
                                test_short_name='Custom', loc='inside', verbose=2,
                                pvalue_thresholds=star_thresholds, fontsize=20)

        # Adjust layout, save, display, then release the figure
        plt.tight_layout()
        plt.savefig(parent + f'no_finding/results/age/analysis/plot/young/{metric}_{safe_label}_young_age.jpeg', format='jpeg')
        plt.show()
        plt.close(fig)
