In [None]:
# Training 25 aggregate models 
import pandas as pd 
import os 
import numpy as np
import subprocess

# File paths and setup
parent = '/home/jkim/research/peds_cxr/'
# Predefined 25 random seeds
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]

### set source
metadata_source = parent + 'no_finding/metadata/aggregate_age_nf/' 

for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        image_source = parent + f'no_finding/images/age/{seed}/age_{five}%_seed{seed}'
        # Load the data from the CSV file into a pandas DataFrame
        aggregate_metadata_path = metadata_source + f'aggregate_age_nf_seed_{seed}/age_{five}_seed_{seed}.csv'
        aggregate_metadata = pd.read_csv(aggregate_metadata_path)

        # Creating image_paths.txt
        # Filter rows where 'Set' column is not "not assigned"
        filtered_data = aggregate_metadata[aggregate_metadata['Set'] != "not assigned"]
        image_path = filtered_data['Image Index']
        output_image_txt = image_source + '/image_paths.txt'
        image_path.to_csv(output_image_txt, sep=',', index=False, header=False)

        # Creating train.txt 
        train_index = aggregate_metadata[aggregate_metadata['Set']=='train']
        aggregate_train_output = image_source + '/train.txt'
        aggregate_train_txt = train_index['Image Index'] 
        aggregate_train_txt.to_csv(aggregate_train_output, sep=',', index=False, header=False)

        # Creating val.txt 
        val_index = aggregate_metadata[aggregate_metadata['Set']=='val']
        aggregate_val_output = image_source + '/val.txt'
        aggregate_val_txt = val_index['Image Index'] 
        aggregate_val_txt.to_csv(aggregate_val_output, sep=',', index=False, header=False)

        # Creating test.txt
        test_index = aggregate_metadata[aggregate_metadata['Set']=='test']
        aggregate_test_output = image_source + '/aggregate_test.txt'
        aggregate_test_txt = test_index['Image Index'] 
        aggregate_test_txt.to_csv(aggregate_test_output, sep=',', index=False, header=False)

        # Creating labels.csv
        aggregate_labels_output = image_source + '/labels.csv'
        aggregate_labels = filtered_data[['Image Index', 'No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']]
        # rename Image Index to Image
        aggregate_labels = aggregate_labels.rename(columns={'Image Index': 'Image'})
        aggregate_labels.to_csv(aggregate_labels_output, sep=',', index=False)

        # confirming the number of train val and test. 
        print(f'age, seed: {seed}, {five}%')
        print("# of train:",len(aggregate_train_txt))
        print('# of val:',len(aggregate_val_txt))
        print('# of test:',len(aggregate_test_txt))
        print('# of image path:',len(image_path))
        print('# of labels:',len(aggregate_labels))


In [None]:
# Create json file. 

import json
import os

# Loop through each seed to generate JSON files
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        json_template = {
        "data_dir": f"/home/jkim/research/peds_cxr/no_finding/images/age/{seed}/",
        "labels": ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule' , 'Pneumonia'],
        "dataset_nicenames": {
            "nihcxr14_test": "NIH CXR14",
            "chexpert_test": "CheXpert",
            "mimic_test": "MIMIC",
            "padchest_test": "PadChest",
            "rsna_pneumonia_test_100": "RSNA Pneumonia",
            "rsna_pneumonia_DeiT-B_test_100": "RSNA Pneumonia DeiT-B",
            "Aggregate_DenseNet121_aggregate_test": "aggregate DenseNet121",
            "Aggregate_DenseNet121_aggregate_test": "Aggregate DenseNet121"
            }
        }
        
        # Construct the new directory path
        new_data_dir = f"/home/jkim/research/peds_cxr/no_finding/images/age/{seed}/age_{five}%_seed{seed}/"
        
        # Create the new directory if it doesn't exist
        if not os.path.exists(new_data_dir):
            print(f"The directory {new_data_dir} does not exist. Creating it.")
            os.makedirs(new_data_dir)

        # Specify the path for the new JSON file
        new_json_path = os.path.join(new_data_dir, f"cfg_aggregate_{seed}.json")

        # Write the JSON file
        with open(new_json_path, 'w') as f:
            json.dump(json_template, f, indent=4)
        
        print(f"Created new JSON file at {new_json_path}")

# batch creating result directiory
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        # Construct the new directory path for results
        new_result_dir = f"/home/jkim/research/peds_cxr/no_finding/results/age/train/{seed}/age_{five}%_seed{seed}/"
        
        # Create the new directory if it doesn't exist
        if not os.path.exists(new_result_dir):
            os.makedirs(new_result_dir)

In [None]:

#training 25 aggregate models
optimizers = ['SGD']
initial_lrs = [0.01]
weight_decays = [1e-5]
dropouts = [0.5]
gpu='0' 

parent = '/home/jkim/research/peds_cxr/'
import subprocess
    
# Iterate over the hyperparameters
for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:    
        for optimizer in optimizers:
            for initial_lr in initial_lrs:
                for weight_decay in weight_decays:
                    for dropout in dropouts:
                        # Construct the command with updated hyperparameters
                        command = [
                            'python3',
                            parent + 'transformer-radiographs/train_cxr.py',
                            '--cfg-dir',
                            parent + f'no_finding/images/age/{seed}/age_{five}%_seed{seed}/cfg_aggregate_{seed}.json',
                            '--dataset',
                            f'age_{five}%_seed{seed}',
                            '--labels-set',
                            'labels',
                            '--architecture',
                            'DenseNet121',
                            '--results-dir',
                            parent + f'no_finding/results/age/train/{seed}/age_{five}%_seed{seed}/',
                            '--optimizer-family',
                            optimizer,
                            '--dropout',
                            str(dropout),
                            '--weight-decay',
                            str(weight_decay),
                            '--initial-lr',
                            str(initial_lr),
                            '--drop-factor',
                            '0.1',
                            '--plateau-patience',
                            '3',
                            '--plateau-threshold',
                            '1e-4',
                            '--break-patience',
                            '5',
                            '--train-transform',
                            'peds',
                            '--train-file',
                            'train.txt',
                            '--val-file',
                            'val.txt',
                            '--use-gpus',
                            gpu
                        ]

                        # Execute the command
                        subprocess.run(command)

In [8]:
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]


# testing 25 aggregate models - DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_1705144996.txt
import subprocess
import pandas as pd 
import os
import glob
# batch creating result directiory
for seed in predefined_seeds:
    for dir in ['all', 'young', 'old']:  
        for five in ['0', '25', '50', '75', '100']:
            # Construct the new directory path for results
            new_result_dir = parent + f"no_finding/results/age/test/{dir}/{seed}/age_{five}_{seed}_{dir}/"

            # Create the new directory if it doesn't exist
            if not os.path.exists(new_result_dir):
                os.makedirs(new_result_dir)

for seed in predefined_seeds:
    for five in ['0', '25', '50', '75', '100']:
        aggregate_metadata_path = parent + f'no_finding/metadata/aggregate_age_nf/aggregate_age_nf_seed_{seed}/age_{five}_seed_{seed}.csv'
        aggregate_metadata = pd.read_csv(aggregate_metadata_path)

        # Creating aggregate_test_young / old .txt
        ages = ['young', 'old']
        for age in ages:
            test_index = aggregate_metadata[(aggregate_metadata['Set'] == 'test') & (aggregate_metadata['Age Group'] == age)]
            image_source = parent + f'no_finding/images/age/{seed}/age_{five}%_seed{seed}/'
            aggregate_test_output = image_source + 'aggregate_test_'+ age +'.txt'
            aggregate_test_txt = test_index['Image Index']
            aggregate_test_txt.to_csv(aggregate_test_output, sep=',', index=False, header=False)

In [None]:
# testing 25 aggregate models - DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_1705144996.txt
import os
import subprocess
import pandas as pd 
import glob


ages = ['old', 'young']
# Testing old vs young
for age in ages:
    for seed in predefined_seeds:
        for five in ['0', '25', '50', '75', '100']:
        # Build the pattern to search for model files
            search_pattern = os.path.join(parent, f"no_finding/results/age/train/{seed}/age_{five}%_seed{seed}/DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_*_model.pt")
            
            # Use glob to find the file
            matching_files = glob.glob(search_pattern)
            
            # Check if any matching files are found
            if not matching_files:
                print(f"No matching model files found for seed {seed}.")
                continue  # Skip to the next iteration

            # Assuming there's only one matching file, take the first one
            model_file_path = matching_files[0]

            image_source = parent + f"no_finding/images/age/{seed}/age_{five}%_seed{seed}/"
            command = [
                "python3",
                parent + "transformer-radiographs/test_cxr.py",
                "--cfg-dir", image_source + f"cfg_aggregate_{seed}.json",
                "--dataset", f'age_{five}%_seed{seed}',
                "--labels-set", "labels",
                "--model-state", model_file_path,
                "--model-type", "DenseNet121",
                "--results-dir", parent + f"no_finding/results/age/test/{age}/{seed}/age_{five}_{seed}_{age}/",
                "--test-file", image_source + "aggregate_test_" + age + ".txt",
                "--use-gpus", gpu
        ]

            # Run the command
            try:
                subprocess.run(command, check=True)
            except subprocess.CalledProcessError as e:
                print(f"An error occurred while executing the command: {e}")

In [10]:
import pickle
import pandas as pd
import os 
import glob

# setting directories
parent = '/home/jkim/research/peds_cxr/'

ages = ['old','young']
age_file_name = ['_old', '_young']
for age, age_name in zip(ages, age_file_name):
    for seed in predefined_seeds:
        for five in ['0', '25', '50', '75', '100']:       
            # Build the pattern to search for model files
                search_pattern = os.path.join(parent, f"no_finding/results/age/test/{age}/{seed}/age_{five}_{seed}_{age}/DenseNet121_lr0.01_bs16_optSGD_wd1e-05_sch_step_pp3_bp5_trtrain.txt_vaval.txt_tfpeds_nlbatch_do0.5_*.pkl")

                # Use glob to find the file
                matching_files = glob.glob(search_pattern)
                
                # Check if any matching files are found
                if not matching_files:
                    print(f"No matching model files found for seed {seed}.")
                    continue  # Skip to the next iteration

                pkl_directory = matching_files[0]

                directory = parent + f'no_finding/results/age/unpickle/{age}/'
                if not os.path.exists(directory):
                    os.makedirs(directory)

                # Open the pickle file and load the data
                with open(pkl_directory, "rb") as f:
                    data = pickle.load(f)

                # Extract the nested dictionary
                nested_dict = data.get(f'age_{five}%_seed{seed}_DenseNet121_aggregate_test_{age}', {})

                # Create a DataFrame from the nested dictionary with keys 'y', 'yhat', and 'file'
                df_nested = pd.DataFrame({
                    "y": nested_dict["y"],
                    "yhat": nested_dict["yhat"],
                    "file": nested_dict["file"]
                })

                # Define the file path
                nested_file_path = directory + f'aggregate_{seed}_{age}_{five}_pkl.csv'

                # Save the DataFrame to a CSV file
                df_nested.to_csv(nested_file_path, index=False)


In [11]:
# Saving Pkl file as csv file 

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
import ast
import os
import matplotlib.pyplot as plt

# Changing pickle file from dictionary format to list format 
ages = ['old', 'young']

# create AUC directory in results
if not os.path.exists(parent + 'no_finding/results/age/analysis/'):
    os.makedirs(parent + 'no_finding/results/age/analysis/')
if not os.path.exists(parent + 'no_finding/results/age/analysis/plot/age'):
    os.makedirs(parent + 'no_finding/results/age/analysis/plot/age')
if not os.path.exists(parent + 'no_finding/results/age/analysis/csv'):
    os.makedirs(parent + 'no_finding/results/age/analysis/csv')

for age in ages: 
    os.makedirs(parent + f'no_finding/results/age/analysis/unpickledcsv/{age}', exist_ok=True)  

for seed in predefined_seeds:        
    for age in ages: 
        for five in ['0', '25', '50', '75', '100']:
            # set directories
            pkl_dir = parent + f'no_finding/results/age/unpickle/{age}/aggregate_{seed}_{age}_{five}_pkl.csv'

            # Loading the pkl file
            aggregate_pkl = pd.read_csv(pkl_dir)

            # Defining the aggregate_label as the list of labels
            aggregate_label = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

            # Converting the string representation of lists to actual lists
            aggregate_pkl['y'] = aggregate_pkl['y'].apply(ast.literal_eval)
            aggregate_pkl['yhat'] = aggregate_pkl['yhat'].apply(ast.literal_eval)

            # Splitting the one-hot encoded labels and predictions into separate columns
            y_true_df = pd.DataFrame(aggregate_pkl['y'].tolist(), columns=aggregate_label)
            y_pred_df = pd.DataFrame(aggregate_pkl['yhat'].tolist(), columns=aggregate_label)

            # Merging the true labels and predicted probabilities based on index
            result_df = pd.concat([y_true_df, y_pred_df.add_suffix('_pred')], axis=1)

            # Saving
            result_df.to_csv(parent + f'no_finding/results/age/analysis/unpickledcsv/{age}/aggregate_{seed}_{age}_{five}_pkl.csv', index=False)


In [None]:
# stats and plotting for all aggregate data
import os
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix

parent = '/home/jkim/research/peds_cxr/'
predefined_seeds = [2358, 4563, 7894, 13289, 15892, 19756, 23890, 27456, 30890, 34905, 38764, 42136, 46578, 50349, 54892, 59871, 63912, 68354, 72109, 76845, 80492, 84910, 89357, 93450, 97910]

# Initialize variables
labels = ["No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]

def calculate_metrics_and_auroc(df, label):
    true_labels = df[label]
    predicted_scores = df[f"{label}_pred"]
    
    # Calculate AUROC
    auroc = roc_auc_score(true_labels, predicted_scores)

    # Calculate optimal threshold using Youden's Index
    fpr, tpr, thresholds = roc_curve(true_labels, predicted_scores)
    youden_index = tpr - fpr
    optimal_idx = np.argmax(youden_index)
    optimal_threshold = thresholds[optimal_idx]

    # Other metrics
    predicted_labels = (predicted_scores >= optimal_threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    ppv = tp / (tp + fp) if tp + fp != 0 else 0
    npv = tn / (tn + fn) if tn + fn != 0 else 0
    f1_score = 2 * (ppv * sensitivity) / (ppv + sensitivity) if ppv + sensitivity != 0 else 0
    fpr = fp / (fp + tn)  
    fnr = fn / (fn + tp)
    
    return [auroc, sensitivity, specificity, ppv, npv, f1_score, fpr, fnr, optimal_threshold]

# Initialize empty DataFrames for storing metrics
cols = ['Seed', 'Label', 'AUROC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1 Score', 'FPR', 'FNR', 'Youden_Threshold'] 
metrics_dfs = {
    "young_0": pd.DataFrame(columns=cols),
    "young_25": pd.DataFrame(columns=cols),
    "young_50": pd.DataFrame(columns=cols),
    "young_75": pd.DataFrame(columns=cols),
    "young_100": pd.DataFrame(columns=cols),
    "old_0": pd.DataFrame(columns=cols),
    "old_25": pd.DataFrame(columns=cols),
    "old_50": pd.DataFrame(columns=cols),
    "old_75": pd.DataFrame(columns=cols),
    "old_100": pd.DataFrame(columns=cols),
}

# Loop through subgroups ('young', 'old') and seeds
for subgroup in ["young", "old"]:
    for seed in predefined_seeds:
        dfs = {
            "young_0": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/young/aggregate_{seed}_young_0_pkl.csv"),
            "young_25": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/young/aggregate_{seed}_young_25_pkl.csv"),
            "young_50": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/young/aggregate_{seed}_young_50_pkl.csv"),
            "young_75": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/young/aggregate_{seed}_young_75_pkl.csv"),
            "young_100": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/young/aggregate_{seed}_young_100_pkl.csv"),
            "old_0": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/old/aggregate_{seed}_old_0_pkl.csv"),
            "old_25": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/old/aggregate_{seed}_old_25_pkl.csv"),
            "old_50": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/old/aggregate_{seed}_old_50_pkl.csv"),
            "old_75": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/old/aggregate_{seed}_old_75_pkl.csv"),
            "old_100": pd.read_csv(parent + f"no_finding/results/age_{subgroup}/analysis/unpickledcsv/old/aggregate_{seed}_old_100_pkl.csv"),
        }

        for key, df in dfs.items():
            label_metrics = []
            for label in labels:
                metrics = calculate_metrics_and_auroc(df, label)
                label_metrics.append(metrics)
                metrics_dfs[key] = metrics_dfs[key].append(pd.Series([seed, label] + metrics, index=cols), ignore_index=True)

            # Calculate and append the "All Label" metrics
            all_label_metrics = np.mean(label_metrics, axis=0).tolist()
            metrics_dfs[key] = metrics_dfs[key].append(pd.Series([seed, "All Label"] + all_label_metrics, index=cols), ignore_index=True)

    # Save the metrics DataFrames to CSV for each subgroup
    for key, df in metrics_dfs.items():
        df.to_csv(parent + f"no_finding/results/age_{subgroup}/analysis/csv/metrics_{key}.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: '/home/jkim/research/peds_cxr/no_finding/results/age_young/analysis/unpickledcsv/young/aggregate_2358_young_0_pkl.csv'

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import multipletests

# Define paths and labels
parent = '/home/jkim/research/peds_cxr/'
labels = ["All Label", "No Finding", "Cardiomegaly", "Consolidation", "Infiltration", "Mass/Nodule", "Pneumonia"]
metrics_to_analyze = ['AUROC', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'F1 Score', 'FPR', 'FNR']
ratios = ['0', '25', '50', '75', '100']

# File paths for Young and Old datasets
young_files = [f"{parent}no_finding/results/age/analysis/csv/metrics_young_{ratio}.csv" for ratio in ratios]
old_files = [f"{parent}no_finding/results/age/analysis/csv/metrics_old_{ratio}.csv" for ratio in ratios]

# Define function to perform comparisons and generate combined stats
def analyze_and_combine(young_files, old_files, ratios):
    stats_df = pd.DataFrame(columns=['Metric', 'Label', 'Comparison', 'p_value', 'p_value_corrected'])
    
    # Perform Young vs Old comparisons
    for ratio, y_file, o_file in zip(ratios, young_files, old_files):
        df_metrics_young = pd.read_csv(y_file)
        df_metrics_old = pd.read_csv(o_file)
        
        for metric in metrics_to_analyze:
            for label in labels:
                metric_values_young = df_metrics_young[df_metrics_young['Label'] == label][metric].dropna()
                metric_values_old = df_metrics_old[df_metrics_old['Label'] == label][metric].dropna()
                
                if len(metric_values_young) > 0 and len(metric_values_old) > 0:
                    _, p_value = ttest_rel(metric_values_young, metric_values_old)
                    stats_df = stats_df.append({
                        'Metric': metric,
                        'Label': label,
                        'Comparison': f'young_{ratio} vs old_{ratio}',
                        'p_value': p_value
                    }, ignore_index=True)
    
    # Perform cross-ratio comparisons (e.g., young_0 vs old_100 and young_100 vs old_0)
    cross_comparisons = [('0', '100'), ('100', '0')]
    for y_ratio, o_ratio in cross_comparisons:
        y_file = f"{parent}no_finding/results/age/analysis/csv/metrics_young_{y_ratio}.csv"
        o_file = f"{parent}no_finding/results/age/analysis/csv/metrics_old_{o_ratio}.csv"
        df_metrics_young = pd.read_csv(y_file)
        df_metrics_old = pd.read_csv(o_file)

        for metric in metrics_to_analyze:
            for label in labels:
                metric_values_young = df_metrics_young[df_metrics_young['Label'] == label][metric].dropna()
                metric_values_old = df_metrics_old[df_metrics_old['Label'] == label][metric].dropna()

                if len(metric_values_young) > 0 and len(metric_values_old) > 0:
                    _, p_value = ttest_rel(metric_values_young, metric_values_old)
                    stats_df = stats_df.append({
                        'Metric': metric,
                        'Label': label,
                        'Comparison': f'young_{y_ratio} vs old_{o_ratio}',
                        'p_value': p_value
                    }, ignore_index=True)

    # Perform within-group comparisons (e.g., Old_0 vs Old_50)
    def analyze_within_group(df_list, group_label):
        local_stats = pd.DataFrame(columns=['Metric', 'Label', 'Comparison', 'p_value'])
        
        for metric in metrics_to_analyze:
            for label in labels:
                for i in range(len(df_list) - 1):
                    for j in range(i + 1, len(df_list)):
                        metric_values_i = df_list[i][df_list[i]['Label'] == label][metric].dropna()
                        metric_values_j = df_list[j][df_list[j]['Label'] == label][metric].dropna()
                        
                        if len(metric_values_i) > 0 and len(metric_values_j) > 0:
                            _, p_value = ttest_rel(metric_values_i, metric_values_j)
                            comparison = f'{group_label}_{ratios[i]} vs {group_label}_{ratios[j]}'
                            local_stats = local_stats.append({
                                'Metric': metric,
                                'Label': label,
                                'Comparison': comparison,
                                'p_value': p_value
                            }, ignore_index=True)
        
        return local_stats

    # Load datasets for within-group comparisons
    young_datasets = [pd.read_csv(f) for f in young_files]
    old_datasets = [pd.read_csv(f) for f in old_files]

    stats_df_young = analyze_within_group(young_datasets, 'young')
    stats_df_old = analyze_within_group(old_datasets, 'old')
    
    # Combine results
    combined_stats = pd.concat([stats_df, stats_df_young, stats_df_old], ignore_index=True)
    
    # Apply Benjamini-Hochberg correction for multiple comparisons
    p_values = combined_stats['p_value'].astype(float).to_numpy()
    _, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
    combined_stats['p_value_corrected'] = pvals_corrected
    
    return combined_stats

# Analyze and combine results
final_stats_df = analyze_and_combine(young_files, old_files, ratios)

# Save the final combined CSV
final_stats_df.to_csv(parent + "no_finding/results/age/analysis/csv/final_pvalues.csv", index=False)

######################################################################

# Load CSV files and calculate statistics
def process_datasets(age_prefix, ratios):
    stats_df = pd.DataFrame(columns=['Age', 'ratio', 'Label', 'Metric', 'Mean', 'CI_Lower', 'CI_Upper'])
    
    for ratio in ratios:
        df_metrics = pd.read_csv(parent + f"no_finding/results/age/analysis/csv/metrics_{age_prefix}_{ratio}.csv")
        for label in labels:
            for metric in metrics_to_analyze:
                metric_values = df_metrics[df_metrics['Label'] == label][metric].dropna()
                if not metric_values.empty:
                    mean_value = metric_values.mean()
                    stderr = metric_values.sem()
                    ci = stderr * 1.96  # For 95% confidence interval
                    ci_lower = mean_value - ci
                    ci_upper = mean_value + ci
                    stats_df = stats_df.append({
                        'Age': age_prefix,
                        'ratio': ratio,
                        'Label': label,
                        'Metric': metric,
                        'Mean': mean_value,
                        'CI_Lower': ci_lower,
                        'CI_Upper': ci_upper
                    }, ignore_index=True)
    
    return stats_df

# Process datasets for Young and Old
stats_df_young = process_datasets('young', ratios)
stats_df_old = process_datasets('old', ratios)

# Save the summary DataFrames to CSV
stats_df_young.to_csv(parent + "no_finding/results/age/analysis/csv/aggregate_young_stats.csv", index=False)
stats_df_old.to_csv(parent + "no_finding/results/age/analysis/csv/aggregate_old_stats.csv", index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import pandas as pd
import numpy as np

# Define parent path
parent = '/home/jkim/research/peds_cxr/'

# Load datasets for the young and old age groups
ratios = ['0', '25', '50', '75', '100']
metrics_young = {ratio: pd.read_csv(parent + f'no_finding/results/age/analysis/csv/metrics_young_{ratio}.csv') for ratio in ratios}
metrics_old = {ratio: pd.read_csv(parent + f'no_finding/results/age/analysis/csv/metrics_old_{ratio}.csv') for ratio in ratios}
aggregate_young_vs_old_pvalue = pd.read_csv(parent + 'no_finding/results/age/analysis/csv/final_pvalues.csv')

# Define unique metrics and labels
unique_metrics = ['AUROC', 'Sensitivity', 'Specificity', 'FPR', 'FNR', 'PPV', 'NPV']
unique_labels = metrics_young['0']['Label'].unique()
neutral_palette = ["#b5d1ae", "#1b485e"]

# Iterate over each ratio and metric to create plots comparing young and old groups
for ratio in ratios:
    # Add a "Condition" column and combine metrics for the current ratio
    metrics_young[ratio]['Condition'] = 'Young'
    metrics_old[ratio]['Condition'] = 'Old'
    combined_metrics = pd.concat([metrics_young[ratio], metrics_old[ratio]])

    for metric in unique_metrics:
        # Create a single plot for each metric and ratio
        fig, ax = plt.subplots(figsize=(12, 8))

        # Plot the boxplots for young and old comparison
        sns.boxplot(x='Label', y=metric, hue='Condition', data=combined_metrics, ax=ax, palette=neutral_palette, order=unique_labels, width=0.4)

        # Calculate p-values for the current metric
        p_values = []
        for label in unique_labels:
            filtered_df = aggregate_young_vs_old_pvalue[
                (aggregate_young_vs_old_pvalue['Metric'] == metric) & 
                (aggregate_young_vs_old_pvalue['Label'] == label) & 
                (aggregate_young_vs_old_pvalue['Comparison'].str.contains(f'young_{ratio} vs old_{ratio}'))
            ]
            p_value = filtered_df['p_value_corrected'].values[0] if not filtered_df.empty else np.nan
            p_values.append(p_value)

        # Add statistical annotation for young vs old comparison
        box_pairs = [((label, 'Young'), (label, 'Old')) for label in unique_labels]
        add_stat_annotation(ax, data=combined_metrics, x='Label', y=metric, hue='Condition',
                            box_pairs=box_pairs, perform_stat_test=False, pvalues=p_values, 
                            test_short_name='Custom', loc='inside', verbose=2, 
                            pvalue_thresholds=[(0.001, '***'), (0.01, '**'), (0.05, '*'), (1, 'ns')])
        ax.set_title(f'{metric} Comparison Between Young and Old ({ratio}% Old in Training Set)', fontsize=20, fontweight='bold')
        ax.set_xlabel('Labels', fontsize=16, fontweight='bold')
        ax.set_ylabel(metric, fontsize=16, fontweight='bold')
        ax.tick_params(axis='x', labelsize=14, rotation=45)
        ax.tick_params(axis='y', labelsize=14)
        ax.grid(True, linestyle='--', linewidth=0.5, color='gray', axis='y')

        # Create a combined legend
        handles, labels = ax.get_legend_handles_labels()  
        new_labels = ['Young', 'Old']
        ax.legend(handles, new_labels, title="Legend", loc='upper left', bbox_to_anchor=(1, 1))

        # Adjust layout and save
        plt.tight_layout()
        plot_file_path = parent + f'no_finding/results/age/analysis/plot/age/{metric}_young_{ratio}_vs_old_{ratio}_analysis.jpeg'
        plt.savefig(plot_file_path, format='jpeg')
        plt.show()
        plt.close(fig)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator

# Load your datasets
parent = '/home/jkim/research/peds_cxr/'

file_paths = {
    "metrics_young_0": parent + 'no_finding/results/age/analysis/csv/metrics_young_0.csv',
    "metrics_young_25": parent + 'no_finding/results/age/analysis/csv/metrics_young_25.csv',
    "metrics_young_50": parent + 'no_finding/results/age/analysis/csv/metrics_young_50.csv',
    "metrics_young_75": parent + 'no_finding/results/age/analysis/csv/metrics_young_75.csv',
    "metrics_young_100": parent + 'no_finding/results/age/analysis/csv/metrics_young_100.csv',
    "metrics_old_0": parent + 'no_finding/results/age/analysis/csv/metrics_old_0.csv',
    "metrics_old_25": parent + 'no_finding/results/age/analysis/csv/metrics_old_25.csv',
    "metrics_old_50": parent + 'no_finding/results/age/analysis/csv/metrics_old_50.csv',
    "metrics_old_75": parent + 'no_finding/results/age/analysis/csv/metrics_old_75.csv',
    "metrics_old_100": parent + 'no_finding/results/age/analysis/csv/metrics_old_100.csv'
}

data = {name: pd.read_csv(path) for name, path in file_paths.items()}

# Combine all datasets into a single DataFrame for easier plotting
combined_data = pd.DataFrame()
for name, df in data.items():
    condition = name.split('_')[1]  # Either "young" or "old"
    ratio = name.split('_')[2]  
    df['Condition'] = 'Young' if condition == 'young' else 'Old'
    df['Group'] = condition  
    df['Ratio'] = ratio
    combined_data = pd.concat([combined_data, df], ignore_index=True)

# Ensure correct data types for plotting
combined_data['Ratio'] = pd.Categorical(combined_data['Ratio'], categories=['0', '25', '50', '75', '100'], ordered=True)

# Load p-values dataset
pvalues_file_path = parent + 'no_finding/results/age/analysis/csv/final_pvalues.csv'
pvalues_data = pd.read_csv(pvalues_file_path)

# Define labels and metric of interest
labels = ['All Label', 'No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']
label_names = ['All Label', 'No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'MassNodule', 'Pneumonia']
metric = 'AUROC'
neutral_palette = ["#b5d1ae", "#80ae9a", "#568b87", "#1b485e", "#122740"]

# Define the merged comparison pairs
comparison_pairs = [
    (("young", "100"), ("old", "0")),
    (("young", "50"), ("old", "50")),
    (("young", "0"), ("old", "100")),
    (("young", "0"), ("young", "50")),
    (("young", "50"), ("young", "100")),
    (("young", "0"), ("young", "100")),
    (("old", "0"), ("old", "50")),
    (("old", "50"), ("old", "100")),
    (("old", "0"), ("old", "100"))
]

for label, label_name in zip(labels, label_names):
    # Filter data for the current label
    filtered_data = combined_data[combined_data['Label'] == label]

    # Extract relevant p-values for the comparison pairs
    comparison_pvalues = []
    for pair in comparison_pairs:
        comparison = f"{pair[0][0]}_{pair[0][1]} vs {pair[1][0]}_{pair[1][1]}"
        p_value_row = pvalues_data[(pvalues_data['Label'] == label) &
                                   (pvalues_data['Metric'] == metric) &
                                   (pvalues_data['Comparison'] == comparison)]
        if not p_value_row.empty:
            comparison_pvalues.append(p_value_row['p_value_corrected'].values[0])
        else:
            comparison_pvalues.append(None)
            print('ERROR')

    # Check if there is data for the current label
    if not filtered_data.empty:
        plt.figure(figsize=(14, 8))

        # Create the boxplot with two groups (Young and Old)
        ax = sns.boxplot(
            x='Group',
            y=metric,
            hue='Ratio',
            data=filtered_data,
            palette=neutral_palette + neutral_palette,
            dodge=True
        )

        # Annotate all comparisons
        annotator = Annotator(ax, comparison_pairs, data=filtered_data, x='Group', y=metric, hue='Ratio')
        annotator.set_pvalues(comparison_pvalues)
        annotator.configure(text_format="star", loc="inside", fontsize=12)
        annotator.annotate()

        # Customize the plot
        plt.title(f'{label}', fontsize=20, fontweight='bold', pad=20)
        plt.xlabel('Age Group', fontsize=18)
        plt.ylabel('AUROC', fontsize=18)
        # plt.legend(title='Training Set Schoolage Ratio (%)', loc='upper right')
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.subplots_adjust(top=0.85)
        ax.legend_.remove()


        plt.tight_layout()

        # Show the plot
        plt.savefig(parent + f'no_finding/results/age/analysis/plot/age/{metric}_YoungvsOld_{label_name}.jpeg')
        plt.show()
    else:
        print(f"No data available for label: {label}")
