In [2]:
import pandas as pd

# Load the aggregate metadata table; every row is counted as one image below.
metadata_csv_path = '/home/jkim/research/peds_cxr/25variation/metadata/final/aggregate/aggregate_metadata_seed_9.csv'
df = pd.read_csv(metadata_csv_path)

# Per-source cohorts, selected on the 'Dataset' column.
nih_df = df.loc[df['Dataset'].eq('NIH')]
vindr_df = df.loc[df['Dataset'].eq('VinDR')]

# Image totals: the number of rows in each cohort.
total_images_nih, total_images_vindr, total_images_aggregate = (
    len(cohort) for cohort in (nih_df, vindr_df, df)
)

# NOTE(review): "patients" are counted as distinct 'Image Index' values, i.e.
# distinct images rather than distinct patient identifiers. If the metadata
# carries a patient-identifier column, it should probably be used here — confirm.
total_patients_nih, total_patients_vindr, total_patients_aggregate = (
    cohort['Image Index'].nunique() for cohort in (nih_df, vindr_df, df)
)

# Row counts per 'Patient Gender' value for each cohort.
sex_distribution_nih, sex_distribution_vindr, sex_distribution_aggregate = (
    cohort['Patient Gender'].value_counts() for cohort in (nih_df, vindr_df, df)
)

# Function to calculate corrected age distribution
def calculate_age_distribution(df):
    """Count the rows of ``df`` that fall in each paediatric age band.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric 'Patient Age' column.

    Returns
    -------
    dict
        Maps the band labels "0-5", "6-9", "10-13" and "14-18" (in that
        order) to the number of rows whose age lies in the band.

    Notes
    -----
    Bands are half-open intervals ([0, 6), [6, 10), [10, 14), [14, 19)), so a
    fractional age such as 5.5 is counted in "0-5" instead of falling into the
    gap between two integer bounds. For whole-number ages the counts are the
    same as with inclusive integer bounds. Missing ages, negative ages and
    ages of 19 or more are not counted in any band.
    """
    ages = df['Patient Age']
    # (label, inclusive lower bound, exclusive upper bound)
    bands = (
        ("0-5", 0, 6),
        ("6-9", 6, 10),
        ("10-13", 10, 14),
        ("14-18", 14, 19),
    )
    # Summing a boolean mask counts its True entries; int() keeps the values
    # plain Python ints.
    return {
        label: int(((ages >= lower) & (ages < upper)).sum())
        for label, lower, upper in bands
    }

# Age-band counts for each cohort (NIH, VinDR, aggregate).
nih_age_distribution, vindr_age_distribution, aggregate_age_distribution = (
    calculate_age_distribution(cohort) for cohort in (nih_df, vindr_df, df)
)

# Mean and standard deviation of 'Patient Age' for each cohort.
nih_mean_age, vindr_mean_age, aggregate_mean_age = (
    cohort['Patient Age'].mean() for cohort in (nih_df, vindr_df, df)
)
nih_age_sd, vindr_age_sd, aggregate_age_sd = (
    cohort['Patient Age'].std() for cohort in (nih_df, vindr_df, df)
)

# Column-wise sums of the finding columns for each cohort.
labels = ['No Finding', 'Cardiomegaly', 'Consolidation', 'Infiltration', 'Mass/Nodule', 'Pneumonia']
labels_counts_nih, labels_counts_vindr, labels_counts_aggregate = (
    cohort[labels].sum() for cohort in (nih_df, vindr_df, df)
)


def _summary_column(n_images, n_patients, sex_counts, age_bands, mean_age, age_sd, label_totals):
    """Build one cohort's column of the table, in the row order of "Variable"."""
    column = [n_images, n_patients]
    # A missing sex code or age band is reported as 0.
    column += [sex_counts.get(code, 0) for code in ('F', 'M')]
    column += [age_bands.get(band, 0) for band in ('0-5', '6-9', '10-13', '14-18')]
    column.append(f"{mean_age:.2f} ± {age_sd:.2f}")
    column += [label_totals[name] for name in labels]
    return column


# Preparing data for CSV export: one "Variable" column of row names plus one
# column per cohort.
stats_dict = {
    "Variable": [
        "Total # of Images",
        "Total # of Patients",
        "Sex - Female",
        "Sex - Male",
        "Age 0-5",
        "Age 6-9",
        "Age 10-13",
        "Age 14-18",
        "Mean patient age ± SD",
        *labels,
    ],
    "VinDR": _summary_column(
        total_images_vindr, total_patients_vindr, sex_distribution_vindr,
        vindr_age_distribution, vindr_mean_age, vindr_age_sd, labels_counts_vindr,
    ),
    "NIH": _summary_column(
        total_images_nih, total_patients_nih, sex_distribution_nih,
        nih_age_distribution, nih_mean_age, nih_age_sd, labels_counts_nih,
    ),
    "Aggregate": _summary_column(
        total_images_aggregate, total_patients_aggregate, sex_distribution_aggregate,
        aggregate_age_distribution, aggregate_mean_age, aggregate_age_sd, labels_counts_aggregate,
    ),
}

# Convert the dictionary to a DataFrame and write it out without the index column.
stats_df = pd.DataFrame(stats_dict)
csv_file_path = '/home/jkim/research/peds_cxr/25variation/manuscript/demographics.csv'  # output location of the demographics table
stats_df.to_csv(csv_file_path, index=False)

print(f"Statistics exported to {csv_file_path}")


Statistics exported to /home/jkim/research/peds_cxr/25variation/manuscript/demographics.csv


In [2]:
# Report the Python and library versions used for this analysis.
# To run it outside the notebook, save it as version_info.py and execute it from a terminal or command prompt.

import sys
import scipy
import sklearn
import seaborn
import statsmodels

def show_versions():
    """Print the Python interpreter version followed by the versions of
    scipy, scikit-learn, seaborn and statsmodels, one per line."""
    report = (
        ("Python Version:", sys, "version"),
        ("scipy Version:", scipy, "__version__"),
        ("sklearn (scikit-learn) Version:", sklearn, "__version__"),
        ("seaborn Version:", seaborn, "__version__"),
        ("statsmodels Version:", statsmodels, "__version__"),
    )
    for heading, module, attribute in report:
        # Each version is looked up just before its line is printed.
        print(heading, getattr(module, attribute))

# Print the version report only when this code runs as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    show_versions()


Python Version: 3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]
scipy Version: 1.9.3
sklearn (scikit-learn) Version: 1.2.2
seaborn Version: 0.12.2
statsmodels Version: 0.14.1
