In [2]:
# Create 25 per-seed metadata files for VinDR
import pandas as pd
import numpy as np
import os

# Project root; the two metadata folders below are derived from it.
parent = '/home/jkim/research/peds_cxr/'
metadata_source = f'{parent}peds_cxr_metadata/processed_metadata/'
output_path = f'{parent}peds_cxr_metadata/final_metadata/'

# The 25 fixed seeds; the loops further down write one metadata file per seed.
predefined_seeds = [
    9, 17, 18, 29, 30,
    34, 41, 42, 51, 52,
    61, 66, 74, 75, 78,
    81, 84, 86, 87, 89,
    90, 92, 96, 98, 99,
]
# Seed the global NumPy RNG before the initial train/val/test draw.
np.random.seed(42)

# Load the data
vindr_metadata = pd.read_csv(metadata_source + 'vindr_14labels.csv')

# Perform the 70-10-20 train-val-test split.
# Every row gets its label from a single random draw, so the realised split
# sizes are only approximately 70/10/20. The column is created directly from
# the drawn strings instead of starting as a float NaN column and being filled
# afterwards, which relied on pandas silently upcasting float -> object.
# NOTE(review): this split is per row, whereas the NIH cell splits by patient
# ID - confirm VinDR has no repeated patients across rows.
vindr_metadata['Set'] = np.random.choice(
    ['train', 'val', 'test'], size=len(vindr_metadata), p=[0.7, 0.1, 0.2]
)

# The destination does not depend on the seed, so create it once up front.
output_folder = os.path.join(parent, "25variation/metadata/preprocessed/vindr/")
os.makedirs(output_folder, exist_ok=True)

# Create 25 different combinations of training and validation sets
for seed in predefined_seeds:
    np.random.seed(seed)
    vindr_metadata_copy = vindr_metadata.copy()

    # Re-draw only the rows currently labelled 'train' or 'val'; 'test' rows
    # are left untouched, so the test set is the same for every seed.
    # 0.875 / 0.125 of the train+val pool corresponds to 70 / 10 of the total.
    mask_train_val = vindr_metadata_copy['Set'].isin(['train', 'val'])
    vindr_metadata_copy.loc[mask_train_val, 'Set'] = np.random.choice(
        ['train', 'val'], size=mask_train_val.sum(), p=[0.875, 0.125]
    )

    # Save the metadata
    vindr_metadata_copy.to_csv(output_folder + f"vindr_metadata_seed_{seed}.csv", index=False)

    print(f"VinDR metadata shape for seed {seed}: {vindr_metadata_copy.shape}")
    print(f"VinDR set count for seed {seed}:")
    print(vindr_metadata_copy['Set'].value_counts())

VinDR metadata shape for seed 9: (8268, 19)
VinDR set count for seed 9:
train    5821
test     1637
val       810
Name: Set, dtype: int64
VinDR metadata shape for seed 17: (8268, 19)
VinDR set count for seed 17:
train    5874
test     1637
val       757
Name: Set, dtype: int64
VinDR metadata shape for seed 18: (8268, 19)
VinDR set count for seed 18:
train    5836
test     1637
val       795
Name: Set, dtype: int64
VinDR metadata shape for seed 29: (8268, 19)
VinDR set count for seed 29:
train    5776
test     1637
val       855
Name: Set, dtype: int64
VinDR metadata shape for seed 30: (8268, 19)
VinDR set count for seed 30:
train    5789
test     1637
val       842
Name: Set, dtype: int64
VinDR metadata shape for seed 34: (8268, 19)
VinDR set count for seed 34:
train    5847
test     1637
val       784
Name: Set, dtype: int64
VinDR metadata shape for seed 41: (8268, 19)
VinDR set count for seed 41:
train    5833
test     1637
val       798
Name: Set, dtype: int64
VinDR metadata shape f

In [3]:
# Create 25 per-seed metadata files for NIH
import pandas as pd
import os
import numpy as np

# Read the NIH label table from the processed-metadata folder.
nih_metadata = pd.read_csv(f"{metadata_source}nih_14labels.csv")

# Extract patient IDs
def extract_patient_id(image_name):
    """Return the portion of ``image_name`` that precedes its first underscore.

    If the name contains no underscore, the whole string is returned.
    """
    patient_id, _, _ = image_name.partition('_')
    return patient_id

# One patient ID per image row, then the distinct IDs in order of appearance.
nih_metadata['patient_ids'] = nih_metadata['Image Index'].map(extract_patient_id)
unique_patient_ids = nih_metadata['patient_ids'].unique()

# Fixed-seed in-place shuffle: the slices taken below are the patient-level split.
np.random.seed(42)
np.random.shuffle(unique_patient_ids)

# 70-10-20 train-val-test, counted in patients (not images);
# the test share is whatever remains after rounding train and val.
n_total = len(unique_patient_ids)
n_train, n_val = round(n_total * 0.7), round(n_total * 0.1)
n_test = n_total - (n_train + n_val)

val_end = n_train + n_val
train_ids = unique_patient_ids[:n_train]
val_ids = unique_patient_ids[n_train:val_end]
test_ids = unique_patient_ids[val_end:]

# Assign 'Set' to the DataFrame based on patient IDs.
# `train_ids` / `val_ids` are NumPy arrays, so `x in array` scans the whole
# array on every call; build hash sets once so each row is a constant-time lookup.
_train_id_lookup = set(train_ids)
_val_id_lookup = set(val_ids)


def assign_set(patient_id):
    """Return the split name for one patient ID.

    Returns 'train' or 'val' when the ID belongs to the corresponding group;
    any other ID is labelled 'test'.
    """
    if patient_id in _train_id_lookup:
        return 'train'
    if patient_id in _val_id_lookup:
        return 'val'
    return 'test'


nih_metadata['Set'] = nih_metadata['patient_ids'].apply(assign_set)


# Create 25 different combinations of training and validation sets.
# The destination does not depend on the seed, so create it once up front.
output_folder = os.path.join(parent, "25variation/metadata/preprocessed/nih/")
os.makedirs(output_folder, exist_ok=True)

for seed in predefined_seeds:
    np.random.seed(seed)
    nih_metadata_copy = nih_metadata.copy()

    # Pool the patients currently in 'train' or 'val' and reshuffle them;
    # patients in 'test' are never part of the pool, so the test set is fixed.
    mask_train_val = nih_metadata_copy['Set'].isin(['train', 'val'])
    shuffled_train_val_ids = nih_metadata_copy.loc[mask_train_val, 'patient_ids'].unique()
    np.random.shuffle(shuffled_train_val_ids)

    # Re-assign 'train' and 'val' by patient: the first 87.5% of the shuffled
    # pool becomes train, the rest val.
    n_train_new = round(len(shuffled_train_val_ids) * 0.875)
    train_ids_new = shuffled_train_val_ids[:n_train_new]
    val_ids_new = shuffled_train_val_ids[n_train_new:]

    # Vectorised membership test instead of a per-row `in ndarray` scan.
    # Precedence matches the former if/elif/else: train, then val, else test.
    patient_ids = nih_metadata_copy['patient_ids']
    is_train = patient_ids.isin(train_ids_new).to_numpy()
    is_val = patient_ids.isin(val_ids_new).to_numpy()
    nih_metadata_copy['Set'] = np.where(is_train, 'train', np.where(is_val, 'val', 'test'))

    # The helper column is not written to the per-seed file.
    nih_metadata_copy = nih_metadata_copy.drop(columns=['patient_ids'])

    # Save the metadata
    nih_metadata_copy.to_csv(output_folder + f"nih_metadata_seed_{seed}.csv", index=False)

    print(f"NIH metadata shape for seed {seed}: {nih_metadata_copy.shape}")
    print(f"NIH set count for seed {seed}:")
    print(nih_metadata_copy['Set'].value_counts())


NIH metadata shape for seed 9: (5241, 19)
NIH set count for seed 9:
train    3467
test     1156
val       618
Name: Set, dtype: int64
NIH metadata shape for seed 17: (5241, 19)
NIH set count for seed 17:
train    3409
test     1156
val       676
Name: Set, dtype: int64
NIH metadata shape for seed 18: (5241, 19)
NIH set count for seed 18:
train    3569
test     1156
val       516
Name: Set, dtype: int64
NIH metadata shape for seed 29: (5241, 19)
NIH set count for seed 29:
train    3712
test     1156
val       373
Name: Set, dtype: int64
NIH metadata shape for seed 30: (5241, 19)
NIH set count for seed 30:
train    3558
test     1156
val       527
Name: Set, dtype: int64
NIH metadata shape for seed 34: (5241, 19)
NIH set count for seed 34:
train    3680
test     1156
val       405
Name: Set, dtype: int64
NIH metadata shape for seed 41: (5241, 19)
NIH set count for seed 41:
train    3541
test     1156
val       544
Name: Set, dtype: int64
NIH metadata shape for seed 42: (5241, 19)
NIH set

In [4]:
# Create the aggregate (NIH + VinDR) metadata file for each seed
import pandas as pd
import os
import numpy as np

parent = '/home/jkim/research/peds_cxr/'
# From here on `metadata_source` / `output_path` point at the per-seed tree,
# not at the processed-metadata folder used by the earlier cells.
metadata_source = f"{parent}25variation/metadata/preprocessed/"
output_path = f"{parent}25variation/metadata/preprocessed/aggregate/"

# Make sure the destination folder is there before writing
if not os.path.exists(output_path):
    os.makedirs(output_path)

# For every seed, stack the NIH rows on top of the VinDR rows
for seed in predefined_seeds:
    nih_metadata_path = metadata_source + f"nih/nih_metadata_seed_{seed}.csv"
    vindr_metadata_path = metadata_source + f"vindr/vindr_metadata_seed_{seed}.csv"

    nih_metadata, vindr_metadata = (
        pd.read_csv(csv_path) for csv_path in (nih_metadata_path, vindr_metadata_path)
    )

    # ignore_index gives the combined frame a fresh 0..n-1 index
    aggregate_metadata = pd.concat([nih_metadata, vindr_metadata], ignore_index=True)

    aggregate_metadata.to_csv(f"{output_path}aggregate_metadata_seed_{seed}.csv", index=False)

    set_counts = aggregate_metadata['Set'].value_counts()
    print(f"Aggregate metadata shape for seed {seed}: {aggregate_metadata.shape}")
    print(f"Aggregate set count for seed {seed}:")
    print(set_counts)


Aggregate metadata shape for seed 9: (13509, 19)
Aggregate set count for seed 9:
train    9288
test     2793
val      1428
Name: Set, dtype: int64
Aggregate metadata shape for seed 17: (13509, 19)
Aggregate set count for seed 17:
train    9283
test     2793
val      1433
Name: Set, dtype: int64
Aggregate metadata shape for seed 18: (13509, 19)
Aggregate set count for seed 18:
train    9405
test     2793
val      1311
Name: Set, dtype: int64
Aggregate metadata shape for seed 29: (13509, 19)
Aggregate set count for seed 29:
train    9488
test     2793
val      1228
Name: Set, dtype: int64
Aggregate metadata shape for seed 30: (13509, 19)
Aggregate set count for seed 30:
train    9347
test     2793
val      1369
Name: Set, dtype: int64
Aggregate metadata shape for seed 34: (13509, 19)
Aggregate set count for seed 34:
train    9527
test     2793
val      1189
Name: Set, dtype: int64
Aggregate metadata shape for seed 41: (13509, 19)
Aggregate set count for seed 41:
train    9374
test     27

In [5]:
# Dropping low-prevalence labels from the per-seed VinDR metadata
# (the original note gives a 2% threshold; the prevalence calculation itself
# is not part of this notebook).
# For each seed: remove the rows whose only positive findings are among the
# dropped labels, then remove the dropped label columns.
output_folder = os.path.join(parent, "25variation/metadata/final/vindr/")
os.makedirs(output_folder, exist_ok=True)

# List of all disease labels
diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
            'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
            'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# Labels being removed. The order here is the key order of the printed counts.
dropped_labels = ['Fibrosis', 'Hernia', 'Atelectasis', 'Edema', 'Effusion',
                  'Emphysema', 'Pleural_Thickening', 'Pneumothorax']
kept_labels = [disease for disease in diseases if disease not in dropped_labels]

for seed in predefined_seeds:
    vindr_path = parent + f'25variation/metadata/preprocessed/vindr/vindr_metadata_seed_{seed}.csv'
    vindr_metadata = pd.read_csv(vindr_path)
    print('VinDR metadata shape before drop', vindr_metadata.shape)

    # A row is dropped when at least one dropped label equals 1 and every
    # kept label equals 0.
    dropped_positive = vindr_metadata[dropped_labels].eq(1)
    drop_mask = dropped_positive.any(axis=1) & vindr_metadata[kept_labels].eq(0).all(axis=1)

    # Per dropped label, how many of the removed rows were positive for it.
    dropped_records_count = {
        label: int(dropped_positive.loc[drop_mask, label].sum())
        for label in dropped_labels
    }

    # Drop the identified rows
    vindr_metadata = vindr_metadata.loc[~drop_mask]
    print('VinDR metadata shape after drop', vindr_metadata.shape)

    # Drop the specified columns
    vindr_metadata = vindr_metadata.drop(columns=dropped_labels)
    print('metadata shape after columns drop', vindr_metadata.shape)

    # Save the metadata
    vindr_metadata.to_csv(output_folder + f"vindr_metadata_seed_{seed}.csv", index=False)

    # Print dropped records count
    print(f"Dropped records count for seed {seed}: {dropped_records_count}")

VinDR metadata shape before drop (8268, 19)
VinDR metadata shape after drop (8256, 19)
metadata shape after columns drop (8256, 11)
Dropped records count for seed 9: {'Fibrosis': 0, 'Hernia': 2, 'Atelectasis': 4, 'Edema': 0, 'Effusion': 3, 'Emphysema': 2, 'Pleural_Thickening': 0, 'Pneumothorax': 1}
VinDR metadata shape before drop (8268, 19)
VinDR metadata shape after drop (8256, 19)
metadata shape after columns drop (8256, 11)
Dropped records count for seed 17: {'Fibrosis': 0, 'Hernia': 2, 'Atelectasis': 4, 'Edema': 0, 'Effusion': 3, 'Emphysema': 2, 'Pleural_Thickening': 0, 'Pneumothorax': 1}
VinDR metadata shape before drop (8268, 19)
VinDR metadata shape after drop (8256, 19)
metadata shape after columns drop (8256, 11)
Dropped records count for seed 18: {'Fibrosis': 0, 'Hernia': 2, 'Atelectasis': 4, 'Edema': 0, 'Effusion': 3, 'Emphysema': 2, 'Pleural_Thickening': 0, 'Pneumothorax': 1}
VinDR metadata shape before drop (8268, 19)
VinDR metadata shape after drop (8256, 19)
metadata sh

In [8]:
import numpy as np
import pandas as pd
import os 

# Project root; the two metadata folders below are derived from it.
parent = '/home/jkim/research/peds_cxr/'
metadata_source = f'{parent}peds_cxr_metadata/processed_metadata/'
output_path = f'{parent}peds_cxr_metadata/final_metadata/'

# The 25 fixed seeds; one per-seed metadata file exists for each of them.
predefined_seeds = [
    9, 17, 18, 29, 30,
    34, 41, 42, 51, 52,
    61, 66, 74, 75, 78,
    81, 84, 86, 87, 89,
    90, 92, 96, 98, 99,
]
# Reset the global NumPy RNG to a known state.
np.random.seed(42)

# Dropping low-prevalence labels from the per-seed NIH metadata.
# NOTE(review): the original note here says "less than 1% prevalence" while the
# VinDR and aggregate cells say 2% - confirm which threshold is intended. The
# set of dropped labels is the same in all three cells.
# For each seed: remove the rows whose only positive findings are among the
# dropped labels, then remove the dropped label columns.
output_folder = os.path.join(parent, "25variation/metadata/final/nih/")
os.makedirs(output_folder, exist_ok=True)

# List of all disease labels
diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
            'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
            'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# Labels being removed. The order here is the key order of the printed counts.
dropped_labels = ['Fibrosis', 'Hernia', 'Edema', 'Emphysema', 'Pleural_Thickening',
                  'Atelectasis', 'Effusion', 'Pneumothorax']
kept_labels = [disease for disease in diseases if disease not in dropped_labels]

for seed in predefined_seeds:
    nih_path = parent + f'25variation/metadata/preprocessed/nih/nih_metadata_seed_{seed}.csv'
    nih_metadata = pd.read_csv(nih_path)
    print('NIH metadata shape before drop', nih_metadata.shape)

    # A row is dropped when at least one dropped label equals 1 and every
    # kept label equals 0.
    dropped_positive = nih_metadata[dropped_labels].eq(1)
    drop_mask = dropped_positive.any(axis=1) & nih_metadata[kept_labels].eq(0).all(axis=1)

    # Per dropped label, how many of the removed rows were positive for it.
    dropped_records_count = {
        label: int(dropped_positive.loc[drop_mask, label].sum())
        for label in dropped_labels
    }

    # Drop the identified rows
    nih_metadata = nih_metadata.loc[~drop_mask]
    print('NIH metadata shape after drop', nih_metadata.shape)

    # Drop the specified columns
    nih_metadata = nih_metadata.drop(columns=dropped_labels)
    print('metadata shape after columns drop', nih_metadata.shape)

    # Save the metadata
    nih_metadata.to_csv(output_folder + f"nih_metadata_seed_{seed}.csv", index=False)

    # Print dropped records count
    print(f"Dropped records count for seed {seed}: {dropped_records_count}")

NIH metadata shape before drop (5241, 19)
NIH metadata shape after drop (4668, 19)
metadata shape after columns drop (4668, 11)
Dropped records count for seed 9: {'Fibrosis': 9, 'Hernia': 1, 'Edema': 37, 'Emphysema': 76, 'Pleural_Thickening': 58, 'Atelectasis': 146, 'Effusion': 175, 'Pneumothorax': 204}
NIH metadata shape before drop (5241, 19)
NIH metadata shape after drop (4668, 19)
metadata shape after columns drop (4668, 11)
Dropped records count for seed 17: {'Fibrosis': 9, 'Hernia': 1, 'Edema': 37, 'Emphysema': 76, 'Pleural_Thickening': 58, 'Atelectasis': 146, 'Effusion': 175, 'Pneumothorax': 204}
NIH metadata shape before drop (5241, 19)
NIH metadata shape after drop (4668, 19)
metadata shape after columns drop (4668, 11)
Dropped records count for seed 18: {'Fibrosis': 9, 'Hernia': 1, 'Edema': 37, 'Emphysema': 76, 'Pleural_Thickening': 58, 'Atelectasis': 146, 'Effusion': 175, 'Pneumothorax': 204}
NIH metadata shape before drop (5241, 19)
NIH metadata shape after drop (4668, 19)


In [9]:
# Dropping low-prevalence labels from the per-seed aggregate metadata
# (the original note gives a 2% threshold; the prevalence calculation itself
# is not part of this notebook).
# For each seed: remove the rows whose only positive findings are among the
# dropped labels, then remove the dropped label columns.
output_folder = os.path.join(parent, "25variation/metadata/final/aggregate/")
os.makedirs(output_folder, exist_ok=True)

# List of all disease labels. Defined here so this cell no longer depends on
# `diseases` having been left behind by another cell's loop.
diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
            'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass/Nodule',
            'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# Labels being removed. The order here is the key order of the printed counts.
dropped_labels = ['Fibrosis', 'Hernia', 'Edema', 'Emphysema', 'Pleural_Thickening',
                  'Atelectasis', 'Effusion', 'Pneumothorax']
kept_labels = [disease for disease in diseases if disease not in dropped_labels]

for seed in predefined_seeds:
    aggregate_path = parent + f'25variation/metadata/preprocessed/aggregate/aggregate_metadata_seed_{seed}.csv'
    aggregate_metadata = pd.read_csv(aggregate_path)
    print('aggregate metadata shape before drop', aggregate_metadata.shape)

    # A row is dropped when at least one dropped label equals 1 and every
    # kept label equals 0.
    dropped_positive = aggregate_metadata[dropped_labels].eq(1)
    drop_mask = dropped_positive.any(axis=1) & aggregate_metadata[kept_labels].eq(0).all(axis=1)

    # Per dropped label, how many of the removed rows were positive for it.
    dropped_records_count = {
        label: int(dropped_positive.loc[drop_mask, label].sum())
        for label in dropped_labels
    }

    # Drop the identified rows
    aggregate_metadata = aggregate_metadata.loc[~drop_mask]
    print('aggregate metadata shape after drop', aggregate_metadata.shape)

    # Drop the specified columns
    aggregate_metadata = aggregate_metadata.drop(columns=dropped_labels)
    print('metadata shape after columns drop', aggregate_metadata.shape)

    # Save the metadata
    aggregate_metadata.to_csv(output_folder + f"aggregate_metadata_seed_{seed}.csv", index=False)

    # Print dropped records count
    print(f"Dropped records count for seed {seed}: {dropped_records_count}")

aggregate metadata shape before drop (13509, 19)
aggregate metadata shape after drop (12924, 19)
metadata shape after columns drop (12924, 11)
Dropped records count for seed 9: {'Fibrosis': 9, 'Hernia': 3, 'Edema': 37, 'Emphysema': 78, 'Pleural_Thickening': 58, 'Atelectasis': 150, 'Effusion': 178, 'Pneumothorax': 205}
aggregate metadata shape before drop (13509, 19)
aggregate metadata shape after drop (12924, 19)
metadata shape after columns drop (12924, 11)
Dropped records count for seed 17: {'Fibrosis': 9, 'Hernia': 3, 'Edema': 37, 'Emphysema': 78, 'Pleural_Thickening': 58, 'Atelectasis': 150, 'Effusion': 178, 'Pneumothorax': 205}
aggregate metadata shape before drop (13509, 19)
aggregate metadata shape after drop (12924, 19)
metadata shape after columns drop (12924, 11)
Dropped records count for seed 18: {'Fibrosis': 9, 'Hernia': 3, 'Edema': 37, 'Emphysema': 78, 'Pleural_Thickening': 58, 'Atelectasis': 150, 'Effusion': 178, 'Pneumothorax': 205}
aggregate metadata shape before drop (

In [20]:
# Creating 25 folders with symlinks for training
import os 
# Every per-seed folder gets an `images` symlink to the same image directory.
source_path = '/srv/store/jkim/peds_cxr/aggregate'

for dataset in ['vindr', 'nih', 'aggregate']:
    for seed in predefined_seeds:
        # e.g. <parent>/25variation/images/nih/nih_42
        variant_dir = os.path.join(parent, '25variation', 'images', dataset, f'{dataset}_{seed}')
        os.makedirs(variant_dir, exist_ok=True)

        symlink_path = os.path.join(variant_dir, 'images')

        # Re-running the cell is a no-op when the expected link is already there.
        if os.path.islink(symlink_path) and os.readlink(symlink_path) == source_path:
            continue

        try:
            os.symlink(source_path, symlink_path)
        except FileExistsError:
            # Something else already occupies the path (a real folder, or a
            # link to a different target). Leave it alone, but say so instead
            # of silently pretending the link is in place.
            print(f"Skipping {symlink_path}: path exists but is not a symlink to {source_path}")


In [21]:
# checking if any images are missing in the folder

import pandas as pd
import os

def check_images_existence(csv_file_path, directory):
    """List the 'Image Index' entries of a CSV that are not regular files in ``directory``.

    Parameters
    ----------
    csv_file_path : path of a CSV file with an 'Image Index' column.
    directory : folder in which each entry is looked up by name.

    Returns
    -------
    list
        The entries, in CSV order, for which ``os.path.isfile`` is false for
        ``directory/<entry>`` (so a directory with that name counts as missing).
    """
    image_names = pd.read_csv(csv_file_path)['Image Index']
    return [
        name
        for name in image_names
        if not os.path.isfile(os.path.join(directory, str(name)))
    ]

# Metadata CSV whose 'Image Index' entries are looked up on disk
csv_file_path = f"{parent}peds_cxr_metadata/processed_metadata/aggregate_14labels.csv"
# Directory containing the images
directory = '/srv/store/jkim/peds_cxr/aggregate/'

# Collect the entries that have no matching file
missing_images = check_images_existence(csv_file_path, directory)

# Report the total, plus up to ten examples when anything is missing
print(f"Number of missing files: {len(missing_images)}")
if len(missing_images) > 0:
    print("Sample of missing files:", missing_images[:10])

Number of missing files: 0
