In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
chexpert_labels = pd.read_csv('/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 1 (validate & csv)/train.csv')

In [5]:
label_columns = chexpert_labels.columns[5:5+14]

## attach demo labels to per-image metadata

In [7]:
demo_data = pd.read_excel('/home/joseph/datasets/chexpertchestxrays-u20210408/CHEXPERT DEMO.xlsx')

In [8]:
# Step 1: Extract patient number from Path and create a new column 'patient_number'
chexpert_labels['patient_number'] = chexpert_labels['Path'].apply(lambda x: x.split('train/')[1].split('/')[0])

# Step 2: Define a function to look up race and ethnicity based on patient_number
def get_race_ethnicity(patient_number):
    data = demo_data[demo_data['PATIENT'] == patient_number][['PRIMARY_RACE', 'ETHNICITY']]
    if not data.empty:
        return data.iloc[0]['PRIMARY_RACE'], data.iloc[0]['ETHNICITY']
    else:
        return None, None

# Step 3: Apply the function to create 'race' and 'ethnicity' columns
chexpert_labels[['race', 'ethnicity']] = chexpert_labels['patient_number'].apply(
    lambda x: pd.Series(get_race_ethnicity(x))
)

## create pneumonia demo/test sets for race

In [10]:
# Define the labels for filtering
white_labels = ['White', 'White, non-Hispanic', 'White or Caucasian']
black_labels = ['Black or African American', 'Black, non-Hispanic']
non_hispanic_labels = ['Non-Hispanic/Non-Latino', 'Not Hispanic']

In [11]:
# Filter for black and white patients based on race and ethnicity
black_patients = chexpert_labels[
    (chexpert_labels['race'].isin(black_labels)) & (chexpert_labels['ethnicity'].isin(non_hispanic_labels))
]

white_patients = chexpert_labels[
    (chexpert_labels['race'].isin(white_labels)) & (chexpert_labels['ethnicity'].isin(non_hispanic_labels))
]

In [14]:
black_patients['binary_race'] = ['Black']*black_patients.shape[0]
white_patients['binary_race'] = ['White']*white_patients.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  black_patients['binary_race'] = ['Black']*black_patients.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  white_patients['binary_race'] = ['White']*white_patients.shape[0]


In [17]:
(white_patients.Pneumonia == 1).sum()

np.int64(3156)

In [18]:
(black_patients.Pneumonia == 1).sum()

np.int64(341)

In [19]:
# Split black patients by unique patient numbers into demo and test sets
black_patient_numbers = black_patients['patient_number'].unique()
black_demo_patients, black_test_patients = train_test_split(black_patient_numbers, test_size=0.5, random_state=42)
black_demo_df = black_patients[black_patients['patient_number'].isin(black_demo_patients)]
black_test_df = black_patients[black_patients['patient_number'].isin(black_test_patients)]

# Split white patients by unique patient numbers into demo and test sets
white_patient_numbers = white_patients['patient_number'].unique()
white_demo_patients, white_test_patients = train_test_split(white_patient_numbers, test_size=0.5, random_state=42)
white_demo_df = white_patients[white_patients['patient_number'].isin(white_demo_patients)]
white_test_df = white_patients[white_patients['patient_number'].isin(white_test_patients)]

In [23]:
(black_test_df.Pneumonia == 1).sum(), (black_demo_df.Pneumonia == 1).sum()

(np.int64(173), np.int64(168))

In [26]:
black_and_pneumonia_demo = black_demo_df[black_demo_df.Pneumonia == 1].sample(150, random_state=42)
black_noPNA_demo = black_demo_df[black_demo_df.Pneumonia != 1].sample(150, random_state=42)

black_and_pneumonia_test = black_test_df[black_test_df.Pneumonia == 1].sample(150, random_state=42)
black_noPNA_test = black_test_df[black_test_df.Pneumonia != 1].sample(150, random_state=42)

white_and_pneumonia_demo = white_demo_df[white_demo_df.Pneumonia == 1].sample(150, random_state=42)
white_noPNA_demo = white_demo_df[white_demo_df.Pneumonia != 1].sample(150, random_state=42)

white_and_pneumonia_test = white_test_df[white_test_df.Pneumonia == 1].sample(150, random_state=42)
white_noPNA_test = white_test_df[white_test_df.Pneumonia != 1].sample(150, random_state=42)

In [27]:
# Combine the sampled data from black and white patients for demo and test sets
final_demo_df = pd.concat([black_and_pneumonia_demo, black_noPNA_demo,
                           white_and_pneumonia_demo, white_noPNA_demo]).reset_index(drop=True)
final_test_df = pd.concat([black_and_pneumonia_test, black_noPNA_test,
                           white_and_pneumonia_test, white_noPNA_test]).reset_index(drop=True)

In [28]:
##
## update paths because it does the weird batching thing
##

import os
from tqdm import tqdm

directories = [
    "/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 2 (train 1)",
    "/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 3 (train 2)",
    "/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 4 (train 3)"
]

# Initialize a dictionary to hold the range of patient numbers for each directory
patient_ranges = {}

# Find the range of patient numbers in each directory
for directory in directories:
    patient_numbers = []
    for patient_dir in tqdm(os.listdir(directory)):
        if os.path.isdir(os.path.join(directory, patient_dir)):
            # Extract the patient number
            patient_number = int(patient_dir.replace('patient', ''))
            patient_numbers.append(patient_number)
    
    # Calculate the range of patient numbers for this directory
    if patient_numbers:
        patient_ranges[directory] = (min(patient_numbers), max(patient_numbers))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 21513/21513 [00:00<00:00, 305842.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 21504/21504 [00:00<00:00, 310514.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 21523/21523 [00:00<00:00, 305454.44it/s]


In [29]:
patient_ranges

{'/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 2 (train 1)': (1,
  21513),
 '/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 3 (train 2)': (21514,
  43017),
 '/home/joseph/datasets/chexpertchestxrays-u20210408/CheXpert-v1.0 batch 4 (train 3)': (43018,
  64540)}

In [30]:
# Function to update the path
def update_path(original_path):
    # Extract the patient number from the path
    patient_number = int(original_path.split('/')[2].replace('patient', ''))
    
    # Determine the correct directory based on the patient number
    for directory, (min_num, max_num) in patient_ranges.items():
        if min_num <= patient_number <= max_num:
            # Replace the base path with the correct directory
            new_path = os.path.join(directory, original_path.split('train/')[1])
            return new_path
    
    # If no match is found, return the original path
    return original_path

In [31]:
# Apply the function to the DataFrame
final_demo_df['updated_path'] = final_demo_df['Path'].apply(update_path)

In [32]:
# Apply the function to the DataFrame
final_test_df['updated_path'] = final_test_df['Path'].apply(update_path)

In [33]:
###
### create zips for jacob

import zipfile
# Base directory for relative paths
base_directory = '/home/joseph/datasets/chexpertchestxrays-u20210408/'

output_zip = os.path.join(base_directory, 'chexpert_binary_PNA_demo_df.zip')

# Create the output directory if it doesn't exist
os.makedirs(base_directory, exist_ok=True)

# Create a ZipFile object and add the files while preserving the directory structure
with zipfile.ZipFile(output_zip, 'w') as zipf:
    for file_path in final_demo_df['updated_path']:
        if os.path.isfile(file_path):  # Check if the file exists
            # Calculate the relative path for each file
            relative_path = os.path.relpath(file_path, base_directory)
            # Add the file to the zip with the relative path
            zipf.write(file_path, relative_path)

In [34]:
output_zip = os.path.join(base_directory, 'chexpert_binary_PNA_test_df.zip')

# Create the output directory if it doesn't exist
os.makedirs(base_directory, exist_ok=True)

# Create a ZipFile object and add the files while preserving the directory structure
with zipfile.ZipFile(output_zip, 'w') as zipf:
    for file_path in final_test_df['updated_path']:
        if os.path.isfile(file_path):  # Check if the file exists
            # Calculate the relative path for each file
            relative_path = os.path.relpath(file_path, base_directory)
            # Add the file to the zip with the relative path
            zipf.write(file_path, relative_path)

In [35]:
final_test_df.to_csv(base_directory+'chexpert_binaryPNA_test_df_labels.csv')

In [36]:
final_demo_df.to_csv(base_directory+'chexpert_binaryPNA_demo_df_labels.csv')

In [37]:
final_test_df

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,...,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,patient_number,race,ethnicity,binary_race,updated_path
0,CheXpert-v1.0/train/patient50694/study1/view2_...,Female,77,Lateral,,,,1.0,1.0,,...,,,,,,patient50694,"Black, non-Hispanic",Non-Hispanic/Non-Latino,Black,/home/joseph/datasets/chexpertchestxrays-u2021...
1,CheXpert-v1.0/train/patient10775/study1/view1_...,Female,20,Frontal,PA,,,,,,...,,,,,,patient10775,Black or African American,Non-Hispanic/Non-Latino,Black,/home/joseph/datasets/chexpertchestxrays-u2021...
2,CheXpert-v1.0/train/patient22504/study1/view1_...,Female,24,Frontal,PA,,,,1.0,,...,,0.0,,,,patient22504,Black or African American,Non-Hispanic/Non-Latino,Black,/home/joseph/datasets/chexpertchestxrays-u2021...
3,CheXpert-v1.0/train/patient13994/study1/view2_...,Female,73,Lateral,,,,,,,...,,,,,,patient13994,Black or African American,Non-Hispanic/Non-Latino,Black,/home/joseph/datasets/chexpertchestxrays-u2021...
4,CheXpert-v1.0/train/patient28702/study9/view1_...,Male,60,Frontal,PA,,,1.0,-1.0,,...,,1.0,,,,patient28702,"Black, non-Hispanic",Non-Hispanic/Non-Latino,Black,/home/joseph/datasets/chexpertchestxrays-u2021...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,CheXpert-v1.0/train/patient36434/study2/view1_...,Female,67,Frontal,AP,,,,,,...,,,,,1.0,patient36434,White,Non-Hispanic/Non-Latino,White,/home/joseph/datasets/chexpertchestxrays-u2021...
596,CheXpert-v1.0/train/patient11447/study15/view1...,Male,48,Frontal,AP,,,,1.0,,...,0.0,1.0,,,1.0,patient11447,White,Non-Hispanic/Non-Latino,White,/home/joseph/datasets/chexpertchestxrays-u2021...
597,CheXpert-v1.0/train/patient19629/study2/view1_...,Male,51,Frontal,AP,,,,,,...,,1.0,,,1.0,patient19629,"White, non-Hispanic",Non-Hispanic/Non-Latino,White,/home/joseph/datasets/chexpertchestxrays-u2021...
598,CheXpert-v1.0/train/patient14860/study1/view2_...,Female,55,Lateral,,,0.0,,1.0,1.0,...,,0.0,,,,patient14860,White,Non-Hispanic/Non-Latino,White,/home/joseph/datasets/chexpertchestxrays-u2021...
