In [1]:
import numpy as np
import pandas as pd
import os
from scripts.prepare_datasets import copy_directory_structure, get_valid_paths
from scripts.preprocessing import scale_range, histogram_equalization

# Setting up the environment and file folders

In [2]:
# --- Main Directory: contains all folders/files
root = "S:/CheXpert/"
# --- This is the original root listed on the csv file paths
old_root = "CheXpert-v1.0/train/"
old_test_root = "test/"

# --- Input directory variables
source_train_root = f"{root}raw_data/CheXpert-v1.0 batch 4 (train 3)/"
source_valid_root = f"{root}raw_data/CheXpert-v1.0 batch 1 (validate & csv)/valid/"
source_test_root = f"{root}raw_data/test/"
train_labels = f"{root}raw_data/train_cheXbert.csv"
valid_labels = f"{root}raw_data/CheXpert-v1.0 batch 1 (validate & csv)/valid.csv"
test_labels = f"{root}raw_data/test_labels.csv"

# --- Output directory variables
train_root = f"{root}train/"
valid_root = f"{root}valid/"
test_root = f"{root}test/"

# --- Image sizes
dims = [224, 384, 512]

In [3]:
# --- Create new directories if necessary
copy_directory_structure(source_train_root, train_root)
copy_directory_structure(source_valid_root, valid_root)
copy_directory_structure(source_test_root, test_root)

In [4]:
# --- Load the training/validation csvs
train_df = pd.read_csv(train_labels)
valid_df = pd.read_csv(valid_labels)
test_df = pd.read_csv(test_labels)

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in valid_df: {len(valid_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 223414
# rows in valid_df: 234
# rows in test_df: 668


In [5]:
# --- Filter the training and validation data frames

# Loop through the directories and get paths that exist
valid_train_paths = get_valid_paths(source_train_root)
# Extract the file paths from the old_root
train_df["floating_file_path"] = train_df["Path"].str[len(old_root):]
# Add new file paths
train_df["source_file_path"] = source_train_root + train_df["floating_file_path"]
for dim in dims:
    dim, col1, col2 = str(dim), f"base{dim}_file_path", f"base{dim}_file_path2"
    train_df[col1] = train_root + train_df["floating_file_path"].str[:-4] + f"_{dim}.jpg"
    train_df[col2] = train_root + train_df[col1].str[:-4] + "_usm.jpg"
# Filter the dfs for only valid paths from the source
train_df = train_df[train_df["source_file_path"].isin(valid_train_paths)]

valid_validation_paths = get_valid_paths(source_valid_root)
valid_df["floating_file_path"] = valid_df["Path"].str[len(old_root):]
valid_df["source_file_path"] = source_valid_root + valid_df["floating_file_path"]
for dim in dims:
    dim, col1, col2 = str(dim), f"base{dim}_file_path", f"base{dim}_file_path2"
    valid_df[col1] = valid_root + valid_df["floating_file_path"].str[:-4] + f"_{dim}.jpg"
    valid_df[col2] = valid_root + valid_df[col1].str[:-4] + "_usm.jpg"
valid_df = valid_df[valid_df["source_file_path"].isin(valid_validation_paths)]

valid_test_paths = get_valid_paths(source_test_root)
test_df["floating_file_path"] = test_df["Path"].str[len(old_test_root):]
test_df["source_file_path"] = source_test_root + test_df["floating_file_path"]
for dim in dims:
    dim, col1, col2 = str(dim), f"base{dim}_file_path", f"base{dim}_file_path2"
    test_df[col1] = test_root + test_df["floating_file_path"].str[:-4] + f"_{dim}.jpg"
    test_df[col2] = test_root + test_df[col1].str[:-4] + "_usm.jpg"
test_df = test_df[test_df["source_file_path"].isin(valid_test_paths)]

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in valid_df: {len(valid_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 40977
# rows in valid_df: 234
# rows in test_df: 668


In [6]:
### Filter both train and valid dfs for only specific variable values
train_df = train_df[train_df["Frontal/Lateral"]=="Frontal"].reset_index(drop=True)
valid_df = valid_df[valid_df["Frontal/Lateral"]=="Frontal"].reset_index(drop=True)
test_df = test_df[test_df["Path"].str.find("frontal") > 0].reset_index(drop=True)

### Filter out invalid images found during EDA
train_filter_out = [f"{source_train_root}patient48043/study1/view2_frontal.jpg",
                    f"{source_train_root}patient44163/study1/view1_frontal.jpg",
                    f"{source_train_root}patient60655/study1/view1_frontal.jpg",
                    f"{source_train_root}patient52670/study1/view1_frontal.jpg",
                    f"{source_train_root}patient56384/study1/view1_frontal.jpg",
                    f"{source_train_root}patient52150/study1/view1_frontal.jpg",
                    f"{source_train_root}patient46703/study2/view1_frontal.jpg",
                    f"{source_train_root}patient56024/study1/view1_frontal.jpg",
                    f"{source_train_root}patient49936/study1/view1_frontal.jpg",
                    f"{source_train_root}patient50284/study3/view1_frontal.jpg",
                    f"{source_train_root}patient55832/study2/view1_frontal.jpg",
                    f"{source_train_root}patient62069/study1/view1_frontal.jpg",
                    f"{source_train_root}patient43440/study3/view1_frontal.jpg"
                    ]

# The unfiltered version is for EDA purposes
train_df_unfiltered = train_df.copy()
train_df = train_df[~train_df["source_file_path"].isin(train_filter_out)]

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in train_df_unfiltered: {len(train_df_unfiltered)}")
print(f"# rows in valid_df: {len(valid_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 39358
# rows in train_df_unfiltered: 39371
# rows in valid_df: 202
# rows in test_df: 518


In [7]:
### Save new training, validation, test csvs
train_df.to_csv(f"{root}train_data.csv", index=False)
train_df_unfiltered.to_csv(f"{root}train_df_unfiltered.csv", index=False)
valid_df.to_csv(f"{root}valid_data.csv", index=False)
test_df.to_csv(f"{root}test_data.csv", index=False)

In [8]:
train_df.columns

Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices', 'No Finding', 'floating_file_path',
       'source_file_path', 'base224_file_path', 'base224_file_path2',
       'base384_file_path', 'base384_file_path2', 'base512_file_path',
       'base512_file_path2'],
      dtype='object')

In [9]:
train_df.head(5)

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,...,Support Devices,No Finding,floating_file_path,source_file_path,base224_file_path,base224_file_path2,base384_file_path,base384_file_path2,base512_file_path,base512_file_path2
0,CheXpert-v1.0/train/patient43018/study1/view1_...,Female,53,Frontal,AP,,,1.0,,,...,,,patient43018/study1/view1_frontal.jpg,S:/CheXpert/raw_data/CheXpert-v1.0 batch 4 (tr...,S:/CheXpert/train/patient43018/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43018/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43018/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...
1,CheXpert-v1.0/train/patient43018/study2/view1_...,Female,53,Frontal,AP,,,1.0,,,...,,,patient43018/study2/view1_frontal.jpg,S:/CheXpert/raw_data/CheXpert-v1.0 batch 4 (tr...,S:/CheXpert/train/patient43018/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43018/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43018/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...
2,CheXpert-v1.0/train/patient43019/study2/view1_...,Male,77,Frontal,AP,,,1.0,,,...,,,patient43019/study2/view1_frontal.jpg,S:/CheXpert/raw_data/CheXpert-v1.0 batch 4 (tr...,S:/CheXpert/train/patient43019/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study2/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...
3,CheXpert-v1.0/train/patient43019/study1/view1_...,Male,77,Frontal,AP,,,1.0,,,...,1.0,,patient43019/study1/view1_frontal.jpg,S:/CheXpert/raw_data/CheXpert-v1.0 batch 4 (tr...,S:/CheXpert/train/patient43019/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study1/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...
4,CheXpert-v1.0/train/patient43019/study3/view1_...,Male,77,Frontal,AP,,,1.0,,0.0,...,1.0,,patient43019/study3/view1_frontal.jpg,S:/CheXpert/raw_data/CheXpert-v1.0 batch 4 (tr...,S:/CheXpert/train/patient43019/study3/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study3/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...,S:/CheXpert/train/patient43019/study3/view1_fr...,S:/CheXpert/train/S:/CheXpert/train/patient430...


In [10]:
train_df.describe()

Unnamed: 0,Age,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
count,39358.0,7113.0,7457.0,19871.0,1794.0,16341.0,11975.0,4490.0,13902.0,14620.0,21073.0,705.0,2203.0,22410.0,3556.0
mean,62.774099,-0.185013,0.547405,0.952745,0.698997,0.479346,-0.260877,-0.633853,-0.013092,0.150616,0.641247,0.192908,0.650023,0.96091,1.0
std,18.57616,0.735042,0.655275,0.2217,0.68587,0.74365,0.753978,0.722002,0.99613,0.436261,0.590268,0.976837,0.552908,0.195646,0.0
min,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
25%,51.0,-1.0,0.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,1.0,1.0
50%,64.0,0.0,1.0,1.0,1.0,1.0,0.0,-1.0,-1.0,0.0,1.0,1.0,1.0,1.0,1.0
75%,77.0,0.0,1.0,1.0,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
max,108.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39358 entries, 0 to 39370
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Path                        39358 non-null  object 
 1   Sex                         39358 non-null  object 
 2   Age                         39358 non-null  int64  
 3   Frontal/Lateral             39358 non-null  object 
 4   AP/PA                       39358 non-null  object 
 5   Enlarged Cardiomediastinum  7113 non-null   float64
 6   Cardiomegaly                7457 non-null   float64
 7   Lung Opacity                19871 non-null  float64
 8   Lung Lesion                 1794 non-null   float64
 9   Edema                       16341 non-null  float64
 10  Consolidation               11975 non-null  float64
 11  Pneumonia                   4490 non-null   float64
 12  Atelectasis                 13902 non-null  float64
 13  Pneumothorax                14620 no