In [1]:
import numpy as np
import pandas as pd
import os
from scripts.prepare_datasets import copy_directory_structure, get_valid_paths
from scripts.preprocessing import scale_range, histogram_equalization

# Setting up the environment and file folders

In [2]:
##### EDIT DIRECTORY VARIABLES
root = "S:/CheXpert"
source_train_folder_name = "CheXpert-v1.0 batch 4 (train 3)"
source_test_folder_name = "CheXpert-v1.0 batch 1 (validate & csv)"
train_labels_file_name = "train_cheXbert.csv"
test_labels_file_name = "valid.csv"

train_folder_name = "train"
test_folder_name = "test"
train2_folder_name = "train2"
test2_folder_name = "test2"
#####

### Instantiate Variables

# Source: This is where the raw image files are stored (the next level are patient folders)
source_train_root = f"{root}/{source_train_folder_name}/"
source_test_root  = f"{root}/{source_test_folder_name}/valid/"

# These are the output roots for file paths being added to the train/test files
train_root = f"{root}/{train_folder_name}/"
test_root = f"{root}/{test_folder_name}/"
train2_root = f"{root}/{train2_folder_name}/"
test2_root = f"{root}/{test2_folder_name}/"

# These are the file paths to the labels
train_file_path = f"{root}/{train_labels_file_name}"
test_file_path = f"{root}/{source_test_folder_name}/{test_labels_file_name}"

# This is the original root for the train/test csv files
base_path = "CheXpert-v1.0/train/"


In [3]:
### Create new directories if necessary
copy_directory_structure(source_train_root, train_root)
copy_directory_structure(source_test_root, test_root)
copy_directory_structure(source_train_root, train2_root)
copy_directory_structure(source_test_root, test2_root)

In [4]:
### Load the training/validation csvs
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 223414
# rows in test_df: 234


In [5]:
### Filter the training and validation data frames
# Loop through the directories and get paths that exist
valid_train_paths = get_valid_paths(source_train_root)
valid_test_paths = get_valid_paths(source_test_root)

# Extract the file paths from the base path
train_df["floating_file_path"] = train_df["Path"].str[len(base_path):]

test_df["floating_file_path"] = test_df["Path"].str[len(base_path):]

# Add new file paths
train_df["source_file_path"] = source_train_root + train_df["floating_file_path"]
train_df["train_file_path"] = train_root + train_df["floating_file_path"]
train_df["train2_file_path"] = train2_root + train_df["floating_file_path"]

test_df["source_file_path"] = source_test_root + test_df["floating_file_path"]
test_df["test_file_path"] = test_root + test_df["floating_file_path"]
test_df["test2_file_path"] = test2_root + test_df["floating_file_path"]

# # Filter the dfs for only valid paths from the source
train_df = train_df[train_df["source_file_path"].isin(valid_train_paths)]
test_df = test_df[test_df["source_file_path"].isin(valid_test_paths)]

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 40977
# rows in test_df: 234


In [6]:
### Filter both train and valid dfs for only specific variable values
train_df = train_df[train_df["Frontal/Lateral"]=="Frontal"].reset_index(drop=True)
test_df = test_df[test_df["Frontal/Lateral"]=="Frontal"].reset_index(drop=True)

### Filter out invalid images found during EDA
# train_filter_out = [f"{source_train_root}patient48043/study1/view2_frontal.jpg", 
#                     f"{source_train_root}patient44163/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient53320/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient57976/study1/view2_frontal.jpg"
#                     f"{source_train_root}patient55832/study2/view1_frontal.jpg",
#                     f"{source_train_root}patient64539/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient46703/study2/view1_frontal.jpg",
#                     f"{source_train_root}patient50284/study3/view1_frontal.jpg",
#                     f"{source_train_root}patient46278/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient54044/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient43440/study3/view1_frontal.jpg",
#                     f"{source_train_root}patient48945/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient60655/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient56384/study1/view1_frontal.jpg",
#                     f"{source_train_root}patient45258/study3/view1_frontal.jpg",
#                     f"{source_train_root}patient46319/study3/view1_frontal.jpg",
#                     f"{source_train_root}patient51052/study1/view2_frontal.jpg",
#                     f"{source_train_root}patient51479/study1/view1_frontal.jpg"
#                     ]

train_df = train_df[~train_df["source_file_path"].isin(train_filter_out)]

print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in test_df: {len(test_df)}")

NameError: name 'train_filter_out' is not defined

In [None]:
### Save new training and validation csvs
train_df.to_csv(f"{root}/train_data.csv", index=False)

# Rename the validation csv to test
test_df.to_csv(f"{root}/test_data.csv", index=False)