In [16]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from PIL import Image

from scripts.preprocessing import xdog, filter_out_diaphragm, scale_range, histogram_equalization

In [17]:
##### EDIT DIRECTORY VARIABLES
# Dataset root and the names of the folders that hold the raw source images
root = "S:/CheXpert"
source_train_folder_name = "CheXpert-v1.0 batch 4 (train 3)"
source_test_folder_name = "CheXpert-v1.0 batch 1 (validate & csv)"

# Folder names used to build the output roots further down
train_folder_name, test_folder_name = "train", "test"
train2_folder_name, test2_folder_name = "train2", "test2"

# Locations of the train/test csv files
train_filepath = f"{root}/train_data.csv"
test_filepath = f"{root}/test_data.csv"

#####

### Instantiate Variables

# Original root for the train/test csv files
base_path = "CheXpert-v1.0/train/"

# Source roots: where the raw image files are stored (patient folders are one level down)
source_train_root = f"{root}/{source_train_folder_name}/"
source_test_root  = f"{root}/{source_test_folder_name}/valid/"

# Output roots for the file paths being added to the train/test files
train_root = f"{root}/{train_folder_name}/"
train2_root = f"{root}/{train2_folder_name}/"
test_root = f"{root}/{test_folder_name}/"
test2_root = f"{root}/{test2_folder_name}/"


In [18]:
### Load the training/validation csvs
# Read both csv files in one pass; the order of the path tuple fixes which frame is which.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_filepath, test_filepath)
)

# Report the number of rows in each frame
print(f"# rows in train_df: {len(train_df)}")
print(f"# rows in test_df: {len(test_df)}")

# rows in train_df: 39371
# rows in test_df: 202


## Preprocessing Steps:

These are performed for both the regular sets and the preprocessed sets

* Scale the image values to the range [0, 255]
* Crop out any borders algorithmically
* Resize the training and validation images to 512x512
* Rotate each image by the multiple of 90 degrees that brings it closest to the average of a sample of 1000 x-ray images

These additional actions are performed for the preprocessed sets ONLY

* Adaptive mask to remove the diaphragm
* Increasing contrast via histogram equalization
* Edge detection (XDoG)? TODO: confirm whether this step is applied

In [19]:
### Preprocessing variables

### Value range for scaling image array
scale_min, scale_max = 0, 255

### Desired image dimensions for analysis
image_height, image_width = 512, 512

# difference_of_gaussians variables
dog_low_sigma = 0.5
dog_high_sigma = dog_low_sigma * 100  # i.e. 50
dog_truncate = 1

# custom gaussian variables
### k scales Gaussians - this increases contrast
xdog_k = 50
### sigma determines the initial blur strength - higher values discard finer details
xdog_sigma = 0.5
### tau scales the values of the second blurred image - higher values increase sharpness
xdog_tau = 10
### ep is the cutoff that determines what values become hard edges
xdog_ep = (scale_max - scale_min) * 0.75
### phi is a parameter in tanh that affects how much the darker areas can be seen - smaller values retain more detail
xdog_phi = 0.02
### threshold type: None, binary, tanh
xdog_threshold = "tanh"
### truncate: This determines the kernel of the Gaussian filter which affects smoothing
xdog_truncate = 1

# histogram equalization
he_sigma = 5

# filtering out diaphragm
mask_threshold = 0.9

# bilateral filtering for denoising
db_sigma = 1.1

### Regular sets

In [25]:
### Preprocessing steps for the regular test set
# For every test image: scale the values, crop the borders, resize to the
# analysis dimensions, apply the best rotation, and save the result as a JPEG.
#
# NOTE(review): `crop_borders`, `threshold_range`, `get_best_rotation` and
# `avg_img_arr` are not defined or imported in the cells visible above; they
# must already exist in the kernel for this cell to run. Confirm they are
# defined before this cell on a fresh Restart & Run All.
input_paths = test_df["test_file_path"]
output_paths = test_df["test2_file_path"]

for input_file_path, output_file_path in zip(input_paths, output_paths):
    with Image.open(input_file_path) as img:
        img_arr = np.array(img)
        img_arr = scale_range(img_arr, scale_min, scale_max)
        img_arr = crop_borders(img_arr, threshold_range)
        # PIL's Image.resize() expects the size as (width, height); passing
        # (height, width) would transpose the output for non-square targets.
        img_arr = np.array(Image.fromarray(img_arr).resize((image_width, image_height)))
        # The second return value (`dist`) is not used in this cell.
        img_arr, dist = get_best_rotation(img_arr, avg_img_arr)
        img = Image.fromarray(img_arr.astype(np.uint8))
        img.save(output_file_path, "JPEG", quality=90)

In [10]:
### Preprocessing steps for the regular training set (RUN THIS ONCE, MAY TAKE A FEW HOURS)
# For every training image: scale the values, crop the borders, resize to the
# analysis dimensions, apply the best rotation, and save the result as a JPEG.
#
# NOTE(review): `crop_borders`, `threshold_range`, `get_best_rotation` and
# `avg_img_arr` are not defined or imported in the cells visible above; they
# must already exist in the kernel for this cell to run. Confirm they are
# defined before this cell on a fresh Restart & Run All.
input_paths = train_df["train_file_path"]
output_paths = train_df["train2_file_path"]

for input_file_path, output_file_path in zip(input_paths, output_paths):
    with Image.open(input_file_path) as img:
        img_arr = np.array(img)
        img_arr = scale_range(img_arr, scale_min, scale_max)
        img_arr = crop_borders(img_arr, threshold_range)
        # PIL's Image.resize() expects the size as (width, height); passing
        # (height, width) would transpose the output for non-square targets.
        img_arr = np.array(Image.fromarray(img_arr).resize((image_width, image_height)))
        # The second return value (`dist`) is not used in this cell.
        img_arr, dist = get_best_rotation(img_arr, avg_img_arr)
        img = Image.fromarray(img_arr.astype(np.uint8))
        img.save(output_file_path, "JPEG", quality=90)