In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from PIL import Image, ImageEnhance
# from scipy.ndimage import convolve
from skimage.metrics import structural_similarity

from scripts.preprocessing import scale_range, crop_borders, get_best_rotation, histogram_equalization, unsharp_masking

# Initial Preprocessing Steps to make images easier to work with

In [2]:
# EDIT DIRECTORY VARIABLES AS NEEDED
# Every directory string below ends with "/" so that folder and file names can
# be appended directly inside the f-strings.
# --- Main Directory: contains all folders/files
root = "S:/CheXpert/"
# --- This is the original root listed on the csv file paths
old_root = "CheXpert-v1.0/train/"
old_test_root = "test/"

# --- Input directory variables
source_train_root = f"{root}raw_data/CheXpert-v1.0 batch 4 (train 3)/"
source_valid_root = f"{root}raw_data/CheXpert-v1.0 batch 1 (validate & csv)/valid/"
source_test_root = f"{root}raw_data/test/"
train_root = f"{root}train/"
valid_root = f"{root}valid/"
test_root = f"{root}test/"

# --- train/valid/test csv
# `root` already carries its trailing "/", so no extra separator is added here
# (adding one would yield "S:/CheXpert//train_data.csv").
train_filepath = f"{root}train_data.csv"
valid_filepath = f"{root}valid_data.csv"
test_filepath = f"{root}test_data.csv"

# --- Image sizes
# Edge lengths in pixels; each image is resized to (dim, dim) for every entry.
dims = [224, 384, 512]

In [3]:
# Preprocessing variables
# --- Value range for scaling image array
scale_min = 0
scale_max = 255
# First and third quartile of the integer values scale_min .. scale_max - 1
# (the upper bound itself is excluded, exactly as with range()).
crop_q1_threshold, crop_q3_threshold = np.quantile(
    np.arange(scale_min, scale_max),
    [0.25, 0.75],
)

# --- Thresholds for cropping borders
# NOTE(review): neither value is read anywhere in the visible notebook;
# presumably they are meant for crop_borders -- confirm before relying on them.
cutoff = 0.5
# 60% of the width of the scaled value range.
threshold_range = (scale_max - scale_min) * 0.6

In [4]:
# Load the train/valid/test csvs (read in that order) and report their sizes
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_filepath, valid_filepath, test_filepath)
)

for frame_name, frame in (
    ("train_df", train_df),
    ("valid_df", valid_df),
    ("test_df", test_df),
):
    print(f"# rows in {frame_name}: {len(frame)}")

# rows in train_df: 39358
# rows in valid_df: 202
# rows in test_df: 518


In [5]:
# def pipeline1(img_arr, kernel, he_sigma, scale_min, scale_max):
#     """Scale the range, sharpen by convolving with a kernel, then equalize the histogram"""
#     output = scale_range(img_arr, scale_min, scale_max)
#     output = convolve(output, kernel)
#     output = histogram_equalization(output, scale_min, scale_max, he_sigma)
#     return output

def pipeline2(img_arr, weight, usm_sigma, he_sigma, scale_min, scale_max):
    """Sharpen an image via unsharp masking, then equalize its histogram.

    Parameters
    ----------
    img_arr : image array handed to ``unsharp_masking``.
    weight, usm_sigma : forwarded to ``unsharp_masking``.
    he_sigma : forwarded to ``histogram_equalization``.
    scale_min, scale_max : value-range bounds forwarded to both helpers.

    Returns
    -------
    Whatever ``histogram_equalization`` returns for the sharpened image.

    NOTE(review): no separate range-scaling call happens here; presumably
    ``unsharp_masking`` rescales internally since it receives the bounds --
    confirm in scripts.preprocessing.
    """
    sharpened = unsharp_masking(img_arr, usm_sigma, weight, scale_min, scale_max)
    return histogram_equalization(sharpened, scale_min, scale_max, he_sigma)

In [6]:
# Image enhancement variables (the arguments handed to pipeline2)
# --- sigma and weight that pipeline2 forwards to unsharp_masking
usm_sigma, weight = 10, 1.2
# --- sigma that pipeline2 forwards to histogram_equalization
he_sigma = 5

## Preprocessing Steps:

* Scale the image values to the range [0-255]
<!-- * Crop out any border regions algorithmically -->
* Resize the training, validation, and test images to (224x224), (384x384), (512x512)
<!-- * Find the 90-degree rotation which is closest to the average of a sample of 1000 x-ray images -->
* Convert the array to type uint8 for compatibility with Image
* Save the processed image as a JPEG file

### Preprocessing steps for enhanced images

* Scale the image values to the range [0-255]
<!-- * Crop out any border regions algorithmically -->
* Resize the training, validation, and test images to (224x224), (384x384), (512x512)
<!-- * Find the 90-degree rotation which is closest to the average of a sample of 1000 x-ray images -->
* Sharpen the image using unsharp masking
* Equalize the histogram to increase contrast
* Convert the array to type uint8 for compatibility with Image
* Save the processed image as a JPEG file


In [7]:
%%time
# Preprocessing steps for the valid set: for every size in `dims`, write a
# plain resized JPEG and an enhanced ("_usm") JPEG for each source image.
output_df = valid_df.copy()

input_paths = output_df["source_file_path"]
# One Series of destination paths per entry in `dims`, in the same order.
output_paths_list = [output_df[f"base{dim}_file_path"] for dim in dims]
# Enhanced outputs reuse the base path with its last four characters replaced
# by "_usm.jpg" (presumably the ".jpg" extension -- confirm against the csv).
output_paths_list2 = [paths.str[:-4] + "_usm.jpg" for paths in output_paths_list]

for base_paths, usm_paths, dim in zip(output_paths_list, output_paths_list2, dims):
    for src_path, base_path, usm_path in zip(input_paths, base_paths, usm_paths):
        with Image.open(src_path) as source_img:
            # Decode the file that was just opened. Without this line the loop
            # raises NameError on a fresh kernel, or silently keeps re-resizing
            # an array left over from an earlier cell run.
            img_arr = np.array(source_img)
            img_arr = np.array(
                Image.fromarray(img_arr).resize((dim, dim), resample=Image.Resampling.BILINEAR)
            )

            # Plain variant: rescale to [scale_min, scale_max], store as 8-bit.
            base_arr = scale_range(img_arr, scale_min, scale_max).astype(np.uint8)
            Image.fromarray(base_arr).save(base_path, "JPEG", quality=90)

            # Enhanced variant: unsharp masking + histogram equalization.
            usm_arr = pipeline2(img_arr, weight, usm_sigma, he_sigma, scale_min, scale_max)
            Image.fromarray(usm_arr.astype(np.uint8)).save(usm_path, "JPEG", quality=90)

CPU times: total: 1min 22s
Wall time: 1min 31s


In [8]:
%%time
# Preprocessing steps for the test set: for every size in `dims`, write a
# plain resized JPEG and an enhanced ("_usm") JPEG for each source image.
output_df = test_df.copy()

input_paths = output_df["source_file_path"]
# One Series of destination paths per entry in `dims`, in the same order.
output_paths_list = [output_df[f"base{dim}_file_path"] for dim in dims]
# Enhanced outputs reuse the base path with its last four characters replaced
# by "_usm.jpg" (presumably the ".jpg" extension -- confirm against the csv).
output_paths_list2 = [paths.str[:-4] + "_usm.jpg" for paths in output_paths_list]

for base_paths, usm_paths, dim in zip(output_paths_list, output_paths_list2, dims):
    for src_path, base_path, usm_path in zip(input_paths, base_paths, usm_paths):
        with Image.open(src_path) as source_img:
            # Decode, then resize to a square of edge length `dim`.
            img_arr = np.array(source_img)
            img_arr = np.array(
                Image.fromarray(img_arr).resize((dim, dim), resample=Image.Resampling.BILINEAR)
            )

            # Plain variant: rescale to [scale_min, scale_max], store as 8-bit.
            base_arr = scale_range(img_arr, scale_min, scale_max).astype(np.uint8)
            Image.fromarray(base_arr).save(base_path, "JPEG", quality=90)

            # Enhanced variant: unsharp masking + histogram equalization.
            usm_arr = pipeline2(img_arr, weight, usm_sigma, he_sigma, scale_min, scale_max)
            Image.fromarray(usm_arr.astype(np.uint8)).save(usm_path, "JPEG", quality=90)

CPU times: total: 3min 26s
Wall time: 3min 50s


### (RUN THE BELOW CELL ONCE, MAY TAKE A FEW HOURS)

In [9]:
%%time
# Preprocessing steps for the train set: for every size in `dims`, write a
# plain resized JPEG and an enhanced ("_usm") JPEG for each source image.
output_df = train_df.copy()

input_paths = output_df["source_file_path"]
# One Series of destination paths per entry in `dims`, in the same order.
output_paths_list = [output_df[f"base{dim}_file_path"] for dim in dims]
# Enhanced outputs reuse the base path with its last four characters replaced
# by "_usm.jpg" (presumably the ".jpg" extension -- confirm against the csv).
output_paths_list2 = [paths.str[:-4] + "_usm.jpg" for paths in output_paths_list]

for base_paths, usm_paths, dim in zip(output_paths_list, output_paths_list2, dims):
    for src_path, base_path, usm_path in zip(input_paths, base_paths, usm_paths):
        with Image.open(src_path) as source_img:
            # Decode, then resize to a square of edge length `dim`.
            img_arr = np.array(source_img)
            img_arr = np.array(
                Image.fromarray(img_arr).resize((dim, dim), resample=Image.Resampling.BILINEAR)
            )

            # Plain variant: rescale to [scale_min, scale_max], store as 8-bit.
            base_arr = scale_range(img_arr, scale_min, scale_max).astype(np.uint8)
            Image.fromarray(base_arr).save(base_path, "JPEG", quality=90)

            # Enhanced variant: unsharp masking + histogram equalization.
            usm_arr = pipeline2(img_arr, weight, usm_sigma, he_sigma, scale_min, scale_max)
            Image.fromarray(usm_arr.astype(np.uint8)).save(usm_path, "JPEG", quality=90)

CPU times: total: 5h 52s
Wall time: 6h 27min 20s
