# Offline image preprocessing
I conducted experiments with different preprocessing approaches. To speed up training, I applied some of the preprocessing methods offline, before training, and stored the preprocessed images in new dataset subdirectories.

In [36]:
import cv2
import glob
import os

from image_preprocessing import *
from utils import *

from tensorflow.keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# Constants

In [70]:
# Directory with the original MURA dataset
MURA_DIR = '../datasets/original/'

# Output directory for the CLAHE-preprocessed dataset (clipLimit=2)
CLAHE_2_DIR = '../datasets/clahe_2/'

# Output directory for the CLAHE-preprocessed dataset (clipLimit=10)
CLAHE_10_DIR = '../datasets/clahe_10/'

# CSV file with dataset information (train-valid-test split)
DATASET_PATH = '../datasets/tvt_detailed_paths.csv'

In [67]:
# Load the full dataset dataframe from the CSV at DATASET_PATH, as we will preprocess all images.
# NOTE(review): the two 'ALL' arguments presumably disable filtering (e.g. by study type
# and by train/valid/test split) -- confirm against the definition of get_dataframe.
df = get_dataframe('ALL', 'ALL', DATASET_PATH)

# Defining functions

In [69]:
def clone_dir_structure(src, dest):
    """
    Clones full directory structure inside source path to destination path, excluding files

    Missing destination directories (including ``dest`` itself and its parents) are
    created; directories that already exist are left untouched, so the call can be
    repeated safely. ``src`` and ``dest`` may be given with or without a trailing
    path separator.

    Parameters
    ----------
    src: str
        Directory structure source
    dest: str
        Directory structure destination

    """
    for dirpath, _dirnames, _filenames in os.walk(src):
        # Location of the current directory relative to src. Using relpath instead of
        # slicing off len(src) characters guarantees there is no leading separator;
        # a leading separator would make os.path.join discard dest entirely and
        # create the folder at the filesystem root.
        rel = os.path.relpath(dirpath, src)
        folder = dest if rel == os.curdir else os.path.join(dest, rel)
        # makedirs also creates missing parents and tolerates existing directories
        os.makedirs(folder, exist_ok=True)

            
def create_clahe_dataset(df, clip, src, dest):
    """
    Applies CLAHE method to all images provided in dataframe and saves them.

    Each image is read from ``src + filepath`` and the result is written to
    ``dest + filepath``. The folder of every output file is created if it does not
    exist yet, and read/write failures raise instead of being silently ignored.

    Parameters
    ----------
    df: pd.Dataframe
        Dataframe with one image per row, must have column "filepath" containing image filepath
    clip: float
        Clip limit used in the CLAHE method from opencv library
    src: str
        Source directory prefix for filepaths from dataframe, as they are not absolute.
        It is concatenated with the filepath as-is, so it must end with a path separator.
    dest: str
        Destination directory prefix for filepaths from dataframe, as they are not absolute.
        It is concatenated with the filepath as-is, so it must end with a path separator.

    Raises
    ------
    FileNotFoundError
        If an input image is missing or cannot be decoded
    OSError
        If a preprocessed image cannot be written
    """
    for filepath in df['filepath']:
        in_path = src + filepath
        out_path = dest + filepath

        # cv2.imread reports failure by returning None rather than raising
        img = cv2.imread(in_path)
        if img is None:
            raise FileNotFoundError(
                f"Could not read image '{in_path}' (file is missing or not a decodable image)"
            )

        clahe_img = clahe(img, clip=clip, tile=(8, 8))

        # cv2.imwrite does not create missing folders, so make sure the target exists
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # cv2.imwrite reports failure through its boolean return value
        if not cv2.imwrite(out_path, clahe_img):
            raise OSError(f"Could not write image '{out_path}'")

# Create CLAHE dataset
## clipLimit=2

In [64]:
# Mirror the folder tree of the original MURA dataset (folders only, no files)
clone_dir_structure(MURA_DIR, CLAHE_2_DIR)

# Write a CLAHE-preprocessed copy (clipLimit=2) of every image in the dataframe
create_clahe_dataset(df, 2, MURA_DIR, CLAHE_2_DIR)

# Verify that all images were successfully transferred: the number of validated
# filenames reported below should match the number of rows in the dataframe
gen = ImageDataGenerator()
gen.flow_from_dataframe(
    dataframe=df,
    directory=CLAHE_2_DIR,
    x_col='filepath',
    y_col='label',
    class_mode='binary',
)

Found 40005 validated image filenames belonging to 2 classes.


<keras.preprocessing.image.DataFrameIterator at 0x7f2ab2d5a100>

## clipLimit=10

In [72]:
# Clone original MURA dataset structure, excluding files.
# Run this before writing any image so the destination folders exist on a fresh
# kernel; repeating the call is harmless because existing folders are skipped.
clone_dir_structure(MURA_DIR, CLAHE_10_DIR)

# Create clahe preprocessed dataset with clipLimit=10
create_clahe_dataset(df, 10, MURA_DIR, CLAHE_10_DIR)

# Verify that all images were successfully transferred (there should be no non-existing paths in the dataframe)
gen = ImageDataGenerator()
gen.flow_from_dataframe(dataframe=df,
                        directory=CLAHE_10_DIR,
                        x_col='filepath',
                        y_col='label',
                        class_mode='binary',)

Found 40005 validated image filenames belonging to 2 classes.


<keras.preprocessing.image.DataFrameIterator at 0x7f2ab2d5b310>