In [1]:
import glob
import os
import pickle
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator

Prepares PNEUMONIA Images as Inputs for Models.

In [None]:
# Separate bacteria and viral pneumonia
# NOTE: BPNEUMONIA_PATH and VPNEUMONIA_PATH come from the configuration cell,
# which has to be executed before this one.
source_dir = 'data/binary/pneumonia'

# shutil.move() renames the file to the destination path when that path is
# not an existing directory, so make sure both target folders exist first.
for dest_dir in (BPNEUMONIA_PATH, VPNEUMONIA_PATH):
    os.makedirs(dest_dir, exist_ok=True)

for file in os.listdir(source_dir):
    path = os.path.join(source_dir, file)
    if 'bacteria' in file:
        shutil.move(path, BPNEUMONIA_PATH)
    elif 'virus' in file:
        shutil.move(path, VPNEUMONIA_PATH)
    else:
        # Neither keyword in the file name: report it and leave it in place.
        print(file)

In [23]:
# Clip dataset to balance classes
def remove_extra(PATH, max_files=400):
    """Keep one image per person in ``PATH`` and cap the folder at ``max_files``.

    Regular files directly inside ``PATH`` are visited in sorted name order so
    the result is reproducible. The text before the first ``_`` of a file name
    is used as the person key. A file is deleted when its key has already been
    seen, or when ``max_files`` files have already been kept.

    Only kept files count towards the cap, so a folder with enough distinct
    keys ends up with exactly ``max_files`` files. Subdirectories are left
    untouched.

    Args:
        PATH: Directory whose files are deduplicated and clipped in place.
        max_files: Maximum number of files to keep.

    Returns:
        None. The directory is modified on disk.
    """
    seen_people = set()
    kept = 0
    for file in sorted(os.listdir(PATH)):
        file_path = os.path.join(PATH, file)
        if not os.path.isfile(file_path):
            # os.remove() cannot delete directories; skip anything that is
            # not a regular file.
            continue
        person = file.split('_')[0]
        # A single combined check guarantees each file is removed at most
        # once, even when it is both a duplicate and beyond the cap.
        if person in seen_people or kept >= max_files:
            os.remove(file_path)
            continue
        seen_people.add(person)
        kept += 1
# Clip the bacterial folder first, then the viral one.
for class_dir in (BPNEUMONIA_PATH, VPNEUMONIA_PATH):
    remove_extra(class_dir)

In [3]:
# Every image folder used by this notebook lives under this root.
_BINARY_ROOT = 'data/binary'

# Bacterial pneumonia: raw images, CLAHE-enhanced copies, final saved images.
BPNEUMONIA_PATH = _BINARY_ROOT + '/bpneumonia'
ENHANCED_BPNEUMONIA_PATH = _BINARY_ROOT + '/enhanced_bpneumonia'
FINAL_BPNEUMONIA_PATH = _BINARY_ROOT + '/final_bpneumonia'
# Sub-folder names inside the raw folder; also used as output file prefixes.
BPNEUMONIA_SOURCES = ['chest_bpneumonia']

# Viral pneumonia: same layout as the bacterial folders.
VPNEUMONIA_PATH = _BINARY_ROOT + '/vpneumonia'
ENHANCED_VPNEUMONIA_PATH = _BINARY_ROOT + '/enhanced_vpneumonia'
FINAL_VPNEUMONIA_PATH = _BINARY_ROOT + '/final_vpneumonia'
VPNEUMONIA_SOURCES = ['chest_vpneumonia']

# (height, width) passed as target_size when images are loaded.
IMAGE_SIZE = (224, 224)

In [4]:
# Make folders needed
def mkdir(PATH):
    """Create directory ``PATH`` if it does not already exist.

    Missing parent directories are created as well, so a fresh checkout
    without the parent data folder still works. Calling this on an existing
    directory is a no-op.

    Args:
        PATH: Directory path to create.

    Raises:
        FileExistsError: If ``PATH`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check and avoids the race
    # between checking and creating.
    os.makedirs(PATH, exist_ok=True)
# Create the raw, enhanced and final folders for both pneumonia classes.
for folder in (
    BPNEUMONIA_PATH,
    ENHANCED_BPNEUMONIA_PATH,
    FINAL_BPNEUMONIA_PATH,
    VPNEUMONIA_PATH,
    ENHANCED_VPNEUMONIA_PATH,
    FINAL_VPNEUMONIA_PATH,
):
    mkdir(folder)

CLAHE Enhancing Reference

https://www.kaggle.com/seriousran/image-pre-processing-for-chest-x-ray?fbclid=IwAR0xYBmOLyPju9lelFGg0tKekpOmuoTu5haveUQsWrwRTS69Rj_hhymN1XE

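The following process is repeated for both bacterial and viral pneumonia; the cells below show the viral run (swap the `VPNEUMONIA` constants for their `BPNEUMONIA` counterparts for the bacterial run).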

In [7]:
# Apply CLAHE (contrast-limited adaptive histogram equalization) to every
# image of each source and write the result to the enhanced folder.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(16, 16))
for source in VPNEUMONIA_SOURCES:
    # Create the output folder once per source rather than checking it for
    # every image.
    path = os.path.join(ENHANCED_VPNEUMONIA_PATH, source)
    os.makedirs(path, exist_ok=True)

    num = 0
    # glob returns files in arbitrary order; sorting makes the numbering of
    # the output files reproducible between runs.
    for file_path in sorted(glob.glob(os.path.join(VPNEUMONIA_PATH, source) + '/*.jpeg')):
        img = cv2.imread(file_path, 0)  # flag 0: read as single-channel grayscale
        if img is None:
            # cv2.imread signals an unreadable or corrupt file by returning
            # None instead of raising; skip it rather than crash in apply().
            print('Skipping unreadable image: {}'.format(file_path))
            continue
        img_clahe = clahe.apply(img)
        out_path = '{}/{}_{}.jpeg'.format(path, source, num)
        # cv2.imwrite reports failure through its return value only.
        if not cv2.imwrite(out_path, img_clahe):
            raise IOError('Could not write enhanced image: {}'.format(out_path))
        num += 1

ImageDataGenerator to shift, shear, zoom, horizontally flip, and rescale the images, and to resize them to the model input shape

In [10]:
# Augmentation pipeline: rescale pixels to [0, 1], then apply small random
# shifts, shear, zoom and horizontal flips.
#
# featurewise_center, featurewise_std_normalization and zca_whitening were
# removed: they only take effect after datagen.fit(data) has computed the
# dataset statistics, and fit() is never called in this notebook. They
# therefore did nothing except emit "hasn't been fit" warnings, and
# zca_whitening additionally overrides featurewise_std_normalization.
# Dropping them leaves the generated images unchanged.
datagen = ImageDataGenerator(rescale=1./255,
                             width_shift_range=0.05,
                             height_shift_range=0.05,
                             shear_range=0.1,
                             zoom_range=0.05,
                             channel_shift_range = 0,
                             horizontal_flip = True,
                             vertical_flip = False,
                             # No `subset` is passed to flow_from_directory
                             # below, so this split is not applied there.
                             validation_split = 0.2,
                             fill_mode='constant')

# batch_size is deliberately larger than the dataset so that the first batch
# contains every image; shuffle=False keeps the directory order.
processed_pneumonia = datagen.flow_from_directory(ENHANCED_VPNEUMONIA_PATH,
                                               target_size=IMAGE_SIZE,
                                               shuffle=False,
                                               batch_size=1000,
                                               class_mode="categorical")

Found 400 images belonging to 1 classes.


In [11]:
# Grabs the first batch of the dataset in matrix format.
# For the generator above this is expected to be X:(n_images, 224, 224, 3)
# and y:(n_images, n_classes) -- TODO confirm against X.shape / y.shape.
def batch_and_label(processed):
    """Return the first ``(X, y)`` batch yielded by ``processed``.

    Args:
        processed: Iterable that yields ``(x_batch, y_batch)`` pairs, such as
            the iterator returned by ``flow_from_directory``.

    Returns:
        Tuple ``(X, y)`` holding the first batch only; later batches are
        never requested.

    Raises:
        ValueError: If ``processed`` yields no batches at all.
    """
    for x_batches, y_batches in processed:
        # Only the first batch is wanted; the generator may be endless.
        return x_batches, y_batches
    raise ValueError('batch_and_label: the generator yielded no batches')
# Take the first batch from the generator; with batch_size=1000 this is
# presumably the whole dataset -- verify the image count stays below 1000.
X, y = batch_and_label(processed_pneumonia)



In [53]:
# Saves the final output images
for i, image in enumerate(X):
    plt.imsave(os.path.join(FINAL_VPNEUMONIA_PATH, 'vpneumonia_{}.jpeg'.format(i)), image)

# Save dataset
# Make sure the target folder exists, and use a context manager so the file
# handle is flushed and closed even if pickling fails.
os.makedirs('pickled_data', exist_ok=True)
with open("pickled_data/vpneumonia.pkl", "wb") as pkl_file:
    pickle.dump(X, pkl_file)

No PCA/t-SNE visualization is needed here, since all of these images come from the same data source.