In [63]:
import pandas as pd
import numpy as np
#  import matplotlib.pyplot as plt
import pylab
import os
import pydicom
import random
import matplotlib.pyplot as plt
from glob import glob
from skimage.transform import resize
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold
from PIL import Image
from keras.applications.densenet import DenseNet121 as PTModel, preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import BatchNormalization, GlobalAveragePooling2D, Dense, Dropout, Input
# , AvgPool2D, Lambda, LocallyConnected2D, Conv2D, multiply, Flatten
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau\
    # , LearningRateScheduler
from keras.utils import Sequence
import keras_preprocessing.image as KPImage

# params we will probably want to do some hyperparameter optimization later

# ['InceptionV3', 'Xception', 'DenseNet169', 'VGG16']
BASE_MODEL = 'DenseNet121'
IMG_SIZE = (384, 384)  # [(224, 224), (384, 384), (512, 512), (640, 640)]
BATCH_SIZE = 24  # [1, 8, 16, 24]
DENSE_COUNT = 128  # [32, 64, 128, 256]
DROPOUT = 0.25  # [0, 0.25, 0.5]
LEARN_RATE = 1e-4  # [1e-4, 1e-3, 4e-3]
TRAIN_SAMPLES = 8000  # [3000, 6000, 15000]
TEST_SAMPLES = 800
USE_ATTN = False  # [True, False]

In [2]:
# Labels contains the target (1=pneumonia, 0=healthy) and bounding boxes
# if that patient has pneumonia
bbox_df = pd.read_csv('../input/stage_1_train_labels.csv')

# Detailed contains whether patient has lung opacity, image is not normal or
# they are healthy
det_class_df = pd.read_csv('../input/stage_1_detailed_class_info.csv')
det_class_df = det_class_df.groupby('patientId').head(1).reset_index()
# Join the two tables
comb_bbox_df = pd.merge(bbox_df, det_class_df, how='outer', on='patientId')
comb_bbox_df.head(3)

# Initialize paths to image directories
det_class_path = '../input/stage_1_detailed_class_info.csv'
bbox_path = '../input/stage_1_train_labels.csv'
dicom_dir = '../input/stage_1_train_images/'
path = '../input/stage_1_train_images/%s.dcm' % comb_bbox_df['patientId'][0]
dicom_header = pydicom.read_file(path, stop_before_pixels=True)

In [3]:
def get_header_info(patientId):
    """Function to apply to dataframe to insert all header info from DICOM
    image as separate columns

    :param patientId str: Unique ID
    :returns: A Pandas Series that will become a column
    """
    path = '../input/stage_1_train_images/%s.dcm' % patientId
    output = {'path': path}
    dicom_header = pydicom.read_file(path, stop_before_pixels=True)
    for value in dicom_header:
        output[value.name] = value.value
    return pd.Series(output)


# Extract all of the dicom header info as columns into a dataframe
#header_df = comb_bbox_df.apply(lambda x: get_header_info(x['patientId']), 1)
# Convert the age from string to int
#header_df['Patient\'s Age'] = header_df['Patient\'s Age'].map(int)

#header_df = header_df.groupby('Patient ID').head(1).reset_index()
# This contains all information from the header and from the label file
#image_full_df = pd.merge(header_df, comb_bbox_df,
#                         left_on='Patient ID', right_on='patientId')
# Columns are:
#   ['index_x', 'path', 'Specific Character Set', 'SOP Class UID',
#  'SOP Instance UID', 'Study Date', 'Study Time', 'Accession Number',
#  'Modality', 'Conversion Type', "Referring Physician's Name",
#  'Series Description', "Patient's Name", 'Patient ID',
#  "Patient's Birth Date", "Patient's Sex", "Patient's Age",
#  'Body Part Examined', 'View Position', 'Study Instance UID',
#  'Series Instance UID', 'Study ID', 'Series Number',
#  'Instance Number', 'Patient Orientation', 'Samples per Pixel',
#  'Photometric Interpretation', 'Rows', 'Columns', 'Pixel Spacing',
#  'Bits Allocated', 'Bits Stored', 'High Bit',
#  'Pixel Representation', 'Lossy Image Compression',
#  'Lossy Image Compression Method', 'patientId', 'x', 'y', 'width',
#  'height', 'Target', 'index_y', 'class']

#  Uncomment to show an example image
#  patientId = image_full_df['patientId'][0]
#  dcm_file = '../input/stage_1_train_images/%s.dcm' % patientId
#  dcm_data = pydicom.read_file(dcm_file)
#  im = dcm_data.pixel_array  # Numpy array containing uint8s, 1024 x 1024
#  pylab.imshow(im, cmap=pylab.cm.gist_gray)
#  pylab.axis('off')

In [64]:
# Labels contains the target (1=pneumonia, 0=healthy) and bounding boxes
# if that patient has pneumonia
bbox_df = pd.read_csv('../input/stage_1_train_labels.csv')

# Create new dataframe with a column bbox which contains the info about the bounding box
grouped_bbox_df = bbox_df
grouped_bbox_df['bbox'] = bbox_df[['x', 'y', 'width', 'height']].values.tolist()
grouped_bbox_df.sample(10)

Unnamed: 0,patientId,x,y,width,height,Target,bbox
12298,7dd3bda8-4887-4621-8d9d-4638f7e8aed8,,,,,0,"[nan, nan, nan, nan]"
5940,4af6cc2b-a2d6-42d2-9fa0-62e759d140ec,,,,,0,"[nan, nan, nan, nan]"
24147,d9245b95-1261-4dd6-88a8-2c9a69578b0e,659.0,322.0,163.0,263.0,1,"[659.0, 322.0, 163.0, 263.0]"
6721,50f99f53-c703-4bda-b7b6-abd6be693c7a,,,,,0,"[nan, nan, nan, nan]"
1726,1847b55c-2fed-494c-8d76-7f76630ba39f,,,,,0,"[nan, nan, nan, nan]"
11927,7adb12c0-b1b2-402e-9c58-5a8873eef61d,259.0,543.0,144.0,110.0,1,"[259.0, 543.0, 144.0, 110.0]"
22599,cc243a7f-1947-4a97-af61-f74948cd8bff,,,,,0,"[nan, nan, nan, nan]"
9726,69be961e-0fbb-480f-aa6a-5df88272b548,,,,,0,"[nan, nan, nan, nan]"
14632,904070bc-c210-47ef-b886-abb6b0dd7ee3,297.0,404.0,142.0,335.0,1,"[297.0, 404.0, 142.0, 335.0]"
4671,407bc983-cb9f-4559-9e82-8b559e466042,554.0,423.0,224.0,338.0,1,"[554.0, 423.0, 224.0, 338.0]"


In [65]:
raw_train_df, valid_df = train_test_split(bbox_df,
                                          test_size=0.25,
                                          random_state=2018,
                                          stratify=bbox_df['Target']
                                          )

# Used to generate a list of rows for the train and test sets, straitified so that they have
# the same number of positive and negative cases in the Target column to reduce bias
skf = StratifiedKFold(n_splits=2)  # two splits are training and validation
train_ids = []
valid_ids = []
# Extract patient id and target columns as lists
pid_vec = np.stack(grouped_bbox_df['patientId'].values)
target_vec = np.stack(grouped_bbox_df['Target'].values)

t_ids, v_ids = next(skf.split(pid_vec, target_vec))
train_ids, valid_ids = pid_vec[t_ids], pid_vec[v_ids]
train_tar, valid_tar = target_vec[t_ids], target_vec[v_ids]

# Percentage of patients in each dataset that are positive
print(sum(grouped_bbox_df['Target']) / len(grouped_bbox_df['Target']))
print(sum(train_tar) / len(train_tar))
print(sum(valid_tar) / len(valid_tar))

0.3092207389009624
0.30923140609907546
0.30921007243877197


In [90]:
class generator(Sequence):
    def __init__(self,
                 folder,
                 df,
                 patientIds,
                 batch_size=32,
                 image_size=256,
                 do_shuffle=True,
                 augment=False,
                 predict=False
                 ):
        self.folder = folder
        self.df = df
        self.ids = patientIds
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = do_shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()

    def __load__(self, pid):
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, str(pid) + ".dcm")).pixel_array
        # create empty mask
        msk = np.zeros(img.shape)
        # if image contains pneumonia
        boxes = self.df.loc[self.df['patientId'] == pid]['bbox']
        if boxes:
            # loop through pneumonia
            for box in boxes:
                # add 1's at the location of the pneumonia
                x, y, w, h = box
                msk[y:y + h, x:x + w] = 1
        # resize both image and mask
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.5
        # if augment then horizontal flip half the time
        if self.augment and random.random() > 0.5:
            img = np.fliplr(img)
            msk = np.fliplr(msk)
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)
        return img, msk

    def __loadpredict__(self, pid):
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, str(pid) + ".dcm")).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

    def __getitem__(self, index):
        # select batch
        id_batch = self.ids[index * self.batch_size:(index + 1) * self.batch_size]
        # predict mode: return images and filenames
        if self.predict:
            # load files
            imgs = [self.__loadpredict__(patientId) for patientId in id_batch]
            # create numpy batch
            imgs = np.array(imgs)
            return imgs, filenames
        # train mode: return images and masks
        else:
            # load files
            items = [self.__load__(patientId) for patientId in id_batch]
            # unzip images and masks
            imgs, msks = zip(*items)
            # create numpy batch
            imgs = np.array(imgs)
            msks = np.array(msks)
            return imgs, msks

    def on_epoch_end(self):
        #  if self.shuffle:
        #      random.shuffle(self.filenames)
        self.ids = shuffle(self.ids)  # sklearn shuffle

    def __len__(self):
        if self.predict:
            # return everything
            return int(np.ceil(len(self.ids) / self.batch_size))
        else:
            # return full batches only
            return int(len(self.df) / self.batch_size)

In [91]:
print(grouped_bbox_df.loc[grouped_bbox_df['patientId'] == '7dd3bda8-4887-4621-8d9d-4638f7e8aed8']['bbox'])
folder = '../input/stage_1_train_images'
train_gen = generator(folder,
                      grouped_bbox_df,
                      train_ids,
                      batch_size=32,
                      image_size=256,
                      do_shuffle=True,
                      augment=True,
                      predict=False
                      )
valid_gen = generator(folder,
                      grouped_bbox_df,
                      valid_ids,
                      batch_size=32,
                      image_size=256,
                      do_shuffle=True,
                      augment=True,
                      predict=False
                      )

# Get a sample output from the generator to analyze
train_src, train_tar = train_gen.__getitem__(2)
valid_src, valid_tar = next(valid_gen)

# Display the sample
print(train_src.shape, train_tar.shape)
fig, m_axs = plt.subplots(2, 4, figsize=(16, 8))
for (c_x, c_y, c_ax) in zip(train_src, train_tar, m_axs.flatten()):
    c_ax.imshow(c_x[:, :, 0], cmap='bone')
    title = ''
    if c_x == c_y:
        title = 'Healthy'
    else:
        title = 'Pneumonia'
    c_ax.set_title('%s' % title)
    c_ax.axis('off')

12298    [nan, nan, nan, nan]
Name: bbox, dtype: object


KeyError: 0