In [0]:
### Establish the baseline model - treat the whole problem as regression on 20 values (max 4 bounding boxes)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
cd '/content/drive/My Drive/Colab Notebooks/dml-project'

/content/drive/My Drive/Colab Notebooks/dml-project


In [0]:

### Pre-processing

In [0]:
import pickle
import pandas as pd
import numpy as np

In [0]:
df_train = pickle.load(open('./dftrain.pickle','rb'))
df_test = pickle.load(open('./dftest.pickle','rb'))

In [8]:
df_train.head()

Unnamed: 0,patientId,num_bounding_boxes,PatientAge,PatientSex,ViewPosition,class,noopacity_but_not_normal,target,bounding_boxes
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,51,F,PA,No Lung Opacity / Not Normal,1,0,[]
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,48,F,PA,No Lung Opacity / Not Normal,1,0,[]
2,00322d4d-1c29-4943-afc9-b6754be640eb,0,19,M,AP,No Lung Opacity / Not Normal,1,0,[]
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,28,M,PA,Normal,0,0,[]
4,00436515-870c-4b36-a041-de91049b9ab4,2,32,F,AP,Lung Opacity,0,1,"[[152.0, 264.0, 379.0, 213.0], [152.0, 562.0, ..."


In [0]:
def parse_data(df):
    """
    Method to read a CSV file (Pandas dataframe) and parse the 
    data into the following nested dictionary:

      parsed = {
        
        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pnuemonia, 
            'boxes': list of box(es)
        },
        'patientId-01': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pnuemonia, 
            'boxes': list of box(es)
        }, ...

      }

    """
    # --- Define lambda to extract coords in list [y, x, height, width]
    extract_box = lambda row: [row['y'], row['x'], row['height'], row['width']]

    parsed = {}
    for n, row in df.iterrows():
        # --- Initialize patient entry into parsed 
        pid = row['patientId']
        if pid not in parsed:
            parsed[pid] = {
                'dicom': '../all/stage_1_train_images/%s.dcm' % pid,
                'label': row['Target'],
                'boxes': []}

        # --- Add box if opacity is present
        if parsed[pid]['label'] == 1:
            parsed[pid]['boxes'].append(extract_box(row))

    return parsed

In [0]:
def create_bounding_rows(df_train):
    
    """
    Takes each patientId and creates a row of combined bounding boxes and 
    also includes their confidence scores. All patientId's are 
    included in one matrix.
    This fuction is based on a max of 4 bounding boxes per patientId.
    Output: Numpy matrix of shape(len(df_train), 20) 
    """
    
    # read in the dataframe that will be parsed by the function parse_data(df)
    df_boxes = \
    pd.read_csv('./all/stage_1_train_labels.csv')

    
    # set the length depending on how many bounding boxes we want the model to output
    length = 20
    
    h = np.ones(20)
    k = np.zeros(20)

    # create an empty numpy matrix matching the size of the output matrix
    y = np.zeros((len(df_train),length))

    # run the function
    # this must be here because this must be run each time this script is run or
    # the resulting matrix will have errors.
    parsed = parse_data(df_boxes)


    for i in range(0,len(df_train)):

        # get the patientId
        patientId = df_train.loc[i, 'patientId']

        # extract the bounding boxes for a particular patient
        box = parsed[patientId]['boxes']
        if len(box) == 0:

            # the first row becomes a dummy row of ones this must be deleted later
            # k is an array of zeros
            h = np.vstack((h,k))

        if len(box) != 0:

            # insert 1 as the first entry in each bounding box
            # the 1 represents confidence for that bounding box
            a=[]
            for i in range(0,len(box)):
                box[i].insert(0,1)
                a = a + box[i]

            # calculate how much padding to add
            b = length - len(a)

            # pad the list because not all lists have 4 bounding boxes
            # we want all lists to have the same length
            for i in range(0,b):
                a.insert(len(a),0)

            # reshape to horizontal because the above code makes the list vertical
            a = np.array(a).reshape(1,length)
            
            # stack
            h = np.vstack((h,a))

    # delete the first row because we added this row just to make the code run
    h = np.delete(h, 0, axis=0)
    
    return h


# call the function
box_rows = create_bounding_rows(df_train)

In [0]:
# concat box_rows with df_y

# put box_rows in a dataframe
df_y = pd.DataFrame(box_rows)

# rename the columns in df_box_rows
new_names = new_names = ['conf_1', 'x_1', 'y_1', 'width_1', 'height_1',
           'conf_2', 'x_2', 'y_2', 'width_2', 'height_2',
           'conf_3', 'x_3', 'y_3', 'width_3', 'height_3',
           'conf_4', 'x_4', 'y_4', 'width_4', 'height_4']
df_y.columns = new_names

# Let's choose only the first three bounding boxes for each sample
df_y = df_y[['conf_1', 'x_1', 'y_1', 'width_1', 'height_1',
           'conf_2', 'x_2', 'y_2', 'width_2', 'height_2',
            'conf_3', 'x_3', 'y_3', 'width_3', 'height_3',
            'conf_4', 'x_4', 'y_4', 'width_4', 'height_4']]

# add the patientId column to df_y
df_y['patientId'] = df_train['patientId']

In [0]:
# shuffle df_y
from sklearn.utils import shuffle
df_y = shuffle(df_y)

In [13]:
from numpy.random import seed
seed(101)
from tensorflow import set_random_seed
set_random_seed(101)

import pandas as pd
import numpy as np
import math
!pip install pydicom
import pydicom
import pylab
import os
import pickle

from sklearn.model_selection import train_test_split
from skimage.transform import resize

import matplotlib.pyplot as plt
%matplotlib inline


# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

import gc; gc.enable()



In [0]:
df_train_images, df_val_images = train_test_split(df_y, test_size=0.20,
                                                   random_state=5)

In [0]:
df_train_images.reset_index(inplace=True)
df_val_images.reset_index(inplace=True)
df_train = df_train_images.drop(['index', 'patientId'], axis=1)
df_val = df_val_images.drop(['index','patientId'], axis=1)

In [0]:
def train_generator(df_train_images, df_train, batch_size, num_rows, num_cols):
    
    '''
    Input: Dataframes, df_train_images and df_train
    
    Outputs one batch (X_train, y_train) on each iteration of the for loop.
    
    X_train:
    Reads images from a folder, converts the images to a numpy array 
    with shape: (batch_size, num_rows, num_cols, 1)
    
    y_train:
    Takes data from a pandas dataframe. Converts the data into a numpy array
    with shape (batch_size, num_rows, num_cols, 1)
    
    '''
    
    
    while True: 

        batch = []
        k = 0


        # note that we are rounding down.
        num_batches = math.ceil(df_train_images.shape[0]/batch_size)

        # create an empty numpy array matching the number of images
        image_array = np.zeros((batch_size,num_rows,num_cols))



        # this loop runs only once each time the next() function is called.
        for i in range(0,num_batches): # 20547 rows in train_images. we are using only 20000 of them

            if i < num_batches-1:

                # [1] Create X_train

                # carve out 1000 rows of the 'patientId' column
                batch = list(df_train_images['patientId'][k:(i+1)*batch_size])

                #for patientId in batch:
                for j in range(0,len(batch)):
                    patientId = batch[j]


                    path = \
                './all/stage_1_train_images/%s.dcm' % patientId

                    dcm_data = pydicom.read_file(path)

                    # get the image as a numpy array
                    image = dcm_data.pixel_array

                    # resize the image
                    small_image = resize(image,(num_rows,num_cols))

                    # add the image to the empty numpy array
                    image_array[j,:,:] = small_image

                # reshape the array and normalize
                X_train = image_array.reshape(batch_size,num_rows,num_cols,1)/255

                # [2] Create y_train

                # note: Here we use df_train instead of df_train_images
                # because we don't want the output to have the patientId column.

                # carve out 1000 rows
                y_train = df_train[k:(i+1)*batch_size]

                # convert to a numpy array
                y_train = y_train.values

            # to cater for the last batch i.e. the fractional part
            if i == num_batches-1: 

                batch_size_fractional = df_train.shape[0] - (batch_size*(num_batches-1)) # -1

                # create an empty numpy array matching the number of images
                image_array = np.zeros((batch_size_fractional,num_rows,num_cols))

                # select rows from the tail of df_test upwards
                batch1 = list(df_train_images['patientId'][-batch_size_fractional:]) #1000

                #for patientId in batch:
                for j in range(0,len(batch1)):
                    patientId = batch1[j]

                    path = \
            './all/stage_1_train_images/%s.dcm' % patientId

                    dcm_data = pydicom.read_file(path)

                    # get the image as a numpy array
                    image = dcm_data.pixel_array

                    # resize the image
                    small_image = resize(image,(num_rows,num_cols))

                    # add the image to the empty numpy array
                    image_array[j,:,:] = small_image

                # reshape the array and normalize
                X_train = image_array.reshape(batch_size_fractional,num_rows,num_cols,1)/255

                # [2] Create y_train

                # note: Here we use df_val instead of df_val_images
                # because we don't want the output to have the patientId column.

                # carve out 1000 rows
                y_train = df_train[-batch_size_fractional:]

                # convert to a numpy array
                y_train = y_train.values
                
                print(y_train.shape)


            k = k + batch_size

            # For testing the generator so we can see how many batches it outputs
            # by calling next(). Uncomment the next line for testing.
            #print(i)

            # Keras requires a tuple in the form (inputs,targets)
            yield (X_train.astype(np.float32), {'output1': y_train[:,[0,5,10,15]], 'output2': y_train[:,[1,2,3,4,6,7,8,9,11,12,13,14,16,17,18,19]]})

In [0]:
def val_generator(df_val_images, df_val, batch_size, num_rows, num_cols):
    
    '''
    Input: Dataframes, df_val_images and df_val
    
    Outputs one batch (X_val, y_val) on each iteration of the for loop.
    
    X_val:
    Reads images from a folder, converts the images to a numpy array 
    with shape: (batch_size, num_rows, num_cols, 1)
    
    y_val:
    Takes data from a pandas dataframe. Converts the data into a numpy array
    with shape (batch_size, num_rows, num_cols, 1)
    
    '''
    
    
    while True: 

        batch = []
        k = 0

        # note that we are rounding up.
        num_batches = math.ceil(df_val_images.shape[0]/batch_size)

        # Create an empty numpy array that matches the batch size.
        image_array = np.zeros((batch_size,num_rows,num_cols))


         # this loop runs only once each time the next() function is called.
        for i in range(0,num_batches): 
            
            if i < num_batches-1:

                # [1] Create X_train

                # carve out a batch of rows of the 'patientId' column
                batch = list(df_val_images['patientId'][k:(i+1)*batch_size])

                #for patientId in batch:
                for j in range(0,len(batch)):
                    patientId = batch[j]

                    path = \
            './all/stage_1_train_images/%s.dcm' % patientId

                    dcm_data = pydicom.read_file(path)

                    # get the image as a numpy array
                    image = dcm_data.pixel_array

                    # resize the image
                    small_image = resize(image,(num_rows,num_cols))

                    # add the image to the empty numpy array
                    image_array[j,:,:] = small_image

                # reshape the array and normalize
                X_val = image_array.reshape(batch_size,num_rows,num_cols,1)/255

                # [2] Create y_train

                # note: Here we use df_val instead of df_val_images
                # because we don't want the output to have the patientId column.

                # carve out 1000 rows
                y_val = df_val[k:(i+1)*batch_size]

                # convert to a numpy array
                y_val = y_val.values

             # to cater for the last batch i.e. the fractional part
            if i == num_batches-1: 

                batch_size_fractional = df_val.shape[0] - (batch_size*(num_batches-1)) 

                # create an empty numpy array matching the number of images
                image_array = np.zeros((batch_size_fractional,num_rows,num_cols))

                # select rows from the tail of df_test upwards
                batch1 = list(df_val_images['patientId'][-batch_size_fractional:]) 

                #for patientId in batch:
                for j in range(0,len(batch1)):
                    patientId = batch1[j]

                    path = \
            './all/stage_1_train_images/%s.dcm' % patientId

                    dcm_data = pydicom.read_file(path)

                    # get the image as a numpy array
                    image = dcm_data.pixel_array

                    # resize the image
                    small_image = resize(image,(num_rows,num_cols))

                    # add the image to the empty numpy array
                    image_array[j,:,:] = small_image

                # reshape the array and normalize
                X_val = image_array.reshape(batch_size_fractional,num_rows,num_cols,1)/255

                # [2] Create y_train

                # note: Here we use df_val instead of df_val_images
                # because we don't want the output to have the patientId column.

                # carve out a batch of rows
                y_val = df_val[-batch_size_fractional:]

                # convert to a numpy array
                y_val = y_val.values


            k = k + batch_size

            # For testing the generator so we can see how many batches it outputs
            # by calling next().
            #print(i)

            # Keras requires a tuple in the form (inputs,targets)
            yield (X_val.astype(np.float32), {'output1': y_val[:,[0,5,10,15]], 'output2': y_val[:,[1,2,3,4,6,7,8,9,11,12,13,14,16,17,18,19]]})

In [0]:
def test_generator(df_test, batch_size, num_rows, num_cols):
    
    """
    Input: Dataframe df_test.
    
    Outputs one batch (X_test) on each iteration of the for loop.
    
    X_test:
    Reads images from a folder, converts the images to a numpy array 
    with shape: (batch_size, num_rows, num_cols, 1)
    
    """

    batch = []
    k = 0
    
    # note that we are rounding up.
    num_batches = math.ceil(df_test.shape[0]/batch_size)

    # create an empty numpy array matching the number of images
    image_array = np.zeros((batch_size,num_rows,num_cols))
    
    # this loop runs only once each time the next() function is called.
    for i in range(0,num_batches):
        
        if i < num_batches-1:
        
            # [1] Create X_test

            # carve out a batch of rows of the 'patientId' column
            batch = list(df_test['patientId'][k:(i+1)*batch_size]) #1000

            #for patientId in batch:
            for j in range(0,len(batch)):
                patientId = batch[j]

                path = \
        './all/stage_1_test_images/%s.dcm' % patientId

                dcm_data = pydicom.read_file(path)

                # get the image as a numpy array
                image = dcm_data.pixel_array

                # resize the image
                small_image = resize(image,(num_rows,num_cols))

                # add the image to the empty numpy array
                image_array[j,:,:] = small_image

            # reshape the array and normalize
            X_test = image_array.reshape(batch_size,num_rows,num_cols,1)/255
            
        # to cater for the last batch i.e. the fractional part
        if i == num_batches-1: 
            
            batch_size_fractional = df_test.shape[0] - (batch_size*(num_batches - 1))
            
            # create an empty numpy array matching the number of images
            image_array = np.zeros((batch_size_fractional,num_rows,num_cols))
            
            # select rows from the tail of df_test upwards
            batch = list(df_test['patientId'][-batch_size_fractional:]) #1000

            
            for j in range(0,len(batch)):
                patientId = batch[j]

                path = \
        './all/stage_1_test_images/%s.dcm' % patientId

                dcm_data = pydicom.read_file(path)

                # get the image as a numpy array
                image = dcm_data.pixel_array

                # resize the image
                small_image = resize(image,(num_rows,num_cols))

                # add the image to the empty numpy array
                image_array[j,:,:] = small_image

            # reshape the array and normalize
            X_test = image_array.reshape(batch_size_fractional,num_rows,num_cols,1)/255
            
        
        # For testing the generator so we can see how many batches it outputs
        # by calling next(). Uncomment the next line for testing.
        #print(i)
        
        k = k + batch_size
        
        # Keras requires a tuple in the form (inputs,targets)
        yield (X_test.astype(np.float32))
    

In [0]:
########################
# INPUTS

# Set the batch sizes:

train_batch_size = 128
val_batch_size = 128
test_batch_size = 100

# Set the image size:

num_rows = 128
num_cols = 128

#########################

# train_generator
train_gen = \
train_generator(df_train_images, df_train, train_batch_size, num_rows, num_cols)

num_train_samples = df_train.shape[0]

num_train_batches = math.ceil(num_train_samples/train_batch_size) # round down


# val_generator
val_gen = \
val_generator(df_val_images, df_val, val_batch_size, num_rows, num_cols)

num_val_samples = df_val.shape[0]

num_val_batches = math.ceil(num_val_samples/val_batch_size) # round down

# test_generator
test_gen = \
test_generator(df_test, test_batch_size, num_rows, num_cols)

num_test_samples = df_test.shape[0]

num_test_batches = math.ceil(num_test_samples/test_batch_size) # round up

In [20]:
# Model Architecture


from keras.models import Sequential, Model, Input
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dense, Dropout, Flatten, UpSampling2D,  Multiply, Lambda, merge, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [0]:
input_layer = Input(shape=(num_rows, num_cols, 1))

x = BatchNormalization(momentum=0.99)(input_layer)
x = Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)

#x = BatchNormalization(momentum=0.99)(x)
x = Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)

#x = UpSampling2D(2**2)(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)

out1 = Dense(4, activation='sigmoid', name='output1')(x)
out2 = Dense(16, activation='relu', name='output2')(x)
model = Model(inputs=input_layer, outputs=[out1,out2])

In [22]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 128, 128, 1)  0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 128, 128, 1)  4           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 126, 126, 32) 320         batch_normalization_1[0][0]      
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 63, 63, 32)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (

In [0]:
Adam_opt = Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-5)
model.compile(optimizer=Adam_opt, loss={'output1': 'binary_crossentropy', 'output2': 'mse'}, metrics=['accuracy'])

In [29]:
filepath = "model_3.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

history = model.fit_generator(generator=train_gen, 
                        steps_per_epoch=num_train_batches, 
                        epochs=3, 
                        verbose=1, 
                        callbacks=callbacks_list, 
                        validation_data=val_gen,
                        validation_steps=num_val_batches, 
                        class_weight=None, 
                        max_queue_size=10, 
                        workers=20,
                        use_multiprocessing=True, 
                        shuffle=False, 
                        initial_epoch=0)

Epoch 1/3


KeyboardInterrupt: ignored

In [24]:
df_test.head()

Unnamed: 0,patientId,PatientAge,PatientSex,ViewPosition
0,000924cf-0f8d-42bd-9158-1af53881a557,19,F,AP
1,000db696-cf54-4385-b10b-6b16fbb3f985,25,F,AP
2,000fe35a-2649-43d4-b027-e67796d412e0,40,M,AP
3,001031d9-f904-4a23-b3e5-2c088acd19c6,57,M,PA
4,0010f549-b242-4e94-87a8-57d79de215fc,56,M,PA


In [24]:
metric_gen = \
val_generator(df_val_images, df_val, 11, num_rows, num_cols)

model.load_weights(filepath = 'model_3.h5')
predictions = model.predict_generator(metric_gen, 
                                      steps=len(df_val_images)/11, 
                                      max_queue_size=5, 
                                      workers=20, 
                                      use_multiprocessing=True, 
                                      verbose=1)



In [0]:
first_outputs = predictions[0]
second_outputs = predictions[1]

In [0]:
predict = []
for i in range(len(first_outputs)):
  predict.append([np.concatenate((np.array([first_outputs[i][j]]),second_outputs[i][4*j:4*j+4])) for j in range(4)])
predict = np.array(predict)

In [0]:
predict = predict.reshape((predict.shape[0],20))

In [28]:
predict.shape

(5137, 20)

In [0]:
# put the predictions into a dataframe
df_preds = pd.DataFrame(predict)

# add column names
new_names = ['conf_1', 'x_1', 'y_1', 'width_1', 'height_1',
            'conf_2', 'x_2', 'y_2', 'width_2', 'height_2',
           'conf_3', 'x_3', 'y_3', 'width_3', 'height_3',
            'conf_4', 'x_4', 'y_4', 'width_4', 'height_4']

df_preds.columns = new_names

# add the patientId column
df_preds['patientId'] = df_val_images['patientId']

# add the PredictionString column
df_preds['PredictionString'] = 0

In [30]:
df_preds.iloc[88]

conf_1                                          0.355816
x_1                                              81.2097
y_1                                              107.409
width_1                                          90.6456
height_1                                         54.9941
conf_2                                         0.0983598
x_2                                               42.289
y_2                                              50.1411
width_2                                           74.537
height_2                                         42.5337
conf_3                                       2.66318e-34
x_3                                                    0
y_3                                                    0
width_3                                                0
height_3                                               0
conf_4                                                 0
x_4                                                    0
y_4                            

In [0]:
# Version 2: Changes were made. See comments below.

def process_preds(df):
    
    limit = 0.3
    
    conf_1 = 0
    conf_2 = 0
    conf_3 = 0
    conf_4 = 0
    
    string_1 = ''
    string_2 = ''
    string_3 = ''
    string_4 = ''
    
    
    for i in range(0,len(df)):
      

        #get the conf scores
        conf_1 = df.loc[i,'conf_1'] # revised in Version 2
        conf_2 = df.loc[i,'conf_2'] # revised in Version 2
        conf_3 = df.loc[i,'conf_3'] # revised in Version 2
        conf_4 = df.loc[i,'conf_4'] # revised in Version 2



        if conf_1 >= limit:
            string_1 = \
            str(conf_1) + ' ' + str(round(df.loc[i,'x_1']))+ ' ' + \
            str(round(df.loc[i,'y_1']))+ ' ' + str(round(df.loc[i,'width_1']))+ ' ' + str(round(df.loc[i,'height_1']))
        else:
            string_1 = ''

        if conf_2 >= limit:
            string_2 = \
            str(conf_2) + ' ' + str(round(df.loc[i,'x_2']))+ ' ' + \
            str(round(df.loc[i,'y_2']))+ ' ' + str(round(df.loc[i,'width_2']))+ ' ' + str(round(df.loc[i,'height_2']))
        else:
            string_2 = ''

        if conf_3 >= limit:
            string_3 = \
            str(conf_3) + ' ' + str(round(df.loc[i,'x_3']))+ ' ' + \
            str(round(df.loc[i,'y_3']))+ ' ' + str(round(df.loc[i,'width_3']))+ ' ' + str(round(df.loc[i,'height_3']))          
        else:
            string_3 = ''

        if conf_4 >= limit:
            string_4 = \
            str(conf_4) + ' ' + str(round(df.loc[i,'x_4']))+ ' ' + \
            str(round(df.loc[i,'y_4']))+ ' ' + str(round(df.loc[i,'width_4']))+ ' ' + str(round(df.loc[i,'height_4']))
        else:
            string_4 = ''

        df.loc[i,'PredictionString']  = \
        string_1 + ' ' + string_2 + ' ' + string_3 + ' ' + string_4

    df_submission = df[['patientId', 'PredictionString']]
    
    return df_submission

# call the function
df_submission = process_preds(df_preds)


In [0]:
ID = df_preds['patientId']
preds = df_preds['PredictionString']

submission = pd.DataFrame({'patientId':ID, 
                           'PredictionString':preds, 
                          }).set_index('patientId')

submission.to_csv('pneu_keras_model.csv', columns=['PredictionString']) 

In [33]:
df_submission.iloc[0]

patientId           06bf9151-5732-4968-b198-f4109676cd55
PredictionString       0.3558158 81.0 107.0 91.0 55.0   
Name: 0, dtype: object

In [0]:
df_pred_images = df_preds[['patientId','x_1','y_1','width_1','height_1',	'x_2',	'y_2',	'width_2', 'height_2',
                         'x_3',	'y_3',	'width_3',	'height_3',	'x_4',	'y_4',	'width_4',	'height_4']].sort_values(by='patientId')
df_val_images_true = df_val_images[['patientId','x_1','y_1','width_1','height_1',	'x_2', 	'y_2', 'width_2', 'height_2',
                           'x_3',	'y_3',	'width_3',	'height_3',	'x_4',	'y_4',	'width_4',	'height_4']].sort_values(by='patientId')

In [35]:
df_pred_images.head()

Unnamed: 0,patientId,x_1,y_1,width_1,height_1,x_2,y_2,width_2,height_2,x_3,y_3,width_3,height_3,x_4,y_4,width_4,height_4
563,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,17.199091,16.392096,7.363505,10.344476,2.534106,0.672866,1.448818,0.830888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2544,00c0b293-48e7-4e16-ac76-9269ba535a62,52.162369,68.631348,53.51321,35.008194,23.380991,20.433475,39.83041,21.041119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1683,014b7b58-f641-4477-8bbc-ae6f337745d6,57.567154,70.600601,52.594418,39.32365,24.522358,26.725451,41.160927,22.665382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1290,01a7353d-25bb-4ff8-916b-f50dd541dccf,47.121159,56.946941,42.210941,32.52829,19.734274,24.484505,33.980392,18.591492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1403,01c09fb1-a917-46ee-8d94-44f844a4eb85,51.734386,67.074409,58.48531,35.059753,28.569372,35.192287,50.349396,28.6731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
y_true = df_val_images_true.drop('patientId',axis=1).as_matrix()
y_pred = df_pred_images.drop('patientId',axis=1).as_matrix()

In [0]:
# IoU metric functions using tf.py_func, as suggested in  Marsh's (@vbookshelf) kernel:
#   https://www.kaggle.com/vbookshelf/keras-iou-metric-implemented-without-tensor-drama

METRIC_THRESH = 0.3


def raw_iou(y_true, y_pred):
    results = []
    y_pred = y_pred > METRIC_THRESH
    for i in range(0,y_true.shape[0]):
        intersect = np.sum( y_true[i,:] * y_pred[i,:] )
        union = np.sum( y_true[i,:] ) + np.sum( y_pred[i,:] ) - intersect + 1e-7
        iou = np.mean((intersect/union)).astype(np.float32)
        iou = min(iou, 1)
        results.append( iou )
    return np.mean( results )

def IoU(y_true, y_pred):
    iou = tf.py_func(raw_iou, [y_true, y_pred], tf.float32)
    return iou

In [212]:
raw_iou(y_true, y_pred)

0.21667043435705852

In [0]:
y_true = y_true.reshape((5137,4,4))
y_pred = y_pred.reshape((5137,4,4))

In [0]:
box_A = df_pred_images.iloc[88][['x_1','y_1','width_1','height_1']]
box_B = df_val_images_true.iloc[88][['x_1','y_1','width_1','height_1']]

In [0]:
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
    yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
    
    if xA == 0 and yA == 0 and xB == 0 and yB == 0:
        return 0

    # compute the area of intersection rectangle
    interArea = (xB - xA) * (yB - yA)
    #print(interArea)

    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[0]+boxA[2] - boxA[0]) * (boxA[1]+boxA[3] - boxA[1])
    boxBArea = (boxB[0]+boxB[2] - boxB[0]) * (boxB[1]+boxB[3] - boxB[1])
    
    if boxAArea + boxBArea - interArea < interArea:
      return 0

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea + 1)
    #print(iou)


    # return the intersection over union value
    return iou

In [0]:
import itertools

In [0]:
results = []
for k in range(len(y_true)):
  for i in itertools.product(y_true[k], y_pred[k], range(1)):
    results.append(bb_intersection_over_union(i[0],i[1]))

In [80]:
bb_intersection_over_union(y_true[0][1], y_pred[0][1])

0

In [57]:
y_pred[0][0]

array([17.199091 , 16.392096 ,  7.3635054, 10.344476 ], dtype=float32)

In [83]:
bb_intersection_over_union([571.4, 275.1, 230.8, 476.2], [571.4, 275.1, 230.8, 476.2])

0.9999909014779276

In [85]:
results[:100]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [86]:
np.mean(results)

0.010423116708797739

In [11]:
y_true[0]

NameError: ignored