In [21]:
import os
import pandas as pd
import math
import numpy as np
import tensorflow as tf

from datetime import datetime
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Conv2D, Dense, Dropout, Flatten, MaxPooling2D
from keras import backend as K
from keras import regularizers
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from matplotlib.pyplot import imshow
from PIL import Image
from sklearn.metrics import roc_auc_score

# Sanity check: list the GPU devices the Keras TensorFlow backend can see
# (the saved output shows one device, GPU:0).
# NOTE(review): `_get_available_gpus` is an underscore-prefixed (private)
# backend helper — confirm it exists in the Keras version this notebook pins.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [2]:
# Shared image generator: multiplies pixel values by 1/255 and enables
# random horizontal flips.
datagen = ImageDataGenerator(horizontal_flip=True, rescale=1./255)

# Training labels: blank cells in the CSV stand for 0, so they are filled on
# load. Every image path gets a '../' prefix.
train_labels = (
    pd.read_csv("../CheXpert-v1.0-small/train.csv")
    .fillna(0)
    .assign(Path=lambda labels: '../' + labels["Path"])
)

# Validation labels: loaded as-is (no blank filling), same path prefix.
validation_labels = (
    pd.read_csv('../CheXpert-v1.0-small/valid.csv')
    .assign(Path=lambda labels: '../' + labels["Path"])
)

In [13]:
# Keep frontal views only; the plan is to train separate models for frontal
# and lateral images.
train_is_frontal = train_labels['Frontal/Lateral'].eq('Frontal')
validation_is_frontal = validation_labels['Frontal/Lateral'].eq('Frontal')

# Drop training rows whose Pleural Effusion label is uncertain (-1.0).
# The validation dataset contains no uncertain labels, so it needs no such filter.
train_effusion_is_certain = train_labels["Pleural Effusion"].ne(-1.0)

frontal_train_labels = train_labels.loc[train_is_frontal & train_effusion_is_certain]
frontal_validation_labels = validation_labels.loc[validation_is_frontal]

In [4]:
# Preview the first rows of the frontal training labels.
frontal_train_labels.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,../CheXpert-v1.0-small/train/patient00001/stud...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,../CheXpert-v1.0-small/train/patient00002/stud...,Female,87,Frontal,AP,0.0,0.0,-1.0,1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,1.0,0.0
2,../CheXpert-v1.0-small/train/patient00002/stud...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,../CheXpert-v1.0-small/train/patient00003/stud...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,../CheXpert-v1.0-small/train/patient00004/stud...,Female,20,Frontal,PA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first rows of the frontal validation labels.
frontal_validation_labels.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,../CheXpert-v1.0-small/valid/patient64541/stud...,Male,73,Frontal,AP,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,../CheXpert-v1.0-small/valid/patient64542/stud...,Male,70,Frontal,PA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,../CheXpert-v1.0-small/valid/patient64543/stud...,Male,85,Frontal,AP,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,../CheXpert-v1.0-small/valid/patient64544/stud...,Female,42,Frontal,AP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,../CheXpert-v1.0-small/valid/patient64545/stud...,Female,55,Frontal,AP,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [6]:
# Summary statistics for the numeric columns of the frontal training labels.
# NOTE(review): the saved output reports count 186596 and a Pleural Effusion
# minimum of -1.0, so it appears to predate the uncertain-label filter —
# re-run this cell after filtering to refresh it.
frontal_train_labels.describe()

Unnamed: 0,Age,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
count,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0,186596.0
mean,60.629365,0.090967,-0.005177,0.08751,0.504893,0.031474,0.200149,-0.060596,-0.05939,0.006972,0.080173,0.357596,0.003848,0.036276,0.558115
std,17.82153,0.287562,0.317897,0.386847,0.499977,0.203722,0.532046,0.433404,0.322723,0.553443,0.317387,0.56987,0.148507,0.20045,0.506179
min,0.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,74.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Summary statistics for the numeric columns of the frontal validation labels.
frontal_validation_labels.describe()

Unnamed: 0,Age,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
count,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0,202.0
mean,60.816832,0.128713,0.519802,0.326733,0.579208,0.00495,0.207921,0.158416,0.039604,0.371287,0.034653,0.316832,0.00495,0.0,0.490099
std,18.336303,0.335714,0.500849,0.470184,0.494913,0.07036,0.406828,0.366038,0.195511,0.484349,0.183355,0.466397,0.07036,0.0,0.501144
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,74.75,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [14]:
# Settings shared by the training and validation iterators, kept in one place
# so both feed the network identically shaped inputs: 100 x 100 grayscale
# images in batches of 32, with the raw 'Pleural Effusion' column as target.
_flow_kwargs = dict(directory=".",
                    x_col="Path",
                    y_col=['Pleural Effusion'],
                    class_mode="raw",
                    color_mode='grayscale',
                    target_size=(100, 100),
                    batch_size=32)

# Training batches come from the shared `datagen`, so they pick up whatever
# rescaling/augmentation that generator is configured with.
train_datagen = datagen.flow_from_dataframe(dataframe=frontal_train_labels,
                                            **_flow_kwargs)

# Validation batches must be deterministic: use a rescale-only generator
# (no random augmentation) and keep the dataframe order (shuffle=False).
# Otherwise val_loss is measured on randomly altered images and varies from
# run to run, which makes checkpoint selection on val_loss unreliable.
validation_datagen = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    dataframe=frontal_validation_labels,
    shuffle=False,
    **_flow_kwargs)

Found 181449 validated image filenames.
Found 202 validated image filenames.


In [23]:
# Custom Keras metric functions: recall, precision and F1

def recall_m(y_true, y_pred):
    """Recall of `y_pred` against `y_true`.

    Both tensors are clipped to [0, 1] and rounded, so predictions are
    effectively thresholded. The result is (true positives) / (actual
    positives), with K.epsilon() added to the denominator to avoid division
    by zero when there are no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of `y_pred` against `y_true`.

    The product and the predictions are clipped to [0, 1] and rounded, so
    predictions are effectively thresholded. The result is (true positives) /
    (predicted positives), with K.epsilon() added to the denominator to avoid
    division by zero when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())
    
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the result is 0 rather than
    NaN when precision and recall are both 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [27]:
def _add_conv_block(model, filters, kernel_size, **conv_kwargs):
    """Append Conv2D -> BatchNormalization -> ReLU -> Dropout(0.2) to `model`.

    Extra keyword arguments are forwarded unchanged to the Conv2D constructor.
    """
    model.add(Conv2D(filters, kernel_size, **conv_kwargs))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(.2))


classifier = Sequential()

# 100x100 grayscale input.
# The input is fairly large, so the first layer uses a larger (5x5) filter to
# shrink the feature maps (and hopefully speed up training).
# The "Input:" comments give the tensor shape entering the next layer.

# Input: 100 x 100 x 1
_add_conv_block(classifier, 32, (5, 5), input_shape=(100, 100, 1), use_bias=False)
# Input: 96 x 96 x 32
_add_conv_block(classifier, 64, (3, 3))
# Input: 94 x 94 x 64
classifier.add(MaxPooling2D(pool_size=(2,2)))
# Input: 47 x 47 x 64
_add_conv_block(classifier, 64, (3, 3))
# Input: 45 x 45 x 64
_add_conv_block(classifier, 64, (3, 3))
# Input: 43 x 43 x 64
classifier.add(MaxPooling2D(pool_size=(2,2)))
# Input: 21 x 21 x 64
_add_conv_block(classifier, 128, (3, 3))
# Input: 19 x 19 x 128
_add_conv_block(classifier, 128, (3, 3))
# Input: 17 x 17 x 128
classifier.add(MaxPooling2D(pool_size=(2,2)))
# Input: 8 x 8 x 128
classifier.add(Flatten())

# Input: 8192 features (8 * 8 * 128), fed to two 1024-unit dense layers and a
# single sigmoid output for the binary Pleural Effusion label.
classifier.add(Dense(activation="relu", units=1024))
classifier.add(Dense(activation="relu", units=1024))
classifier.add(Dense(activation="sigmoid", units=1))

classifier.compile(optimizer=Adam(learning_rate=0.002), loss='binary_crossentropy', metrics=[precision_m, recall_m, f1_m])

# Save weights only when val_loss improves.
# NOTE(review): checkpoints are written as trial_1/weights{epoch:04d}.h5, but
# the resume below loads trial_1/model0005.h5 — confirm which naming the
# earlier run used so the resume picks up the intended file.
mc = ModelCheckpoint('trial_1/weights{epoch:04d}.h5', save_weights_only=True, save_best_only=True, monitor='val_loss', mode='min')

# Resume from the epoch-5 weights; the trial_1/ directory must already exist.
classifier.load_weights('trial_1/model0005.h5')

# Too many epochs means overfitting, not enough epochs means underfitting.
classifier.fit_generator(
    train_datagen,
    # NOTE(review): the generator cell reports 181449 training images, i.e.
    # 5671 batches of 32, so 5280 steps stop short of a full pass over the
    # data — left unchanged here; confirm whether that is intentional.
    steps_per_epoch=5280,
    epochs=120,
    initial_epoch=5,  # continue epoch numbering after the loaded weights
    validation_data=validation_datagen,
    # One pass over the validation iterator per epoch. The validation set has
    # 202 images (7 batches of 32); the previous hard-coded 800 steps
    # re-evaluated the same images more than a hundred times every epoch.
    validation_steps=len(validation_datagen),
    workers=4,
    verbose=2,
    callbacks=[mc])

Epoch 6/120


KeyboardInterrupt: 

```
Epoch 1/120
 - 1824s - loss: 0.6324 - precision_m: 0.6168 - recall_m: 0.6097 - f1_m: 0.5961 - val_loss: 0.6089 - val_precision_m: 0.4197 - val_recall_m: 0.9118 - val_f1_m: 0.5645
Epoch 2/120
 - 1823s - loss: 0.5522 - precision_m: 0.6712 - recall_m: 0.6673 - f1_m: 0.6580 - val_loss: 0.7262 - val_precision_m: 0.4355 - val_recall_m: 0.9531 - val_f1_m: 0.5894
Epoch 3/120
 - 1822s - loss: 0.5300 - precision_m: 0.6879 - recall_m: 0.7028 - f1_m: 0.6855 - val_loss: 0.6404 - val_precision_m: 0.4670 - val_recall_m: 0.9524 - val_f1_m: 0.6174
Epoch 4/120
 - 1828s - loss: 0.5139 - precision_m: 0.6990 - recall_m: 0.7182 - f1_m: 0.6990 - val_loss: 0.6052 - val_precision_m: 0.4798 - val_recall_m: 0.9068 - val_f1_m: 0.6179
Epoch 5/120
 - 1830s - loss: 0.5070 - precision_m: 0.7055 - recall_m: 0.7209 - f1_m: 0.7037 - val_loss: 0.5689 - val_precision_m: 0.4781 - val_recall_m: 0.9096 - val_f1_m: 0.6170
 ```