In [1]:
import keras
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.models import Sequential

from keras.applications.densenet import DenseNet169
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
from keras.models import load_model
from keras.utils.vis_utils import plot_model

import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib
matplotlib.use("Agg") # set the matplotlib backend so figures can be saved in the background
 
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import multilabel_confusion_matrix

from time import time
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [2]:
pathFileTrain = "CheXpert-v1.0-small/train.csv"
pathFileValid = "CheXpert-v1.0-small/valid.csv"

# import train data from train.csv into panda struct
dtrain = pd.read_csv(pathFileTrain)
# fill in blanks with 0s
dtrain = dtrain.fillna(0)

# import valid data from valid.csv into panda struct
dvalid = pd.read_csv(pathFileValid)
# fill in blanks with 0s
dvalid = dvalid.fillna(0)

# combine train and valid dataset
dtrain = dtrain.append(dvalid)

# remove lateral images
dtrain = dtrain[~dtrain[dtrain.columns[3]].str.contains("Lateral")]

# drop the columns for sex, age, frontal/lateral, and AP/PA
dtrain = dtrain.drop(["Sex", "Age", "Frontal/Lateral", "AP/PA"], axis=1)

# convert uncertain to positive (-1 to 1)
dtrain = dtrain.replace(-1,1)

print(dtrain.shape)
dtrain.describe().transpose()

(191229, 15)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
No Finding,191229.0,0.088899,0.284598,0.0,0.0,0.0,0.0,1.0
Enlarged Cardiomediastinum,191229.0,0.10238,0.303148,0.0,0.0,0.0,0.0,1.0
Cardiomegaly,191229.0,0.157706,0.364466,0.0,0.0,0.0,0.0,1.0
Lung Opacity,191229.0,0.516444,0.499731,0.0,0.0,1.0,1.0,1.0
Lung Lesion,191229.0,0.042614,0.201985,0.0,0.0,0.0,0.0,1.0
Edema,191229.0,0.321787,0.467163,0.0,0.0,0.0,1.0,1.0
Consolidation,191229.0,0.195556,0.396629,0.0,0.0,0.0,0.0,1.0
Pneumonia,191229.0,0.108059,0.310456,0.0,0.0,0.0,0.0,1.0
Atelectasis,191229.0,0.311972,0.463299,0.0,0.0,0.0,1.0,1.0
Pneumothorax,191229.0,0.10672,0.308758,0.0,0.0,0.0,0.0,1.0


In [3]:
# Split data into train/dev/test 0.8/0.1/0.1

# Randomize the data
dtotal = dtrain.sample(frac=1)

# Split data into traing (80%), valid (10%) and test (10%)
dvalid_size = round(0.1*dtotal.shape[0])
dtest_size = dvalid_size
dtrain_size = dtotal.shape[0] - dvalid_size - dtest_size

x_train = dtotal[0:dtrain_size]
x_valid = dtotal[dtrain_size:dtrain_size+dvalid_size]
x_test = dtotal[dtrain_size+dvalid_size:dtotal.shape[0]+1]


print(dtotal.shape[0])

print(x_train.shape[0])
print(x_valid.shape[0])
print(x_test.shape[0])

191229
152983
19123
19123


In [4]:
# Data generation 
# https://keras.io/preprocessing/image/#imagedatagenerator-class
train_datagen = ImageDataGenerator(rescale = 1./255)
valid_datagen = ImageDataGenerator(rescale = 1./255.)
test_datagen = ImageDataGenerator(rescale = 1./255.)

target_size = (224, 224)

train_generator = train_datagen.flow_from_dataframe(
        dataframe = x_train,
        directory = None,
        x_col = "Path",
        y_col = list(x_train.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 32)

valid_generator = valid_datagen.flow_from_dataframe(
        dataframe = x_valid,
        directory = None,
        x_col = "Path",
        y_col = list(x_valid.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 32)

test_generator = test_datagen.flow_from_dataframe(
        dataframe = x_test,
        directory = None,
        x_col = "Path",
        y_col = list(x_test.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 1)

Found 152983 validated image filenames.
Found 19123 validated image filenames.
Found 19123 validated image filenames.


In [5]:
# Creating the model architecture
# Use the VGG19 model with pretrained weights from imagenet. Exclude the top layers.
model = DenseNet169(include_top = False, weights='imagenet')
layer1 = model.output
# adding a 2D pooling layer
layer1 = GlobalAveragePooling2D()(layer1)
# adding a fully connected layer
layer1 = Dense(1024, activation='relu')(layer1)
# adding a fully connected layer
layer1 = Dense(512, activation='relu')(layer1)
# adding a fully connected layer
layer1 = Dense(256, activation='relu')(layer1)
# adding a softmax layer
predictions = Dense(14, activation='sigmoid')(layer1)

Model_Dense169 = Model(inputs=model.input, outputs=predictions)











In [6]:
# Use Adam optimization
adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)

Model_Dense169.compile(optimizer= adam, loss='binary_crossentropy', metrics=['accuracy'])

print(Model_Dense169.summary())


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, None, None, 6 9408        zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, None, None, 6 256         conv1/conv[0][0]                 
______________

In [7]:
# Fitting data into the model architecture

STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size
FullModel = Model_Dense169.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=3)
Model_Dense169.save("Model_DenseNet169_FCs_frac=001_epoch=3_untrainablelayer.h5")

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
# Plotting the loss curve
plt.plot(FullModel.history['loss'])
plt.plot(FullModel.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()
plt.savefig("losscurve_attempt5")

In [13]:
model_Dense = load_model('Model_DenseNet169_FCs_frac=001_epoch=3_untrainablelayer.h5')
num_epochs = 3
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [14]:
# Assess performance
test_generator.reset()
pred = model_Dense.predict_generator(test_generator, steps=STEP_SIZE_TEST)
pred_bool = (pred >= 0.5)

y_pred = np.array(pred_bool,dtype =int)

dtest = x_test.to_numpy()
y_true = np.array(dtest[:,1:15],dtype=int)

print(classification_report(y_true, y_pred,target_names=list(x_train.columns[1:15])))

                            precision    recall  f1-score   support

                No Finding       0.09      0.05      0.06      1758
Enlarged Cardiomediastinum       0.00      0.00      0.00      1919
              Cardiomegaly       0.17      0.11      0.13      3058
              Lung Opacity       0.51      0.65      0.57      9865
               Lung Lesion       0.08      0.00      0.00       799
                     Edema       0.33      0.18      0.24      6115
             Consolidation       0.23      0.01      0.01      3710
                 Pneumonia       0.15      0.01      0.01      2142
               Atelectasis       0.30      0.18      0.23      5917
              Pneumothorax       0.12      0.05      0.07      1990
          Pleural Effusion       0.45      0.51      0.48      8605
             Pleural Other       0.00      0.00      0.00       431
                  Fracture       0.04      0.00      0.00       775
           Support Devices       0.56      0.58

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [16]:
# Generate score and accuracy
score, accuracy = model_Dense.evaluate_generator(test_generator, steps=STEP_SIZE_TEST)
print('Test score:', score)
print('Test accuracy:', accuracy)

KeyboardInterrupt: 