In [3]:
import keras
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.models import Sequential

from keras.applications import ResNet50
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
from keras.models import load_model
from keras.utils.vis_utils import plot_model

import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib
matplotlib.use("Agg") # set the matplotlib backend so figures can be saved in the background
 
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [4]:
pathFileTrain = "CheXpert-v1.0-small/train.csv"
pathFileValid = "CheXpert-v1.0-small/valid.csv"

# import train data from train.csv into panda struct
dtrain = pd.read_csv(pathFileTrain)
# fill in blanks with 0s
dtrain = dtrain.fillna(0)

# import valid data from valid.csv into panda struct
dvalid = pd.read_csv(pathFileValid)
# fill in blanks with 0s
dvalid = dvalid.fillna(0)

# combine train and valid dataset
dtrain = dtrain.append(dvalid)

# remove lateral images
dtrain = dtrain[~dtrain[dtrain.columns[3]].str.contains("Lateral")]

# drop the columns for sex, age, frontal/lateral, and AP/PA
dtrain = dtrain.drop(["Sex", "Age", "Frontal/Lateral", "AP/PA"], axis=1)

# convert uncertain to positive (-1 to 1)
dtrain = dtrain.replace(-1,1)

print(dtrain.shape)
dtrain.describe().transpose()

(191229, 15)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
No Finding,191229.0,0.088899,0.284598,0.0,0.0,0.0,0.0,1.0
Enlarged Cardiomediastinum,191229.0,0.10238,0.303148,0.0,0.0,0.0,0.0,1.0
Cardiomegaly,191229.0,0.157706,0.364466,0.0,0.0,0.0,0.0,1.0
Lung Opacity,191229.0,0.516444,0.499731,0.0,0.0,1.0,1.0,1.0
Lung Lesion,191229.0,0.042614,0.201985,0.0,0.0,0.0,0.0,1.0
Edema,191229.0,0.321787,0.467163,0.0,0.0,0.0,1.0,1.0
Consolidation,191229.0,0.195556,0.396629,0.0,0.0,0.0,0.0,1.0
Pneumonia,191229.0,0.108059,0.310456,0.0,0.0,0.0,0.0,1.0
Atelectasis,191229.0,0.311972,0.463299,0.0,0.0,0.0,1.0,1.0
Pneumothorax,191229.0,0.10672,0.308758,0.0,0.0,0.0,0.0,1.0


In [5]:
# Split data into train/dev/test 0.8/0.1/0.1

# Randomize the data
dtotal = dtrain.sample(frac=1)

# Split data into traing (80%), valid (10%) and test (10%)
dvalid_size = round(0.1*dtotal.shape[0])
dtest_size = dvalid_size
dtrain_size = dtotal.shape[0] - dvalid_size - dtest_size

x_train = dtotal[0:dtrain_size]
x_valid = dtotal[dtrain_size:dtrain_size+dvalid_size]
x_test = dtotal[dtrain_size+dvalid_size:dtotal.shape[0]+1]


print("The total number of data is:", dtotal.shape[0])

print("The size of training set is:", x_train.shape[0])
print("The size of the dev set is:", x_valid.shape[0])
print("The size of the test set is:", x_test.shape[0])

The total number of data is: 191229
The size of training set is: 152983
The size of the dev set is: 19123
The size of the test set is: 19123


In [4]:
# Data generation 
# https://keras.io/preprocessing/image/#imagedatagenerator-class
train_datagen = ImageDataGenerator(rescale = 1./255)
valid_datagen = ImageDataGenerator(rescale = 1./255.)
test_datagen = ImageDataGenerator(rescale = 1./255.)

target_size = (224, 224)

train_generator = train_datagen.flow_from_dataframe(
        dataframe = x_train,
        directory = None,
        x_col = "Path",
        y_col = list(x_train.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 32)

valid_generator = valid_datagen.flow_from_dataframe(
        dataframe = x_valid,
        directory = None,
        x_col = "Path",
        y_col = list(x_valid.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 32)

test_generator = test_datagen.flow_from_dataframe(
        dataframe = x_test,
        directory = None,
        x_col = "Path",
        y_col = list(x_test.columns[1:15]),
        class_mode = "other",
        target_size = target_size,
        batch_size = 1)

Found 152983 validated image filenames.
Found 19123 validated image filenames.
Found 19123 validated image filenames.


In [5]:
# Creating the model architecture
# Use the ResNet50 model with pretrained weights from imagenet. Exclude the top layers.
model = ResNet50(include_top = False, weights='imagenet')
layer = model.output
# adding a 2D pooling layer
layer = GlobalAveragePooling2D()(layer)
# adding a fully connected layer
layer = Dense(1024, activation='relu')(layer)
# adding a fully connected layer
layer = Dense(512, activation='relu')(layer)
# adding a fully connected layer
layer = Dense(256, activation='relu')(layer)
# adding a softmax layer
predictions = Dense(14, activation='sigmoid')(layer)

Model_ResNet50 = Model(inputs=model.input, outputs=predictions)














In [6]:
# Use Adam optimization
adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
Model_ResNet50.compile(optimizer= adam, loss='binary_crossentropy', metrics=['accuracy'])

print(Model_ResNet50.summary())


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
______________

In [7]:
# Fitting data into the model architecture

num_epochs = 3

STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size
Model_ResNet50.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=num_epochs)

# Save the model!
Model_ResNet50.save("Model_ResNet50_FCs_frac=1_epoch=3_lr=00001.h5")

Epoch 1/3
Epoch 2/3
Epoch 3/3
 278/4780 [>.............................] - ETA: 1:06:57 - loss: 0.3264 - acc: 0.8587

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [8]:
model_Res = load_model('Model_ResNet50_FCs_frac=1_epoch=3_lr=00001.h5')
num_epochs = 3
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [10]:
# Assess performance
test_generator.reset()
pred = model_Res.predict_generator(test_generator, steps=STEP_SIZE_TEST)
pred_bool = (pred >= 0.5)

y_pred = np.array(pred_bool,dtype =int)

dtest = x_test.to_numpy()
y_true = np.array(dtest[:,1:15],dtype=int)

print(classification_report(y_true, y_pred,target_names=list(x_train.columns[1:15])))

                            precision    recall  f1-score   support

                No Finding       0.10      0.03      0.04      1668
Enlarged Cardiomediastinum       0.00      0.00      0.00      2032
              Cardiomegaly       0.15      0.03      0.05      2983
              Lung Opacity       0.52      0.68      0.59      9963
               Lung Lesion       0.00      0.00      0.00       794
                     Edema       0.33      0.33      0.33      6270
             Consolidation       1.00      0.00      0.00      3719
                 Pneumonia       0.10      0.01      0.01      2127
               Atelectasis       0.30      0.27      0.29      5925
              Pneumothorax       0.09      0.04      0.06      2034
          Pleural Effusion       0.46      0.57      0.50      8694
             Pleural Other       0.00      0.00      0.00       424
                  Fracture       0.01      0.00      0.00       766
           Support Devices       0.57      0.50

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [11]:
# Generate score and accuracy
score, accuracy = model_Res.evaluate_generator(test_generator, steps=STEP_SIZE_TEST)
print('Test score:', score)
print('Test accuracy:', accuracy)

Test score: 0.35800832069711674
Test accuracy: 0.8438604167223612
