In [155]:
# Directory holding the MIMIC-CXR-JPG 2.0.0 CSV tables (relative to the working directory).
CSV_PATH = 'physionet.org/files/mimic-cxr-jpg/2.0.0'
# Root of the image tree; mimic_data_generator builds image paths below it.
DATASET_PATH = 'physionet.org/files/mimic-cxr-jpg/2.0.0/files'
# Folder scanned by mimic_data_generator_from_folder during training.
# NOTE(review): absolute, machine-specific path -- consider deriving it from DATASET_PATH.
FOLDER_TO_TRAIN = '/hely/mimic-cxr-classification/tmp/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p11/'

In [91]:
import os
import random
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import numpy as np
import imutils
# keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=True, weights='imagenet', input_tensor=None, input_shape=None, pooling=None, classes=1000)

In [92]:
# Locations of the four gzip-compressed CSV tables, all resolved against CSV_PATH.
CHEXPERT_DATA, METADATA_DATA, SPLIT_DATA, NEGBIO_DATA = (
    os.path.join(CSV_PATH, csv_name)
    for csv_name in (
        'mimic-cxr-2.0.0-chexpert.csv.gz',
        'mimic-cxr-2.0.0-metadata.csv.gz',
        'mimic-cxr-2.0.0-split.csv.gz',
        'mimic-cxr-2.0.0-negbio.csv.gz',
    )
)

In [93]:
# Load the four CSV tables into DataFrames (pandas infers gzip from the .gz suffix).
dfCHE = pd.read_csv(CHEXPERT_DATA)    # CheXpert label table
dfMeta = pd.read_csv(METADATA_DATA)   # per-image metadata; read for subject_id / study_id / dicom_id
dfSplit = pd.read_csv(SPLIT_DATA)     # train / validate / test assignment ('split' column)
dfNegBio = pd.read_csv(NEGBIO_DATA)   # NegBio label table; used as the training target below



# dfNegBio['No Finding'] = dfNegBio['No Finding'].fillna(0)
# dfNegBio['No Finding'].unique()

In [94]:
# Replace every missing value with 0 so each label can later be passed through int(abs(...)).
# NOTE(review): this rebinds dfNegBio; the cell is idempotent but the raw table is no longer available.
dfNegBio = dfNegBio.fillna(0)

In [95]:
# Distinct split names; these are the values mimic_data_generator accepts as generator_type.
dfSplit['split'].unique()

array(['train', 'validate', 'test'], dtype=object)

In [96]:
# Number of label columns (everything after the first two columns); this count sizes the model output layer.
len(dfNegBio.columns[2:])

14

In [97]:
# Same count for the CheXpert table, to confirm both label tables have the same number of label columns.
len(dfCHE.columns[2:])

14

In [98]:
def mimic_data_generator(dfMeta, dfNegBio, dfSplit, generator_type, batch_size):
    """Endlessly yield ``(images, labels)`` batches for one dataset split.

    For every batch, ``batch_size`` rows are drawn at random (with
    replacement) from the rows of ``dfSplit`` whose ``split`` column equals
    ``generator_type``.  For each drawn study, every ``dfMeta`` row with the
    same ``subject_id`` / ``study_id`` contributes one image, read from
    ``DATASET_PATH/p<first two chars of id>/p<subject_id>/s<study_id>/<dicom_id>.jpg``
    and resized to 224x224.  Because a study may own several images, a batch
    can contain more (or, when files are unreadable, fewer) than
    ``batch_size`` samples.

    An image and its label vector are appended together, only after both
    were obtained, so the two arrays always stay aligned.  Unreadable images
    and studies without a usable label row are skipped instead of being
    replaced by copies of the first sample.

    Args:
        dfMeta: DataFrame with ``subject_id``, ``study_id`` and ``dicom_id``.
        dfNegBio: DataFrame with ``subject_id`` and ``study_id``; every column
            from the third onwards is treated as a label.
        dfSplit: DataFrame with ``subject_id``, ``study_id`` and ``split``.
        generator_type: value of ``split`` to sample from.
        batch_size: number of studies drawn per batch (at least 1).

    Yields:
        tuple: ``(np.array(images), np.array(labels))``.  Pixel values are
        left exactly as returned by ``cv2`` (not rescaled); every label is
        ``int(abs(value))`` of the stored value.

    Raises:
        ValueError: if ``batch_size`` is below 1 or the split has no rows.
        RuntimeError: if several consecutive draws yield no usable sample.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got %r' % (batch_size,))

    # The split filter does not change between batches, so compute it once.
    split_rows = dfSplit[dfSplit['split'] == generator_type]
    row_count = split_rows.shape[0]
    if row_count == 0:
        raise ValueError('dfSplit has no rows for split %r' % (generator_type,))

    max_empty_draws = 10  # give up instead of spinning forever on a broken dataset path
    empty_draws = 0

    while True:
        drawn_rows = [split_rows.iloc[random.randint(0, row_count - 1)]
                      for _ in range(batch_size)]

        images = []
        labels = []
        for drawn in drawn_rows:
            subject_id = drawn['subject_id']
            study_id = drawn['study_id']

            # One combined mask per table avoids chained boolean indexing,
            # which makes pandas re-align the second mask and emit a warning.
            study_images = dfMeta[(dfMeta['subject_id'] == subject_id)
                                  & (dfMeta['study_id'] == study_id)]
            study_labels = dfNegBio[(dfNegBio['subject_id'] == subject_id)
                                    & (dfNegBio['study_id'] == study_id)]
            if study_images.empty or study_labels.empty:
                continue

            try:
                label_vector = [int(abs(value))
                                for value in study_labels.iloc[0, 2:].values]
            except (TypeError, ValueError):
                # e.g. a missing value that was not filled beforehand
                continue

            study_dir = os.path.join(DATASET_PATH,
                                     'p' + str(subject_id)[:2],
                                     'p' + str(subject_id),
                                     's' + str(study_id))
            for dicom_id in study_images['dicom_id']:
                image = cv2.imread(os.path.join(study_dir, str(dicom_id) + '.jpg'))
                if image is None:
                    # cv2.imread signals a missing/unreadable file with None
                    continue
                images.append(cv2.resize(image, (224, 224)))
                labels.append(label_vector)

        if not images:
            empty_draws += 1
            if empty_draws >= max_empty_draws:
                raise RuntimeError(
                    'no usable image found in %d consecutive draws for split %r'
                    % (max_empty_draws, generator_type))
            continue
        empty_draws = 0

        yield (np.array(images), np.array(labels))

In [163]:
import random
def mimic_data_generator_from_folder(dfNegBio, batch_size, folderpath):
    """Endlessly yield ``(images, labels)`` batches read from a folder tree.

    The tree is expected to look like
    ``folderpath/p<subject_id>/s<study_id>/<image files>``.  For every batch
    the subject folders (and, per subject, the study folders) are visited in
    a freshly shuffled order until ``batch_size`` images were collected.
    Each image is resized to 224x224 and divided by 255; its label vector is
    taken from the ``dfNegBio`` row with the matching ``subject_id`` and
    ``study_id``.

    Entries that cannot contribute a sample are skipped individually rather
    than ending the whole batch: non-directory entries such as ``index.html``
    (at any level), folder names that do not parse as ids, studies without a
    label row, and image files ``cv2`` cannot read.

    Args:
        dfNegBio: DataFrame with ``subject_id`` and ``study_id``; every column
            from the third onwards is treated as a label.
        batch_size: maximum number of images per batch (at least 1).
        folderpath: directory containing the ``p<subject_id>`` folders.

    Yields:
        tuple: ``(np.array(images), np.array(labels))`` with up to
        ``batch_size`` aligned samples; every label is ``int(abs(value))`` of
        the stored value.

    Raises:
        ValueError: if ``batch_size`` is below 1.
        RuntimeError: if a full scan of ``folderpath`` yields no usable image.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got %r' % (batch_size,))

    while True:
        images = []
        labels = []
        subject_dirs = os.listdir(folderpath)
        random.shuffle(subject_dirs)

        for subject_dir in subject_dirs:
            if len(images) >= batch_size:
                break

            studies_path = os.path.join(folderpath, subject_dir)
            if not os.path.isdir(studies_path):
                # e.g. an index.html sitting next to the subject folders
                continue
            try:
                subject_id = int(subject_dir.replace('p', ''))
                study_dirs = os.listdir(studies_path)
            except (ValueError, OSError):
                continue
            random.shuffle(study_dirs)

            # Narrow the label table once per subject instead of once per study.
            subject_labels = dfNegBio[dfNegBio['subject_id'] == subject_id]

            for study_dir in study_dirs:
                if len(images) >= batch_size:
                    break

                imagefolder = os.path.join(studies_path, study_dir)
                if not os.path.isdir(imagefolder):
                    continue
                try:
                    study_id = int(study_dir.replace('s', ''))
                    imagenames = os.listdir(imagefolder)
                except (ValueError, OSError):
                    continue

                study_labels = subject_labels[subject_labels['study_id'] == study_id]
                if study_labels.empty:
                    continue
                try:
                    label_vector = [int(abs(value))
                                    for value in study_labels.iloc[0, 2:].values]
                except (TypeError, ValueError):
                    # e.g. a missing value that was not filled beforehand
                    continue

                for imagename in imagenames:
                    if len(images) >= batch_size:
                        break
                    if imagename == 'index.html':
                        continue

                    image = cv2.imread(os.path.join(imagefolder, imagename))
                    if image is None:
                        # cv2.imread signals a missing/unreadable file with None
                        continue

                    # Image and label are appended together so they stay aligned.
                    images.append(cv2.resize(image, (224, 224)) / 255)
                    labels.append(label_vector)

        if not images:
            # A complete scan found nothing; rescanning the same tree cannot help.
            raise RuntimeError('no usable labelled image found below %r' % (folderpath,))

        yield (np.array(images), np.array(labels))

In [164]:
# Calling a generator function only creates the generator object (see the repr below);
# no folder is scanned and no image is read until next() is called on it.
mimic_data_generator_from_folder(dfNegBio, 1, FOLDER_TO_TRAIN)

<generator object mimic_data_generator_from_folder at 0x7f5a8b99b830>

In [145]:
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K


In [146]:

# Create the pre-trained base model: InceptionV3 with ImageNet weights and
# without its original classification head (include_top=False).
base_model = InceptionV3(weights='imagenet', include_top=False)


In [147]:
# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and the output layer: one sigmoid unit per label column of dfNegBio
# (every column after the first two), i.e. independent per-label outputs
predictions = Dense(len(dfNegBio.columns[2:]), activation='sigmoid')(x)


In [148]:

# This is the model we will train: the InceptionV3 input wired through the
# pooling / dense head defined above to the sigmoid predictions.
model = Model(inputs=base_model.input, outputs=predictions)


In [149]:
# model.summary()

In [150]:
# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
# (the layers added on top of base_model are not in base_model.layers and stay trainable)
for layer in base_model.layers:
    layer.trainable = False


In [151]:
# Print the layer-by-layer architecture of the assembled model.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv2d_95 (Conv2D)              (None, None, None, 3 864         input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_95 (BatchNo (None, None, None, 3 96          conv2d_95[0][0]                  
__________________________________________________________________________________________________
activation_95 (Activation)      (None, None, None, 3 0           batch_normalization_95[0][0]     
____________________________________________________________________________________________

In [152]:
# from keras.models import load_model
 
# # load model
# model = load_model('resnet_backup.h5')

In [153]:

# model.add(Dense(128, activation='relu'))
# model.add(Dense(8, activation='sigmoid'))   # Final Layer using Softmax

# Compile after the base layers were marked non-trainable so the freeze takes effect.
# binary_crossentropy scores each sigmoid output independently of the others.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # compile the model (should be done *after* setting layers to non-trainable)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')


In [167]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.callbacks import LearningRateScheduler
# Training settings. Only batch_size is referenced by the cells visible in this notebook.
batch_size = 32
# NOTE(review): the fit call below passes epochs=20 literally; num_epochs is not used there.
num_epochs = 100
# NOTE(review): not referenced in the visible cells; the generators resize to (224, 224) themselves.
input_shape = (224, 224, 3)
# NOTE(review): not referenced in the visible cells.
validation_split = .2
# NOTE(review): not referenced in the visible cells; the callbacks pass verbose=1 literally.
verbose = 1
# NOTE(review): not referenced in the visible cells; EarlyStopping below uses patience=20.
patience = 50

def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    The rate is 0.001 for epochs below 10; from epoch 10 onwards it is
    0.001 multiplied by 0.1 once for every completed block of ten epochs.
    """
    initial_rate = 0.001
    if not epoch < 10:
        completed_decades = int(epoch/10)
        return initial_rate * (0.1 ** completed_decades)
    return initial_rate
# Write the full model (not just weights) to disk every 2 epochs; save_best_only=False
# means the monitored value does not decide whether a save happens.
# NOTE(review): the file is named 'resnet_backup.h5' although the model is built on InceptionV3.
# NOTE(review): whether the logged key is 'val_acc' or 'val_accuracy' depends on the Keras version -- confirm.
checkpoint = ModelCheckpoint(filepath='resnet_backup.h5',
                             monitor='val_acc', verbose=1, save_best_only=False, save_weights_only=False, mode='auto',
                             period=2)

# Stop training once the monitored value has not improved for 20 consecutive epochs.
# NOTE(review): the fit call below trains for 20 epochs, so this patience looks too large to ever trigger -- confirm.
early = EarlyStopping(monitor='val_acc', min_delta=0, patience=20, verbose=1, mode='auto')
# Apply the per-epoch learning rate returned by scheduler().
callback = LearningRateScheduler(scheduler)



In [None]:
# Train for 20 epochs of 20 generator batches each, evaluating on 40 generator batches per epoch.
# NOTE(review): training and validation generators both read FOLDER_TO_TRAIN, so the
# validation metrics are computed on the same pool of images the model is trained on;
# a held-out folder (or the 'validate' split) would be needed for a real validation signal.
hist = model.fit_generator(steps_per_epoch=20,generator=mimic_data_generator_from_folder(dfNegBio, batch_size, FOLDER_TO_TRAIN)
                           , validation_data=mimic_data_generator_from_folder(dfNegBio, batch_size, FOLDER_TO_TRAIN)
                           , validation_steps=40,epochs=20,callbacks=[callback, checkpoint, early])


Epoch 1/20




Epoch 2/20





Epoch 00002: saving model to resnet_backup.h5
Epoch 3/20
Epoch 4/20

Epoch 00004: saving model to resnet_backup.h5
Epoch 5/20
Epoch 6/20

Epoch 00006: saving model to resnet_backup.h5
Epoch 7/20
Epoch 8/20

Epoch 00008: saving model to resnet_backup.h5
Epoch 9/20
Epoch 10/20

Epoch 00010: saving model to resnet_backup.h5
Epoch 11/20
Epoch 12/20

Epoch 00012: saving model to resnet_backup.h5
Epoch 13/20
Epoch 14/20

Epoch 00014: saving model to resnet_backup.h5
Epoch 15/20
Epoch 16/20