In [1]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import os
from keras.preprocessing import image                  
from tqdm import tqdm

In [2]:
# Load the per-image metadata table.
df = pd.read_csv('./dataset_information/Data_Entry_2017.csv')

# Pathology names searched for in each row's 'Finding Labels' text.
# The order of this list fixes the column order of every label matrix built later.
diseases = [
    'Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Nodule',
    'Pneumothorax', 'Atelectasis', 'Pleural_Thickening', 'Mass', 'Edema',
    'Consolidation', 'Infiltration', 'Fibrosis', 'Pneumonia',
]

# Add one 0/1 indicator column per pathology: 1 when the pathology name occurs
# as a substring of the row's 'Finding Labels' string, 0 otherwise.
finding_text = df['Finding Labels']
for disease in diseases:
    df[disease] = finding_text.map(lambda text, name=disease: int(name in text))

FileNotFoundError: [Errno 2] No such file or directory: '../input/data/Data_Entry_2017.csv'

In [None]:
# Index every PNG under ./images by its bare file name so that the metadata
# rows can be joined to a path on disk.
png_pattern = os.path.join('.', 'images', '*.png')
all_image_paths = {os.path.basename(png): png for png in glob(png_pattern)}
print('Images found:', len(all_image_paths))

# Rows whose 'Image Index' has no matching file receive None in 'Path'.
df['Path'] = df['Image Index'].map(all_image_paths.get)

In [None]:
# Multi-label matrix: one row per image, one 0/1 column per entry of `diseases`.
labels = df.loc[:, diseases].to_numpy()
# Image paths in the same row order as `labels`.
files_list = df['Path'].tolist()

In [None]:
# Binary label per image: 1 when at least one disease indicator is set, else 0.
has_any_finding = df[diseases].sum(axis=1).gt(0)
labelB = np.array(has_any_finding.tolist(), dtype=int)
print(labelB)

In [None]:
# Hold out 33% of the rows for testing; the seed is fixed so the split is repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.33,
    random_state=2018,
)
print(f'Training : {len(train_df.index)}  Testing: {len(test_df.index)}')

In [None]:
# Distinct patient identifiers present in each split.
patient_train = set(train_df["Patient ID"].to_numpy())
patient_test = set(test_df["Patient ID"].to_numpy())

# Patients that appear in BOTH splits, i.e. a source of train/test leakage.
leakage_patient = patient_train & patient_test
print(len(leakage_patient))

In [None]:
# Drop every row whose patient is shared between the two splits, then report,
# for each split: the surviving fraction, the original row count and the
# filtered row count (one value per line).
train_without_leakage = train_df.loc[~train_df["Patient ID"].isin(leakage_patient)]
test_without_leakage = test_df.loc[~test_df["Patient ID"].isin(leakage_patient)]

print("Train")
print(
    len(train_without_leakage) / len(train_df),
    len(train_df),
    len(train_without_leakage),
    sep="\n",
)

print()
print("Test")
print(
    len(test_without_leakage) / len(test_df),
    len(test_df),
    len(test_without_leakage),
    sep="\n",
)

In [None]:
# From here on, evaluate only on the test rows whose patient does not also
# appear in the training split. train_df is left unchanged by this cell.
test_df = test_without_leakage

In [None]:
# Carve a validation set (20% of the rows) out of the current training frame,
# using a fixed seed.
# NOTE(review): train_df is rebound to the smaller remainder, so re-running this
# cell shrinks it again. The split is by row, not by "Patient ID" — confirm
# whether patient overlap between train and validation is acceptable.
train_df, valid_df = train_test_split(train_df, test_size = 0.2, random_state = 42)

In [None]:
def path_to_tensor(img_path, shape):
    """Load one image file as a single-item batch scaled by 1/255.

    Args:
        img_path: path of the image file to load.
        shape: target size forwarded to ``image.load_img`` (the image is
            resized to it on load).

    Returns:
        The image array divided by 255, with a new leading axis of length 1,
        so that several results can be stacked along axis 0.
    """
    pil_img = image.load_img(img_path, target_size=shape)
    scaled = image.img_to_array(pil_img) / 255
    # Prepend the batch axis: (h, w, c) -> (1, h, w, c).
    return scaled[np.newaxis, ...]

def paths_to_tensor(img_paths, shape):
    """Load several images and stack them into one batch array.

    Args:
        img_paths: iterable of image file paths.
        shape: target size passed through to ``path_to_tensor`` for every image.

    Returns:
        A single array made by joining the per-image results along axis 0,
        in the same order as ``img_paths``.
    """
    batch = []
    for img_path in tqdm(img_paths):  # tqdm only adds a progress bar
        batch.append(path_to_tensor(img_path, shape))
    return np.concatenate(batch, axis=0)

In [None]:
# Every image is resized to this (height, width) while loading.
img_shape = (64, 64)

# Build the image batches for the three splits, in train / valid / test order.
train_tensors, valid_tensors, test_tensors = (
    paths_to_tensor(split_df['Path'].to_list(), shape = img_shape)
    for split_df in (train_df, valid_df, test_df)
)

In [None]:
# Binary training target as a column vector of shape (n_rows, 1):
# 1 when the row has at least one disease indicator set, 0 otherwise.
train_has_finding = train_df[diseases].sum(axis=1).gt(0)
train_labels = train_has_finding.to_numpy().astype(int).reshape(-1, 1)

In [None]:
# Show the training label array as the cell output (sanity check).
train_labels

In [None]:
# Binary validation target as a column vector of shape (n_rows, 1):
# 1 when the row has at least one disease indicator set, 0 otherwise.
valid_has_finding = valid_df[diseases].sum(axis=1).gt(0)
valid_labels = valid_has_finding.to_numpy().astype(int).reshape(-1, 1)

In [None]:
# Binary test target as a column vector of shape (n_rows, 1):
# 1 when the row has at least one disease indicator set, 0 otherwise.
test_has_finding = test_df[diseases].sum(axis=1).gt(0)
test_labels = test_has_finding.to_numpy().astype(int).reshape(-1, 1)

In [None]:
from keras import backend as K

def binary_accuracy(y_true, y_pred):
    """Return the mean agreement between labels and rounded predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; each is rounded before comparison.

    Returns:
        The mean of the element-wise equality between ``y_true`` and the
        rounded ``y_pred``.
    """
    hard_pred = K.round(y_pred)
    matches = K.equal(y_true, hard_pred)
    return K.mean(matches)

def precision_threshold(threshold = 0.5):
    """Build a precision metric that binarises predictions at ``threshold``.

    Args:
        threshold: a prediction counts as positive when, after clipping to
            [0, 1], it is strictly greater than this value.

    Returns:
        A function ``precision(y_true, y_pred)`` computing
        true positives / (predicted positives + epsilon).
    """
    # The inner function keeps the name `precision` on purpose: it is passed
    # to model.compile as a metric elsewhere in this notebook.
    def precision(y_true, y_pred):
        is_positive = K.greater(K.clip(y_pred, 0, 1), threshold)
        hard_pred = K.cast(is_positive, K.floatx())
        hits = K.round(K.sum(K.clip(y_true * hard_pred, 0, 1)))
        flagged = K.sum(hard_pred)
        # epsilon keeps the ratio finite when nothing is predicted positive.
        return hits / (flagged + K.epsilon())
    return precision

In [None]:
def recall_threshold(threshold = 0.5):
    """Build a recall metric that binarises predictions at ``threshold``.

    Args:
        threshold: a prediction counts as positive when, after clipping to
            [0, 1], it is strictly greater than this value.

    Returns:
        A function ``recall(y_true, y_pred)`` computing
        true positives / (actual positives + epsilon).
    """
    # The inner function keeps the name `recall` on purpose: it is passed
    # to model.compile as a metric elsewhere in this notebook.
    def recall(y_true, y_pred):
        is_positive = K.greater(K.clip(y_pred, 0, 1), threshold)
        hard_pred = K.cast(is_positive, K.floatx())
        hits = K.round(K.sum(K.clip(y_true * hard_pred, 0, 1)))
        actual_positives = K.sum(K.clip(y_true, 0, 1))
        # epsilon keeps the ratio finite when there are no positive labels.
        return hits / (actual_positives + K.epsilon())
    return recall

In [None]:
def fbeta_score_threshold(beta = 1, threshold = 0.5):
    """Build an F-beta metric from the thresholded precision and recall.

    Args:
        beta: weighting between precision and recall; it enters the formula
            squared.
        threshold: cut-off forwarded to ``precision_threshold`` and
            ``recall_threshold``.

    Returns:
        A function ``fbeta_score(y_true, y_pred)`` computing
        (1 + beta^2) * p * r / (beta^2 * p + r + epsilon).
    """
    # The inner function keeps the name `fbeta_score` on purpose: it is passed
    # to model.compile as a metric elsewhere in this notebook.
    def fbeta_score(y_true, y_pred):
        p = precision_threshold(threshold)(y_true, y_pred)
        r = recall_threshold(threshold)(y_true, y_pred)
        beta_sq = beta ** 2
        # epsilon keeps the ratio finite when precision and recall are both 0.
        denominator = beta_sq * p + r + K.epsilon()
        return (1 + beta_sq) * (p * r) / denominator
    return fbeta_score

In [None]:
import time

from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, Flatten, Dense
from keras.models import Sequential, Model
from keras.layers import BatchNormalization
from keras import regularizers, applications, optimizers, initializers
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16

In [None]:
# Convolutional feature extractor: VGG16 with ImageNet weights and without its
# original classifier head, sized to the per-image shape of the training tensors.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=train_tensors.shape[1:])

# Classification head stacked on the VGG16 feature maps:
# flatten -> three ReLU dense layers, each with 20% dropout -> one output unit.
add_model = Sequential()
add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
add_model.add(Dropout(0.2))
add_model.add(Dense(256, activation='relu'))
add_model.add(Dropout(0.2))

add_model.add(Dense(256, activation='relu'))
add_model.add(Dropout(0.2))

add_model.add(Dense(50, activation='relu'))
add_model.add(Dropout(0.2))
# Single-unit binary output. This must be a sigmoid: softmax normalises over
# the units of the layer, so with one unit it always outputs exactly 1.0 and
# the network could never predict the negative class or learn anything.
add_model.add(Dense(1, activation='sigmoid'))

# End-to-end model: image in, probability of "any finding" out.
model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

# model.summary()
# add_model.summary()

In [None]:
import keras.backend as K
from tensorflow.keras import optimizers

# SGD with Nesterov momentum, a small learning rate and a learning-rate decay.
sgd = optimizers.SGD(
    learning_rate=1e-4,
    decay=1e-6,
    momentum=0.9,
    nesterov=True,
)

# Metrics reported during training; the custom ones binarise predictions at 0.5.
tracked_metrics = [
    'accuracy',
    precision_threshold(threshold = 0.5),
    recall_threshold(threshold = 0.5),
    fbeta_score_threshold(beta=0.5, threshold = 0.5),
]

model.compile(
    optimizer = sgd,
    loss = 'binary_crossentropy',
    metrics = tracked_metrics,
)

In [None]:
%%timeit -n1 -r1

from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
import numpy as np

Epochs = 10
batch_size = 32

earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')
log = CSVLogger('saved_models/log_pretrained_CNN.csv')
checkpointer = ModelCheckpoint(filepath='saved_models/pretrainedVGG.best.from_scratch.hdf5', verbose=1, save_best_only=True)

train_datagen = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
    width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=True,  # randomly flip images
    vertical_flip=False 
)

# Training with data augmentation. If shift_fraction=0., also no augmentation.
history = model.fit_generator(
    train_datagen.flow(train_tensors,train_labels, batch_size = batch_size),
    steps_per_epoch = len(train_tensors) // batch_size,
    validation_data = (valid_tensors, valid_labels),
    validation_steps = len(valid_tensors) // batch_size,
    epochs = Epochs
)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Predicted scores for the held-out test images.
prediction = model.predict(test_tensors)

threshold = 0.5
beta = 0.5

# Wrap labels and predictions as backend variables once and reuse them for all
# three metrics.
y_true_var = K.variable(value=test_labels)
y_pred_var = K.variable(value=prediction)

pre = K.eval(precision_threshold(threshold = threshold)(y_true_var, y_pred_var))
rec = K.eval(recall_threshold(threshold = threshold)(y_true_var, y_pred_var))
fsc = K.eval(fbeta_score_threshold(beta = beta, threshold = threshold)(y_true_var, y_pred_var))

# pre / rec / fsc are ratios in [0, 1]; scale by 100 so that the number shown
# matches the percent sign in the message.
print ("Precision: %f %%\nRecall: %f %%\nFscore: %f %%"% (100 * pre, 100 * rec, 100 * fsc))

In [None]:
# Evaluate binary_accuracy on the test labels and predictions (both wrapped as
# backend variables) and show the value as the cell output.
K.eval(binary_accuracy(K.variable(value = test_labels), K.variable(value = prediction)))

In [None]:
# ROC AUC of the raw predicted scores against the binary test labels, shown as
# the cell output.
roc_auc_score(test_labels, prediction)

In [None]:
# model.save('vgg16-model.h5')
# model.save_weights('vgg16-w-model.h5')