In [11]:
#%%timeit -n1 -r1

In [1]:
import os
import albumentations
import cv2

import gc as Garbase_Collector
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras import layers
from tensorflow.keras import models
from keras_preprocessing.image import ImageDataGenerator

In [4]:
# Load the image index CSV. Downstream cells read its "img_path" column and treat
# every column from position 4 onward as a label column.
dataset_info = pd.read_csv('./dataset_information/image_path_with_labels.csv')

# A bare `dataset_info.drop(columns = ["Finding Labels"], axis = 1)` used to sit here.
# DataFrame.drop returns a new frame and the result was discarded, so the call never
# changed `dataset_info`; it has been removed as dead code.
# NOTE(review): do not simply assign the dropped frame back. Label columns are selected
# by position (`columns[4:]`), so removing a column may shift that slice. Confirm the
# column layout first.

# Hold out 20% of the rows for testing, then 10% of the remainder for validation.
df_train, df_test = train_test_split(dataset_info, test_size = 0.20, random_state = 42)
# df_train, df_test = train_test_split(dataset_info.sample(n = 50000), test_size = 0.20, random_state = 42)
df_train, df_val = train_test_split(df_train, test_size = 0.10, random_state = 42)

print(f"Training Samples: {len(df_train)}\nValidation Samples: {len(df_val)}\nTesting Samples: {len(df_test)}")

Training Samples: 80726
Validation Samples: 8970
Testing Samples: 22424


In [5]:
# Notebook-wide configuration.
IMAGE_SIZE = (224, 224)  # (height, width) passed as `target_size` to the image iterators
BATCH_SIZE = 32          # images per batch produced by the iterators
EPOCHS = 3               # number of epochs requested from the training call

# Names reported by the AUROC callback, one per model output.
# NOTE(review): presumably in the same order as the label columns of the CSV; confirm.
CLASS_LABELS = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Effusion",
    "Emphysema",
    "Fibrosis",
    "Hernia",
    "Infiltration",
    "Mass",
    "No Finding",
    "Nodule",
    "Pleural_Thickening",
    "Pneumonia",
    "Pneumothorax",
]

In [6]:
def flow_from_dataframe(image_generator, dataframe, shuffle = False):
    """
    Build a batch iterator over the images listed in ``dataframe``.

    Args:
        image_generator: object exposing a ``flow_from_dataframe`` method
            (the notebook passes an ``ImageDataGenerator``).
        dataframe: table with an ``img_path`` column. Every column from
            position 4 onward is used as a target column.
        shuffle: forwarded to the iterator. Defaults to ``False``, the value
            that used to be hard-coded, so existing calls behave as before.
            Pass ``True`` for a training split that should be reshuffled.

    Returns:
        Whatever ``image_generator.flow_from_dataframe`` returns, configured
        for RGB images of ``IMAGE_SIZE`` in batches of ``BATCH_SIZE`` with
        ``class_mode = 'raw'``.
    """
    # Targets are selected by position, so the column order of `dataframe` matters.
    # A plain list is handed over rather than the sliced column index object.
    label_columns = list(dataframe.columns[4:])

    return image_generator.flow_from_dataframe(
        dataframe,
        x_col = "img_path",
        y_col = label_columns,
        target_size = IMAGE_SIZE,
        color_mode = 'rgb',
        class_mode = 'raw',
        shuffle = shuffle,
        batch_size = BATCH_SIZE
    )

In [10]:
# data_generator = ImageDataGenerator(
#     featurewise_center = False,  # set input mean to 0 over the dataset
#     samplewise_center = False,  # set each sample mean to 0
#     featurewise_std_normalization = False,  # divide inputs by std of the dataset
#     samplewise_std_normalization = False,  # divide each input by its std
#     zca_whitening = False,  # apply ZCA whitening
#     rotation_range = 30,  # randomly rotate images in the range (degrees, 0 to 180)
#     zoom_range = 0.2, # Randomly zoom image 
#     width_shift_range = 0.1,  # randomly shift images horizontally (fraction of total width)
#     height_shift_range = 0.1,  # randomly shift images vertically (fraction of total height)
#     horizontal_flip = True,  # randomly flip images
#     vertical_flip = False  # randomly flip images
# )

# Single generator shared by all splits: it only multiplies pixel values by 1/255
# (no augmentation is configured).
data_generator = ImageDataGenerator(rescale = 1. / 255)

# One iterator per split, created in train -> validation -> test order.
train_gen, valid_gen, test_gen = [
    flow_from_dataframe(image_generator = data_generator, dataframe = split_df)
    for split_df in (df_train, df_val, df_test)
]

Found 80726 validated image filenames.
Found 8970 validated image filenames.
Found 22424 validated image filenames.


In [11]:
# Pull one batch from the training iterator to sanity-check shapes and labels.
# Note that this advances `train_gen` by one batch.
train_x, train_y = next(train_gen)

print(
    f"Dimension of Image # 1: {train_x[0].shape}",
    f"Labels of Image # 1: {train_y[0]}",
    sep = "\n",
)

Dimension of Image # 1: (224, 224, 3)
Labels of Image # 1: [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


In [12]:
# Import everything from tensorflow.keras so the model comes from the same package
# as the optimizer and metrics used when compiling it. The previous imports pulled
# from the standalone `keras` package, including the private `keras.layers.core`.
from tensorflow.keras.applications.resnet_v2 import ResNet50V2
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# Derive the input shape from the iterator configuration (RGB -> 3 channels)
# instead of repeating the literal (224, 224, 3).
input_shape = IMAGE_SIZE + (3,)
img_input = Input(shape = input_shape)

# ImageNet-pretrained backbone without its classification head; pooling = "avg"
# makes the backbone output a single feature vector per image.
# NOTE(review): the image iterators only rescale pixels by 1/255. Confirm this matches
# the preprocessing the pretrained ResNet50V2 weights expect.
base_model = ResNet50V2(
    include_top = False,
    input_tensor = img_input,
    input_shape = input_shape,
    pooling = "avg",
    weights = 'imagenet'
)

# Multi-label head: one independent sigmoid unit per entry of CLASS_LABELS.
x = base_model.output
predictions = Dense(len(CLASS_LABELS), activation = "sigmoid", name = "predictions")(x)
model = Model(inputs = img_input, outputs = predictions)

In [69]:
# model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_7[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
pool1_pad (ZeroPadding2D)       (None, 114, 114, 64) 0           conv1_conv[0][0]                 
____________________________________________________________________________________________

In [252]:
import keras.backend as kb
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
import shutil
import warnings
import json

class MultipleClassAUROC(Callback):
    """
    Keras callback that tracks per-class AUROC after every epoch.

    At the end of each epoch it predicts on ``generator`` and computes one
    ROC-AUC score per entry of ``class_names``. When the mean score beats the
    best one seen so far it saves the current model weights, appends a line
    to the best-AUROC log and rewrites the stats JSON used to resume training.

    Args:
        generator: batch iterator to evaluate on. It must expose ``n``,
            ``batch_size`` and ``labels``; ``labels[:, i]`` is scored against
            output column ``i``. Assumes batches arrive in the same order as
            ``labels`` (i.e. the iterator is not shuffled).
        class_names: one display name per model output column.
        weights_path: path of the training weights file. Its directory is
            where the output files are written, and its file name, prefixed
            with ``best_``, names the best-weights file.
        stats: optional dict from a previous run (for resuming). A missing
            ``best_mean_auroc`` entry is initialised to 0.
    """
    def __init__(self, generator, class_names, weights_path, stats=None):
        # super(Callback, self) would start the lookup *after* Callback and so
        # skip Callback.__init__; the zero-argument form runs it.
        super().__init__()
        self.generator = generator
        self.class_names = class_names
        self.weights_path = weights_path

        output_dir, weights_file = os.path.split(weights_path)
        self.best_weights_path = os.path.join(output_dir, f"best_{weights_file}")
        self.best_auroc_log_path = os.path.join(output_dir, "best_auroc.log")
        self.stats_output_path = os.path.join(output_dir, ".training_stats.json")

        # for resuming previous training; an empty/None stats starts from scratch
        self.stats = stats if stats else {}
        self.stats.setdefault("best_mean_auroc", 0)

        # per-class history of AUROC values, one entry appended per epoch
        self.aurocs = {name: [] for name in self.class_names}

    def on_epoch_end(self, epoch, logs=None):
        """
        Compute the mean of the per-class ROC-AUC scores and keep the best
        set of weights according to that metric.
        """
        print("\n*********************************")
        self.stats["lr"] = float(kb.eval(self.model.optimizer.learning_rate))
        print(f"Learning Rate actual: {self.stats['lr']}")

        # y_hat shape: (#samples, len(class_names))
        # y shape:     (#samples, len(class_names))
        # The step count must be a whole number that covers the last partial batch.
        steps = int(np.ceil(self.generator.n / self.generator.batch_size))
        y_hat = self.model.predict(self.generator, steps=steps)
        y = self.generator.labels

        print(f"*** epoch#{epoch + 1} Curvas ROC Fase Entrenamiento ***")
        current_auroc = []
        for i, class_name in enumerate(self.class_names):
            try:
                score = roc_auc_score(y[:, i], y_hat[:, i])
            except ValueError as exc:
                # roc_auc_score rejected the inputs for this class. Keep the
                # historical fallback of 0 (it still counts toward the mean)
                # but make the event visible instead of hiding it.
                warnings.warn(f"AUROC could not be computed for '{class_name}' ({exc}); using 0")
                score = 0
            self.aurocs[class_name].append(score)
            current_auroc.append(score)
            print(f"{i+1}. {class_name}: {score}")
        print("*********************************")

        mean_auroc = float(np.mean(current_auroc))
        print(f"Promedio Curvas ROC: {mean_auroc}")
        if mean_auroc > self.stats["best_mean_auroc"]:
            print(f"Actualización del resultado de las Curvas de ROC de: {self.stats['best_mean_auroc']} a {mean_auroc}")

            # Record the new best first so the stats file written below holds
            # the current value rather than the previous one.
            self.stats["best_mean_auroc"] = mean_auroc

            # 1. save the weights that produced this score. Copying
            # `weights_path` is unreliable: that file is written by a separate
            # checkpoint callback, so it may belong to an earlier epoch or not
            # exist yet.
            self.model.save_weights(self.best_weights_path)

            # 2. update log file
            print(f"Actualización del archivo de logs: {self.best_auroc_log_path}")
            with open(self.best_auroc_log_path, "a") as f:
                f.write(f"(epoch#{epoch + 1}) auroc: {mean_auroc}, lr: {self.stats['lr']}\n")

            # 3. write stats output, this is used for resuming the training
            with open(self.stats_output_path, 'w') as f:
                json.dump(self.stats, f)

            print(f"Actualización del grupo de pesos: {self.best_weights_path}")
            print("*********************************")
        return

In [None]:
from keras import backend as K

def binary_accuracy(y_true, y_pred):
    """Return the mean of the element-wise matches between ``y_true`` and rounded ``y_pred``."""
    matches = K.equal(y_true, K.round(y_pred))
    return K.mean(matches)

def precision_threshold(threshold = 0.5):
    """
    Create a precision metric that binarises predictions at ``threshold``.

    Returns:
        A function ``precision(y_true, y_pred)`` computing
        true positives / (predicted positives + epsilon) over everything it is
        given at once (all elements are summed together, not per class).
    """
    def precision(y_true, y_pred):
        # 1.0 where the clipped prediction is strictly above the threshold, else 0.0
        hard_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), threshold), K.floatx())
        hits = K.round(K.sum(K.clip(y_true * hard_pred, 0, 1)))
        flagged = K.sum(hard_pred)
        # epsilon keeps the ratio finite when nothing is predicted positive
        return hits / (flagged + K.epsilon())
    return precision

In [None]:
def recall_threshold(threshold = 0.5):
    """
    Create a recall metric that binarises predictions at ``threshold``.

    Returns:
        A function ``recall(y_true, y_pred)`` computing
        true positives / (actual positives + epsilon) over everything it is
        given at once (all elements are summed together, not per class).
    """
    def recall(y_true, y_pred):
        # 1.0 where the clipped prediction is strictly above the threshold, else 0.0
        hard_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), threshold), K.floatx())
        hits = K.round(K.sum(K.clip(y_true * hard_pred, 0, 1)))
        actual = K.sum(K.clip(y_true, 0, 1))
        # epsilon keeps the ratio finite when there are no positives
        return hits / (actual + K.epsilon())
    return recall

In [None]:
def fbeta_score_threshold(beta = 1, threshold = 0.5):
    """
    Create an F-beta metric built from the thresholded precision and recall.

    Args:
        beta: weight in the F-beta formula; it enters squared.
        threshold: cut-off forwarded to the precision and recall factories.

    Returns:
        A function ``fbeta_score(y_true, y_pred)`` computing
        (1 + beta^2) * p * r / (beta^2 * p + r + epsilon).
    """
    def fbeta_score(y_true, y_pred):
        p = precision_threshold(threshold)(y_true, y_pred)
        r = recall_threshold(threshold)(y_true, y_pred)
        beta_sq = beta ** 2
        # epsilon keeps the ratio finite when precision and recall are both zero
        return (1 + beta_sq) * (p * r) / (beta_sq * p + r + K.epsilon())
    return fbeta_score

In [253]:
from keras.callbacks import ModelCheckpoint

# File the checkpoint callback writes the weights to.
output_weights_name = 'weights.h5'

# Second handle on the same model object; the compile cell uses this name.
model_train = model

# Weights-only checkpoint that keeps only the best result; the monitored
# quantity is left at the Keras default.
checkpoint = ModelCheckpoint(
    filepath = output_weights_name,
    verbose = 1,
    save_best_only = True,
    save_weights_only = True,
)

In [254]:
# No stats from a previous run: the callback starts from its own defaults.
training_stats = {}

# Scores the validation iterator after every epoch, one AUROC per class label.
auroc = MultipleClassAUROC(
    valid_gen,
    CLASS_LABELS,
    output_weights_name,
    stats = training_stats,
)

In [255]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Adam with an explicit starting learning rate.
initial_learning_rate = 1e-3
optimizer = Adam(learning_rate = initial_learning_rate)

# Binary cross-entropy over the sigmoid outputs (one independent decision per
# class), tracked with accuracy, false negatives and the thresholded
# precision / recall / F-beta metrics defined in this notebook.
model_train.compile(
    loss = "binary_crossentropy",
    optimizer = optimizer,
    metrics = [
        'acc',
        tf.keras.metrics.FalseNegatives(),
        precision_threshold(threshold = 0.5),
        recall_threshold(threshold = 0.5),
        fbeta_score_threshold(beta = 0.5, threshold = 0.5),
    ],
)

In [256]:
from keras.callbacks import TensorBoard, ReduceLROnPlateau

# Learning-rate schedule settings: after `patience_reduce_lr` epochs without a
# lower val_loss the rate is multiplied by 0.1, never going below `min_lr`.
min_lr = 1e-8
patience_reduce_lr = 2

# Order matters: the checkpoint runs first, the AUROC callback last.
callbacks = [
    checkpoint,
    ReduceLROnPlateau(
        monitor = 'val_loss',
        mode = "min",
        factor = 0.1,
        patience = patience_reduce_lr,
        min_lr = min_lr,
        verbose = 1,
    ),
    auroc,
]

In [None]:
# history = model.fit(
#     x = train_x,
#     y = train_y,
# #     batch_size = BATCH_SIZE,
#     epochs = EPOCHS,
#     callbacks = callbacks,
#     validation_data = valid_gen,
#     shuffle = True,
#     workers = 6,
#     use_multiprocessing = True,
# )

In [257]:
# Train through `model_train`, the handle that was compiled, using Model.fit:
# it accepts batch iterators directly and fit_generator is deprecated.
# Step counts come from len(iterator), a whole number that includes the last
# partial batch. The previous `n / batch_size` was a float that got truncated,
# so the final partial batch of each split was skipped.
# NOTE(review): shuffle stays False as before, so batches are visited in the
# same order every epoch. Confirm that this is intended.
fit_history = model_train.fit(
    train_gen,
    steps_per_epoch = len(train_gen),
    epochs = EPOCHS,
    validation_data = valid_gen,
    validation_steps = len(valid_gen),
    callbacks = callbacks,
    shuffle = False
)



Epoch 1/10
 382/2522 [===>..........................] - ETA: 27:01 - loss: 0.2112 - acc: 0.5298 - false_negatives_3: 10720.0000

KeyboardInterrupt: 

In [22]:
# np.warnings.filterwarnings('ignore', category = np.VisibleDeprecationWarning)

# def get_data(df, img_size):
#     data = []
    
#     paths = df.Image_Path.values
#     labels = df.Binary_Image_Labels.values
    
#     for path, label in tqdm(zip(paths, labels), desc = "Progress"):
#         img = cv2.imread(path)
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#         img = cv2.resize(img, img_size)
#         data.append([img, label])
        
#     return np.array(data, dtype = object)

In [17]:
# train_data = get_data(df_train, IMAGE_SIZE)
# del df_train
# val_data = get_data(df_val, IMAGE_SIZE)
# del df_val
# test_data = get_data(df_test, IMAGE_SIZE)
# del df_test

Progress: 36000it [09:52, 60.73it/s]
Progress: 4000it [01:11, 55.90it/s]
Progress: 10000it [02:43, 61.26it/s]


In [18]:
# x_train, y_train, x_val, y_val = [], [], [], []

# for feature, label in train_data:
#     x_train.append(feature)
#     y_train.append(label)

# del train_data

# for feature, label in val_data:
#     x_val.append(feature)
#     y_val.append(label)

# del val_data

In [19]:
# Garbase_Collector.collect()

225

In [20]:
# Normalize the data
# img_size = IMAGE_SIZE[0]
# x_train = np.array(x_train) / 255
# x_train.reshape(-1, img_size, img_size, 1)

# x_val = np.array(x_val) / 255
# x_val.reshape(-1, img_size, img_size, 1)

# y_train = np.array(y_train)
# y_val = np.array(y_val)

In [21]:
# from keras.preprocessing.image import ImageDataGenerator

# data_generator = ImageDataGenerator(
#     featurewise_center = False,  # set input mean to 0 over the dataset
#     samplewise_center = False,  # set each sample mean to 0
#     featurewise_std_normalization = False,  # divide inputs by std of the dataset
#     samplewise_std_normalization = False,  # divide each input by its std
#     zca_whitening = False,  # apply ZCA whitening
#     rotation_range = 30,  # randomly rotate images in the range (degrees, 0 to 180)
#     zoom_range = 0.2, # Randomly zoom image 
#     width_shift_range = 0.1,  # randomly shift images horizontally (fraction of total width)
#     height_shift_range = 0.1,  # randomly shift images vertically (fraction of total height)
#     horizontal_flip = True,  # randomly flip images
#     vertical_flip = False  # randomly flip images
# )

# data_generator.fit(x_train)

MemoryError: Unable to allocate 6.59 GiB for an array with shape (36000, 128, 128, 3) and data type float32

In [None]:
# from keras.applications.mobilenet import MobileNet
# from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
# from keras.models import Sequential
# from tensorflow.keras.applications import MobileNetV2

In [None]:
# mobile_net_v2 = MobileNetV2(input_shape =  (128, 128, 3), include_top = False, weights = "imagenet")

# model = Sequential()
# model.add(mobile_net_v2)
# model.add(GlobalAveragePooling2D())
# model.add(Dropout(0.5))
# model.add(Dense(512))
# model.add(Dropout(0.5))
# model.add(Dense(15, activation = 'sigmoid'))

# model.compile(
#     optimizer = 'adam', 
#     loss = 'binary_crossentropy',
#     metrics = [
#         'accuracy', 
#         'binary_accuracy', 
#         'mae', 
# #         precision_threshold(threshold = 0.5), 
# #         recall_threshold(threshold = 0.5), 
# #         fbeta_score_threshold(beta=0.5, threshold = 0.5)
#     ]
# )

# model.summary()

In [None]:
# history = model.fit(x_train,y_train, epochs = 10 , validation_data = (x_val, y_val))

In [None]:
# opt = Adam(lr=0.000001)
# model.compile(optimizer = opt , loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) , metrics = ['accuracy'])

In [None]:
# history = model.fit_generator(x_train,steps_per_epoch = 100, validation_data = (x_val,y_val), epochs = 10)