# Classify Covid19

In [None]:
# common imports
import os
import numpy as np
import datetime
import time
import matplotlib.pylab as plt
import pandas as pd
from glob import glob
from pathlib import Path
from functools import partial
import plotly.express as px

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# prevent VRAM occupied
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the major version numerically: comparing the version strings
# directly is lexicographic and would wrongly reject e.g. "10.0" ("1" < "2").
assert int(tf.__version__.split(".")[0]) >= 2

# load tensorboard extension
%reload_ext tensorboard

In [None]:
# Constants
SEED = 42  # seed handed to the data generators / dataset loaders
# minimum number of cases for each diagnosis/finding label (of 14)
MIN_CASES = 1000
MIN_CASES_FLAG = True  # when True, the labels exceeding MIN_CASES are reported
TARGET_WIDTH = 224
TARGET_HEIGHT = 224
# Original (misspelled) name, kept as an alias so existing references keep working.
TARGET_HEGIHT = TARGET_HEIGHT
# Derived from the target dimensions so the three constants cannot drift apart.
IMAGE_SIZE = (TARGET_HEIGHT, TARGET_WIDTH)
IMAGE_SHAPE = IMAGE_SIZE + (3,)  # image size plus 3 colour channels
BATCH_SIZE = 32
SHUFFLE = True
NUM_CLASSES = 15  # number of classes
NUM_EPOCHS = 20
# names of the pretrained base models
PRETRAINED_MODELS = ['ResNet50V2', 'MobileNetV2', 'VGG16', 'InceptionV3', 'DenseNet121']

log_folder = 'logs'  # logs folder


In [None]:
#callback setup
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint_path = 'xray_class_weights.best.hdf5'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only=True)

# Stop training once val_loss has not improved for 5 epochs.
early = EarlyStopping(monitor="val_loss",
                      mode="min",
                      patience=5)

# Multiply the learning rate by 0.2 when val_loss plateaus.
# - patience is shorter than EarlyStopping's so a reduced rate gets a chance
#   to help before training is stopped (with equal patience it never does).
# - min_lr sits below the learning rates used in this notebook (1e-5 .. 1e-2);
#   the previous floor of 0.001 was above the default rates, which made the
#   scheduler a no-op for them.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=1e-6)

callbacks = [checkpoint, early, reduce_lr]

In [None]:
# Load the NIH metadata CSV into a DataFrame.
nih_xrays_df = pd.read_csv('../data/raw/Data_Entry_2017.csv')

In [None]:
# Preview the first rows with the original column names.
nih_xrays_df.head()

In [None]:
# Rename the raw CSV headers to snake_case column names.
nih_column_names = {
    'Image Index': 'image_name',
    'Finding Labels': 'finding_label',
    'Follow-up #': 'follow_up_num',
    'Patient ID': 'patient_id',
    'Patient Age': 'age',
    'Patient Gender': 'gender',
    'View Position': 'view_position',
    'OriginalImage[Width': 'image_width',
    'Height]': 'image_height',
    'OriginalImagePixelSpacing[x': 'x_spacing',
    'y]': 'y_spacing',
}

nih_xrays_df = nih_xrays_df.rename(columns=nih_column_names)
# errors='ignore' keeps this cell re-runnable: on a second run (or with a CSV
# that lacks the column) 'Unnamed: 11' is already gone and a plain drop would
# raise KeyError.
nih_xrays_df = nih_xrays_df.drop(columns=['Unnamed: 11'], errors='ignore')

In [None]:
# Preview the first rows after the column rename.
nih_xrays_df.head()

In [None]:
# map every image file name found on disk to its path
image_path = '../data/raw'
all_image_paths = {os.path.basename(x): x for x in glob(os.path.join(image_path, 'images*', '*', '*.png'))}

print('count of raw images paths and rows in NIH dataset :', len(all_image_paths), ', Total Headers', nih_xrays_df.shape[0])

# add image path column to dataframe
nih_xrays_df['path'] = nih_xrays_df['image_name'].map(all_image_paths.get)

# Rows whose image file was not found get a missing path; report them instead
# of letting them surface later as an obscure error in the image generators.
missing_path_count = int(nih_xrays_df['path'].isna().sum())
if missing_path_count:
    print('rows without a matching image file :', missing_path_count)

# fix data errors - remove patient record with age greater than 100
# .copy() makes the filtered frame independent, so the column assignments done
# in later cells do not write into a slice of the original frame.
nih_xrays_df = nih_xrays_df[nih_xrays_df['age'] <= 100].copy()

In [None]:
# Preview the first rows, now including the 'path' column.
nih_xrays_df.head()


In [None]:
# Strip plot of patient age, coloured by finding label; hovering shows the gender.
px.strip(nih_xrays_df, x='age', color='finding_label',  hover_name='gender', width= 1200)

In [None]:
#nih_xrays_df[nih_xrays_df['age']>= 100].head(30)

In [None]:
# Histogram of patient age, coloured by finding label.
px.histogram(nih_xrays_df, x='age', color='finding_label',  hover_name='gender', width=1000)

In [None]:
# Bar chart of the 15 most frequent raw 'finding_label' values.
# The column names produced by value_counts().reset_index() differ between
# pandas versions, so both columns are named explicitly here instead of
# relying on the defaults.
label_counts = (
    nih_xrays_df['finding_label']
    .value_counts()
    .head(15)
    .rename_axis('label')
    .reset_index(name='count')
)
fig = px.bar(label_counts, x='label', y='count')
fig.show()

In [None]:
# Rewrite 'No Finding' as the single token 'NoFinding' inside every label string.
nih_xrays_df['finding_label'] = nih_xrays_df['finding_label'].str.replace('No Finding', 'NoFinding', regex=False)

In [None]:
# Show 10 random rows to spot-check the label rewrite.
nih_xrays_df.sample(10)

In [None]:
# Collect the distinct diagnosis names: every 'finding_label' value is a
# '|'-separated string, so split each one, flatten the pieces into a single
# sequence and keep the unique values.
from itertools import chain
split_findings = nih_xrays_df['finding_label'].map(lambda x: x.split('|'))
all_labels = np.unique(list(chain.from_iterable(split_findings)))

In [None]:
# Display the unique labels found above.
all_labels

In [None]:
# keep only the non-empty label names
all_labels = [name for name in all_labels if len(name) != 0]
print(f'All Labels ({len(all_labels)}): {all_labels}')

In [None]:
# Add one 0.0/1.0 indicator column per label.
# The finding string is split on '|' and compared label-by-label: a plain
# substring test (`label in finding`) would also fire when one label name
# happens to be contained in another.
for label in all_labels:
    nih_xrays_df[label] = nih_xrays_df['finding_label'].map(
        lambda finding, label=label: 1.0 if label in finding.split('|') else 0.0)
    

In [None]:
# Show 10 random rows to spot-check the new per-label indicator columns.
nih_xrays_df.sample(10)

In [None]:
# label -> single-element list holding the number of rows flagged with that label
finding_label_counts = {label: [nih_xrays_df[label].sum()] for label in all_labels}

In [None]:
# One row per label: the dict keys become the 'index' column after
# reset_index(), the counts go into 'count'.
finding_label_counts_df = pd.DataFrame.from_dict(
    finding_label_counts,
    orient='index',
    columns=['count'],
).reset_index()
type(finding_label_counts_df)

In [None]:
# Bar chart of the number of cases per label.
fig = px.bar(finding_label_counts_df, x='index', y='count')
fig.show()

In [None]:
 # Apply the min_cases logic

# Count each label once and reuse the result for both the filter and the report.
label_case_counts = {label: int(nih_xrays_df[label].sum()) for label in all_labels}

if MIN_CASES_FLAG:
    # strict comparison: a label needs more than MIN_CASES rows to be kept
    all_labels_with_min_cases = [label for label in all_labels
                                 if label_case_counts[label] > MIN_CASES]
    print(f'finding labels with min cases: {len(all_labels_with_min_cases)}')
    print([(label, label_case_counts[label]) for label in all_labels_with_min_cases])
else:
    # Filtering disabled: keep every label so the name is always defined.
    all_labels_with_min_cases = list(all_labels)
    
        

In [None]:
# Count distinct patient ids (a patient can contribute several rows).
print('Number of unique patients:' , nih_xrays_df['patient_id'].nunique())

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split

# Split by patient id: rows are grouped on 'patient_id' so that the split is
# made on groups rather than on individual rows. n=None means "use all rows".
n = None
group_shuffle_split = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

subset_df = nih_xrays_df[:n]
# n_splits=1, so the splitter yields exactly one (train, valid) index pair.
train_idx, valid_idx = next(
    group_shuffle_split.split(subset_df, groups=subset_df['patient_id'].values))
train_df = nih_xrays_df.iloc[train_idx]
valid_df = nih_xrays_df.iloc[valid_idx]


In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the validation split.
valid_df.head()

In [None]:
# Report the number of rows in each split.
print(f'Number of images in training dataset {train_df.shape[0]}')
print(f'Number of images in validation dataset {valid_df.shape[0]}')

# Data Loading and Data Transformation

In [None]:
# Import from tensorflow.keras like the rest of this notebook, rather than
# from the standalone `keras` package, so a single Keras implementation is used.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Per-image normalisation settings.
normalization_options = dict(
    featurewise_center=False,            # no dataset-wide mean subtraction
    samplewise_center=True,              # set each sample's mean to 0
    samplewise_std_normalization=True,   # divide each sample by its own std
    featurewise_std_normalization=False, # no dataset-wide std division
    zca_whitening=False,                 # no ZCA whitening
)

# Random augmentation settings.
augmentation_options = dict(
    horizontal_flip=True,     # randomly flip inputs horizontally
    vertical_flip=False,      # never flip inputs vertically
    height_shift_range=0.05,  # float: fraction of total height if < 1, pixels if >= 1
    width_shift_range=0.1,    # float: fraction of total width if < 1, pixels if >= 1
    rotation_range=20,        # degree range for random rotations
    shear_range=0.1,          # shear intensity
    fill_mode='nearest',      # one of "constant", "nearest", "reflect", "wrap"
    zoom_range=0.15,          # a float f means the zoom range [1 - f, 1 + f]
)

image_data_gen = ImageDataGenerator(**normalization_options, **augmentation_options)

In [None]:
# Batch iterator over the training rows: images are read from the paths in
# the 'path' column (directory=None, so the column must hold usable paths)
# and targets come from the 'finding_label' column.
# NOTE(review): 'finding_label' holds '|'-joined strings for multi-finding
# rows; presumably those rows do not match any entry of `classes` - confirm
# how flow_from_dataframe treats them.
train_flow_options = dict(
    dataframe=train_df,
    directory=None,
    x_col='path',
    y_col='finding_label',
    class_mode="categorical",
    classes=all_labels,
    #color_mode='grayscale',
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    seed=SEED,
    target_size=IMAGE_SIZE,
)
train_generator = image_data_gen.flow_from_dataframe(**train_flow_options)

In [None]:
# Batch iterator over the validation rows.
# Validation images get the same per-sample normalisation as the training
# images but NO random augmentation (flips, shifts, rotation, shear, zoom) and
# no shuffling, so validation metrics are comparable from epoch to epoch.
valid_data_gen = ImageDataGenerator(
    samplewise_center=True,             # set each sample's mean to 0
    samplewise_std_normalization=True)  # divide each sample by its own std

valid_generator = valid_data_gen.flow_from_dataframe(
            dataframe=valid_df,
            directory=None,  # None: the x_col column must hold usable paths
            x_col='path',    # column holding the image paths
            y_col='finding_label',  # column holding the target label

            class_mode="categorical",
            classes=all_labels,
            #color_mode='grayscale',
            batch_size=BATCH_SIZE,
            shuffle=False,
            seed=SEED,
            target_size=IMAGE_SIZE)

In [None]:
# Fetch the batch at index 2 and show its first image together with its target vector.
x, y = train_generator[2]
first_image, first_target = x[0], y[0]
plt.imshow(first_image)
print(first_target)

In [None]:
# Draw up to 16 images of the next training batch in a 4x4 grid, each titled
# with the labels whose target score exceeds 0.5.
t_x, t_y = next(train_generator)
fig, m_axs = plt.subplots(4, 4, figsize=(16, 16))
for image, target, axis in zip(t_x, t_y, m_axs.flatten()):
    active_labels = [name for name, score in zip(all_labels, target) if score > 0.5]
    # only the first channel is drawn
    axis.imshow(image[:, :, 0], cmap='bone', vmin=-1.5, vmax=1.5)
    axis.set_title(', '.join(active_labels))
    axis.axis('off')


In [None]:
def get_base_model(model_name: str = 'ResNet50V2', freez_layers: bool = False):
    """Return an ImageNet-pretrained base model without its classification top.

    Every supported architecture is built with ``input_shape=IMAGE_SHAPE`` so
    its output has a fully defined shape.

    Args:
        model_name (str): one of 'ResNet50V2', 'MobileNetV2', 'VGG16',
            'InceptionV3', 'DenseNet121' (a class name in
            ``tf.keras.applications``).
        freez_layers (bool): when True, every layer of the base model is
            marked as non-trainable.

    Returns:
        The instantiated base model.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    supported_models = ('ResNet50V2', 'MobileNetV2', 'VGG16', 'InceptionV3', 'DenseNet121')
    # Fail early with a clear message instead of hitting an unbound
    # `base_model` further down.
    if model_name not in supported_models:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {list(supported_models)}")

    print(f"Downloading {model_name}")
    model_builder = getattr(tf.keras.applications, model_name)
    base_model = model_builder(input_shape=IMAGE_SHAPE,
                               include_top=False,
                               weights='imagenet')

    if freez_layers:
        for layer in base_model.layers:
            layer.trainable = False

    return base_model

In [None]:

def add_classifier_to_base_model(base_model, num_classes: int = NUM_CLASSES):
    """Put a softmax classification head on top of ``base_model``.

    The base model's output is flattened (layer name "flatten") and fed into
    a single Dense layer with ``num_classes`` softmax units.

    Args:
        base_model ([keras.Model]): model whose output feeds the new head
        num_classes ([int]): number of output units

    Returns:
        A keras.Model mapping ``base_model.input`` to the softmax output.
    """
    flattened = keras.layers.Flatten(name="flatten")(base_model.output)
    predictions = keras.layers.Dense(num_classes, activation='softmax')(flattened)
    return keras.Model(inputs=base_model.input, outputs=predictions)

In [None]:
def fine_tune_model(model, learning_rate =0.00001, optimizer = 'Adam',  fine_tune_at_layer:int=100):
    """Prepare ``model`` for fine-tuning and recompile it.

    Layers before index ``fine_tune_at_layer`` are frozen and layers from that
    index onwards are made trainable. Previously the earlier layers were left
    in whatever state they had, so nothing was frozen when the model arrived
    fully trainable.

    Args:
        model: Keras model to fine-tune (modified in place).
        learning_rate (float): learning rate used when recompiling.
        optimizer: optimizer selector forwarded to ``compile_classifier``.
        fine_tune_at_layer (int): index into ``model.layers`` of the first
            trainable layer (slice semantics, so 0 unfreezes everything).

    Returns:
        The same model, recompiled.
    """
    # Freeze all the layers before the `fine_tune_at_layer` layer ...
    for layer in model.layers[:fine_tune_at_layer]:
        layer.trainable = False
    # ... and train everything from that layer onwards.
    for layer in model.layers[fine_tune_at_layer:]:
        layer.trainable = True
    # trainable changes only take effect after the model is compiled again
    compile_classifier(model, learning_rate=learning_rate, optimizer=optimizer)
    return model

In [None]:
def compile_classifier(model, learning_rate, optimizer = 'Adam'):
    """Compile ``model`` with categorical cross-entropy and an accuracy metric.

    Args:
        model ([tensorflow.keras.Model]): classifier model; compiled in place.
        learning_rate ([float]): learning rate for the optimizer built here.
        optimizer: 'Adam', 'Adagrad' or 'NestrovSGD' to have the matching
            optimizer built with ``learning_rate``. Any other value is passed
            to ``model.compile`` unchanged, in which case ``learning_rate``
            is not applied.

    Returns:
        The same model, compiled.
    """
    if optimizer == 'Adam':
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            amsgrad=True,
            name="Adam"
            )

    elif optimizer == 'Adagrad':
        optimizer = tf.keras.optimizers.Adagrad(
            learning_rate=learning_rate,
            initial_accumulator_value=0.1,
            epsilon=1e-07,
            name="Adagrad")

    elif optimizer == 'NestrovSGD':
        # Use the requested learning rate here too (it used to be hard-coded
        # to 0.01, silently ignoring the argument).
        optimizer = tf.keras.optimizers.SGD(
            learning_rate=learning_rate,
            momentum=0.9,
            nesterov=True,
            name="SGD")

    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.CategoricalCrossentropy(),
                  metrics=['accuracy'])

    return model

In [None]:
def fit_model(model,train_ds,
    validation_ds, 
    num_epochs: int = NUM_EPOCHS, batch_size: int = BATCH_SIZE):
    """Train ``model`` and return the History object from ``model.fit``.

    Args:
        model: compiled Keras model.
        train_ds: training data passed to ``model.fit``.
        validation_ds: data passed as ``validation_data``.
        num_epochs (int): number of epochs.
        batch_size (int): accepted but not used by this function.

    The notebook-level ``callbacks`` list is attached to every fit.
    """
    fit_options = {
        'epochs': num_epochs,
        'validation_data': validation_ds,
        'callbacks': callbacks,
    }
    return model.fit(train_ds, **fit_options)

In [None]:
def plot_accuracy_and_loss(history):
    """Plot training/validation accuracy and loss curves in one figure.

    Args:
        history: object whose ``history`` dict has the keys 'accuracy',
            'val_accuracy', 'loss' and 'val_loss'.
    """
    curves = history.history
    train_acc, valid_acc = curves['accuracy'], curves['val_accuracy']
    train_loss, valid_loss = curves['loss'], curves['val_loss']

    plt.figure(figsize=(8, 8))

    # top panel: accuracy, y axis capped at 1
    plt.subplot(2, 1, 1)
    plt.plot(train_acc, label='Training Accuracy')
    plt.plot(valid_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.ylabel('Accuracy')
    accuracy_floor = min(plt.ylim())
    plt.ylim([accuracy_floor, 1])
    plt.title('Training and Validation Accuracy')

    # bottom panel: loss, y axis starting at 0
    plt.subplot(2, 1, 2)
    plt.plot(train_loss, label='Training Loss')
    plt.plot(valid_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Cross Entropy')
    loss_ceiling = max(plt.ylim())
    plt.ylim([0, loss_ceiling])
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')

    plt.show()

In [None]:
def train_validate_classifier(model_name, train_generator, valid_generator, learning_rate:float = 0.0001, fine_tune_at_layer:int = 100):
    """Train a classifier on a pretrained base, fine-tune it, and save it.

    Steps: build the frozen base model ``model_name``, add a NUM_CLASSES-way
    head, compile and fit; plot the curves; call ``fine_tune_model`` and fit
    again; plot again; save the result to 'my-model'.

    Args:
        model_name: name understood by ``get_base_model``.
        train_generator: training data for ``fit_model``.
        valid_generator: validation data for ``fit_model``.
        learning_rate (float): learning rate for both training stages.
        fine_tune_at_layer (int): layer index forwarded to ``fine_tune_model``.
    """
    # stage 1: frozen base + new classification head
    frozen_base = get_base_model(model_name=model_name, freez_layers=True)
    classifier = compile_classifier(
        add_classifier_to_base_model(frozen_base, num_classes=NUM_CLASSES),
        learning_rate)

    print("\n")
    print(f'{model_name} Training and Validation: ')
    head_history = fit_model(classifier, train_generator, valid_generator,
                             num_epochs=NUM_EPOCHS)

    print(f'{model_name} Accuracy and Loss plots')
    plot_accuracy_and_loss(head_history)

    # stage 2: fine-tuning
    print("\n")
    tuned_model = fine_tune_model(classifier, learning_rate, optimizer='Adam',
                                  fine_tune_at_layer=fine_tune_at_layer)

    print("\n")
    print(f'Fine-Tuned {model_name} Training and Validation: ')
    tuned_history = fit_model(tuned_model, train_generator, valid_generator,
                              num_epochs=NUM_EPOCHS)
    print(f'Fine-Tuned {model_name} Accuracy and Loss plots')
    plot_accuracy_and_loss(tuned_history)

    tuned_model.save('my-model')


# ResNet50V2

In [None]:
# Phase I run: first entry of PRETRAINED_MODELS, with fine-tuning applied
# from layer index 0.
pretrained_base_model = PRETRAINED_MODELS[0]
learning_rate = 0.01
fine_tune_at_layer = 0
train_validate_classifier(
    model_name=pretrained_base_model,
    train_generator=train_generator,
    valid_generator=valid_generator,
    learning_rate=learning_rate,
    fine_tune_at_layer=fine_tune_at_layer,
)

# Phase II -  Reload the Saved Model and Train, Validate and test on new Covid19 dataset

In [None]:
## ResNet50V2

In [None]:
# Reload the model that was saved under "my-model".
#pretrained_base_model = PRETRAINED_MODELS[0]
# Print the working directory, since "my-model" is a relative path.
print(os.getcwd())
# load_model rebuilds the saved model from "my-model".
reconstructed_model = keras.models.load_model("my-model")

In [None]:
# Replace the last layer of the reloaded model: take the output of its
# second-to-last layer and attach a fresh 3-unit softmax layer to it.
new_layer = reconstructed_model.layers[-2].output
output = keras.layers.Dense(3, activation='softmax')(new_layer)
new_model = keras.Model(
    inputs=reconstructed_model.input,
    outputs=output,
)

In [None]:
# Print the layer-by-layer summary of the re-headed model.
new_model.summary()

In [None]:
BATCH_SIZE =32


def _standardize_images(images, labels):
    """Scale every image to zero mean / unit variance; labels pass through.

    The reloaded model was trained on inputs normalised per sample
    (samplewise_center + samplewise_std_normalization), whereas
    image_dataset_from_directory yields raw pixel values, so the same kind of
    per-image normalisation is applied here.
    """
    return tf.image.per_image_standardization(images), labels


# 80/20 train/validation split of the 'train' folder; both subsets use the
# same seed.
train_ds = tf.keras.utils.image_dataset_from_directory(
  '../data/raw/covid19/Covid19-dataset/train',
  validation_split=0.2,
  subset="training",
  label_mode='categorical',
  seed=SEED,
  image_size=IMAGE_SIZE,
  batch_size=BATCH_SIZE).map(_standardize_images)

validation_ds = tf.keras.utils.image_dataset_from_directory(
  '../data/raw/covid19/Covid19-dataset/train/',
  validation_split=0.2,
  subset="validation",
  label_mode='categorical',
  seed=SEED,
  image_size=IMAGE_SIZE,
  batch_size=BATCH_SIZE).map(_standardize_images)

# held-out test folder
test_ds = tf.keras.utils.image_dataset_from_directory(
  '../data/raw/covid19/Covid19-dataset/test/',
  label_mode='categorical',
  seed=SEED,
  image_size=IMAGE_SIZE,
  batch_size=BATCH_SIZE).map(_standardize_images)

In [None]:
# Report how many batches each split yields.
for split_name, split_ds in (("training", train_ds),
                             ("validation", validation_ds),
                             ("test", test_ds)):
    print(f"number of {split_name} batches {tf.data.experimental.cardinality(split_ds)}")

In [None]:
def train_validate_test_classifier(model, train_generator, valid_generator, learning_rate:float = 0.0001, fine_tune_at_layer:int = 100, test_dataset=None):
    """Train, evaluate, fine-tune and re-evaluate ``model``.

    Steps: compile and fit ``model``; plot the curves; evaluate on the test
    data; call ``fine_tune_model`` and fit again; plot again; evaluate again.

    Args:
        model: Keras model to train (modified in place).
        train_generator: training data for ``fit_model``.
        valid_generator: validation data for ``fit_model``.
        learning_rate (float): learning rate for both training stages.
        fine_tune_at_layer (int): layer index forwarded to ``fine_tune_model``.
        test_dataset: data passed to ``model.evaluate``. Defaults to the
            notebook-level ``test_ds`` so existing calls behave as before.
    """
    # The evaluation data used to be read from the global `test_ds`
    # unconditionally; it can now be supplied by the caller.
    if test_dataset is None:
        test_dataset = test_ds

    model = compile_classifier(model, learning_rate)

    print("\n")
    print('Training and Validation: ')
    history = fit_model(model, train_generator,
                        valid_generator,
                        num_epochs=NUM_EPOCHS)

    print('Accuracy and Loss plots')
    plot_accuracy_and_loss(history)

    # evaluate the model before fine-tuning
    loss, accuracy = model.evaluate(test_dataset)
    print('Evaluation: ')
    print(f'Test accuracy : {accuracy}')
    print(f'Test loss : {loss}')

    print("\n")
    # fine-tune the model
    model_ft = fine_tune_model(model, learning_rate, optimizer='Adam',
                               fine_tune_at_layer=fine_tune_at_layer)

    print("\n")
    print('Fine-Tuned Training and Validation: ')
    history_fine = fit_model(model_ft, train_generator,
                             valid_generator,
                             num_epochs=NUM_EPOCHS)
    print('Fine-Tuned Accuracy and Loss plots')
    plot_accuracy_and_loss(history_fine)

    print("\n")

    # evaluate the fine-tuned model
    loss_ft, accuracy_ft = model_ft.evaluate(test_dataset)
    print('Fine-Tuned Evaluation: ')
    print(f'Fine-Tuned Test accuracy : {accuracy_ft}')
    print(f'Fine-Tuned Test loss : {loss_ft}')


In [None]:
# Phase II run on the Covid19 datasets; `learning_rate` is the value set in
# the Phase I run cell.
fine_tune_at_layer = 178
train_validate_test_classifier(
    model=new_model,
    train_generator=train_ds,
    valid_generator=validation_ds,
    learning_rate=learning_rate,
    fine_tune_at_layer=fine_tune_at_layer,
)