## Model Variables

In [3]:
IMAGE_SIZE = 224
BATCH_SIZE = 32
DATASET_SIZE = 1500
MAX_EPOCHS = 20
K_FOLDS = 5
DATASET_PATH = 'covidx-cxr2'

## Data Preparation

In [4]:
# import required libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
from sklearn.metrics import confusion_matrix, classification_report
from datetime import datetime
from packaging import version
%reload_ext tensorboard
import tensorboard
from sklearn.model_selection import StratifiedKFold
from tensorboard.plugins.hparams import api as hp
import time

# tensorflow-gpu check
print("Tensorflow Version:", tf.__version__)
print("Tensorboard Version:", tensorboard.__version__)
print("GPU Found:", tf.test.is_gpu_available())

Tensorflow Version: 2.8.0
Tensorboard Version: 2.8.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Found: True


In [5]:
# set up hyper-parameters
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([128]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam']))
HP_L_RATE= hp.HParam('learning_rate', hp.Discrete([0.001]))

METRIC_ACCURACY = 'accuracy'

with tf.summary.create_file_writer('trained-classifiers/logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER, HP_L_RATE],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )

In [6]:
# read in train data
train_df = pd.read_csv(DATASET_PATH+'/train_COVIDx9B.txt', 
                       sep=" ", header=None)
# add columns to go from 0, 1, 2, 3 to patient id, filename, class etc
train_df.columns=['patient id', 'filename', 'class', 'data source']

# drop patient id and datasource as not needed
train_df=train_df.drop(['patient id', 'data source'], axis=1 )

# read in test data
test_df = pd.read_csv(DATASET_PATH+'/test_COVIDx9B.txt', 
                      sep=" ", header=None)
# add columns to go from 0, 1, 2, 3 to patient id, filename, class etc
test_df.columns=['patient id', 'filename', 'class', 'data source']

# drop patient id and datasource as not needed
test_df=test_df.drop(['patient id', 'data source'], axis=1 )

In [7]:
print("Train class counts:")
print(train_df['class'].value_counts())
print("\nTest class counts:")
print(test_df['class'].value_counts())

Train class counts:
positive    16490
negative    13992
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [8]:
negative  = train_df[train_df['class']=='negative']   # normal values in class column
positive = train_df[train_df['class']=='positive']  # COVID-19 values in class column

from sklearn.utils import resample
# downsample training data to equal values of each class, to reduce class bias and reduce training time

df_negative_downsampled = resample(negative, replace = True, n_samples = DATASET_SIZE//2)
df_positive_downsampled = resample(positive, replace = True, n_samples = DATASET_SIZE//2) 

#concatenate
train_df = pd.concat([df_negative_downsampled, df_positive_downsampled])

from sklearn.utils import shuffle
train_df = shuffle(train_df) # shuffling so that there is no particular sequence
print("Train class counts:")
print(train_df['class'].value_counts())

Train class counts:
positive    750
negative    750
Name: class, dtype: int64


In [9]:
print("Train class counts:")
print(train_df['class'].value_counts())
print("\nTest class counts:")
print(test_df['class'].value_counts())

Train class counts:
positive    750
negative    750
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [10]:
# preprocess images
test_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.xception.preprocess_input)

test_gen = test_datagen.flow_from_dataframe(dataframe = test_df, directory=DATASET_PATH+'/test', x_col='filename',
                                            y_col='class', target_size=(IMAGE_SIZE, IMAGE_SIZE), batch_size=BATCH_SIZE, 
                                            color_mode='rgb', class_mode='binary', shuffle=False)

Found 400 validated image filenames belonging to 2 classes.


In [11]:
def preprocess_images_cv(train_index, val_index):
    # find the section of the training data that the fold is located
    training_data = train_df.iloc[train_index]
    validation_data = train_df.iloc[val_index]
    train_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.xception.preprocess_input,
                                       rotation_range = 20, width_shift_range = 0.2, height_shift_range = 0.2, 
                                       shear_range = 0.2, zoom_range = 0.1, horizontal_flip = True, 
                                       vertical_flip = True)

    # now get the images from directory with augmentation
    
    train_gen = train_datagen.flow_from_dataframe(dataframe = training_data, directory=DATASET_PATH+'/train', 
                                                  x_col='filename', y_col='class', target_size=(IMAGE_SIZE, IMAGE_SIZE), 
                                                  batch_size=BATCH_SIZE, color_mode='rgb', class_mode='binary')
    valid_gen = test_datagen.flow_from_dataframe(dataframe = validation_data, directory=DATASET_PATH+'/train', 
                                                 x_col='filename', y_col='class', target_size=(IMAGE_SIZE, IMAGE_SIZE), 
                                                 batch_size=BATCH_SIZE,  color_mode='rgb', class_mode='binary')
    return train_gen, valid_gen

## Xception

In [12]:
# required libraries for Xception
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [13]:
def create_model(hparams):
    # create the base pre-trained model
    base_model = keras.applications.Xception(weights='imagenet', input_shape = (IMAGE_SIZE,IMAGE_SIZE,3),
                                                  include_top=False)
    # add some minor imtpovements to prevent overfitting and add output layer 
    model = keras.Sequential([base_model,
                              keras.layers.Conv2D(1024,1, padding='same'),
                              keras.layers.Flatten(),
                              keras.layers.Dropout(hparams[HP_DROPOUT]),
                              keras.layers.Dense(hparams[HP_NUM_UNITS], activation='relu',
                                                 kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
                                                 bias_regularizer=regularizers.l2(1e-4),
                                                 activity_regularizer=regularizers.l2(1e-5)
                                                 ),
                              keras.layers.Dense(1, activation='sigmoid')
                              ])
    # compile model
    optimizer_name = hparams[HP_OPTIMIZER]
    learning_rate = hparams[HP_L_RATE]
    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        raise ValueError("unexpected optimizer name: %r" % (optimizer_name,))


    model.compile(optimizer = optimizer,
                  loss = 'binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [14]:
def get_model_name(k):
    return str("Xception-v5-Binary-" + "Fold-"+str(k)+"-"+datetime.now().strftime("%Y%m%d-%H%M%S"))

In [15]:
# perform HP tuning and CV on the model on the data
Y = train_df['class']
trained_models = []

# iterate through hyper-parameters
for num_units in HP_NUM_UNITS.domain.values:
    for dropout_rate in HP_DROPOUT.domain.values:
        for optimizer in HP_OPTIMIZER.domain.values:
            for learning_rate in HP_L_RATE.domain.values:
                # instantiate k folds and hyper-parameters for this iteration
                skf = StratifiedKFold(n_splits = K_FOLDS, shuffle = True) 
                k_i = 0
                k_fold_models = []
                elapsed_times = []
                hparams = {
                    HP_NUM_UNITS: num_units,
                    HP_DROPOUT: dropout_rate,
                    HP_OPTIMIZER: optimizer,
                    HP_L_RATE: learning_rate
                }
                print('HPARAMS:',{h.name: hparams[h] for h in hparams})
                
                # iterate through k folds
                for train_index, val_index in skf.split(np.zeros(len(Y)),Y):
                    print('--- Starting trial: %s' % k_i)
                    t = time.process_time() # begin timing model
                    train_gen, valid_gen = preprocess_images_cv(train_index, val_index) # fetch dataset for this fold
                    
                    # set up logging directories and callbacks
                    logdir = os.path.join('trained-classifiers','logs','fit', get_model_name(k_i))
                    hpdir = os.path.join('trained-classifiers','logs','hparam_tuning', get_model_name(k_i))
                    modeldir = os.path.join('model-checkpoints',str(get_model_name(k_i)+".h5"))
                    callbacks = [
                        keras.callbacks.ModelCheckpoint(filepath=modeldir, save_best_only=True, verbose = 0),
                        keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', verbose=1, restore_best_weights=True),
                        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1),
                        keras.callbacks.TensorBoard(log_dir=logdir)
                      ]
                    
                    # create the model
                    hp.hparams(hparams)  
                    model = create_model(hparams)
                    
                    # train model
                    model.fit(train_gen, validation_data=valid_gen, epochs=MAX_EPOCHS, callbacks=callbacks)
                    
                    # record the values used in this trial
                    elapsed_times.append(time.process_time() - t)
                    k_fold_models.append(modeldir)
                    
                    # clean memory
                    del model, train_gen, valid_gen
                    keras.backend.clear_session()
                    k_i += 1
                    
                trained_models.append([hparams, k_fold_models, elapsed_times])

HPARAMS: {'num_units': 128, 'dropout': 0.1, 'optimizer': 'adam', 'learning_rate': 0.001}
--- Starting trial: 0
Found 1200 validated image filenames belonging to 2 classes.
Found 300 validated image filenames belonging to 2 classes.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/20
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 15: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 18: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 19/20
Epoch 19: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 20/20

Epoch 20: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.


## Evaluation

In [16]:
# import libraries for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from statistics import mean

In [17]:
# iterate through hyper-parameter configurations
for j in trained_models:
    model = create_model(j[0])
    accuracies = []
    k_accuracies = []
    precisions = []
    recalls = []
    f1s = []
    tp = []
    fp = []
    tn = []
    fn = []
    
    # iterate through each folds model
    for i in j[1]:
        model.load_weights(i)
        test_pred = model.predict(test_gen)
        y_pred = np.rint(test_pred).flatten()
        y_test= [test_gen.class_indices[k] for k in test_df['class'].values.tolist()]
        k_accuracies.append(model.evaluate(test_gen)[1])
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))
        tn_x, fp_x, fn_x, tp_x = confusion_matrix(y_test, y_pred).ravel()
        tp.append(tp_x)
        fp.append(fp_x)
        tn.append(tn_x)
        fn.append(fn_x)

    print("SCORE FOR HPARAMS:", {h.name: j[0][h] for h in j[0]})
    print("Mean accuracy:", mean(accuracies))
    print("Mean k_accuracy;", mean(k_accuracies))
    print("Mean precision:", mean(precisions))
    print("Mean recall:", mean(recalls))
    print("Mean f1:", mean(f1s))
    print("Mean training time:", mean(j[2]))
    
    # retreive hyper-parameters
    hdict = {h.name: j[0][h] for h in j[0]}
    
    # save results to file
    df =  pd.DataFrame(np.array([[str(j[1]),
                                  hdict.get('num_units'), hdict.get('dropout'), hdict.get('optimizer'), 
                                  hdict.get('learning_rate'),IMAGE_SIZE, BATCH_SIZE, DATASET_SIZE, MAX_EPOCHS, K_FOLDS,
                                  mean(accuracies), mean(k_accuracies), mean(precisions), mean(recalls), mean(f1s),
                                  mean(tn), mean(fp), mean(fn), mean(tp), mean(j[2])
                                 ]]),
                       columns=['Model Names','num_units', 'dropout','optimizer','learning_rate',
                                'IMAGE_SIZE', 'BATCH_SIZE', 'DATASET_SIZE', 'MAX_EPOCHS',
                                'K_FOLDS','Accuracy', 'K_accuracy', 'Precision', 'Recall', 'f1 Score', 
                                'True Negatives', ' False Positives', 'False Negatives', 'True Positives', 'Training Time'])
    with open('trained-classifiers/Xception.csv', 'a') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index = False)

SCORE FOR HPARAMS: {'num_units': 128, 'dropout': 0.1, 'optimizer': 'adam', 'learning_rate': 0.001}
Mean accuracy: 0.6715
Mean k_accuracy; 0.6714999914169312
Mean precision: 0.6826310288188742
Mean recall: 0.559
Mean f1: 0.5161361528373547
Mean training time: 922.4690477818


NOTE: Accuracy and loss graphs can be seen on TensorBoard, run "%tensorboard --logdir logs --host 0.0.0.0" to open