## Dataset Download

In [1]:
#!gsutil -m cp -r gs://covidx-bucket/covidx-cxr2/ ./

## Model Variables

In [2]:
# Side length in pixels that images are resized to; also the model's input height/width.
IMAGE_SIZE = 224
# Batch size handed to every flow_from_dataframe generator (and to the TensorBoard callback).
BATCH_SIZE = 32
# Total number of training rows kept after resampling (DATASET_SIZE // 2 per class).
DATASET_SIZE = 1500
# Upper bound on epochs per fold; the EarlyStopping callback may end a fold sooner.
MAX_EPOCHS = 20
# Number of stratified cross-validation folds.
K_FOLDS = 5

## Data Preparation

In [3]:
# import required libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
from sklearn.metrics import confusion_matrix, classification_report
from datetime import datetime
from packaging import version
%reload_ext tensorboard
import tensorboard
from sklearn.model_selection import StratifiedKFold
from tensorboard.plugins.hparams import api as hp
import time
import gc

# enable dynamic memory allocation: allow_growth is set on the GPU options and the
# resulting config is used to open an InteractiveSession.
# NOTE(review): the training cell calls keras.backend.clear_session() after every fold
# while this session object stays alive; the "is not an element of this graph" TypeError
# recorded in the training output may be related to that combination -- confirm.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# tensorflow-gpu check: print library versions and whether TensorFlow reports a GPU
print("Tensorflow Version:", tf.__version__)
print("Tensorboard Version:", tensorboard.__version__)
print("GPU Found:", tf.test.is_gpu_available())

Tensorflow Version: 1.15.5
Tensorboard Version: 1.15.0
GPU Found: True


In [4]:
# Hyperparameter search space. Units, dropout and learning rate each have a single
# candidate value; only the optimizer varies, so the grid has two combinations.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([128]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))
HP_L_RATE= hp.HParam('learning_rate', hp.Discrete([0.001]))

# Name of the metric registered with the HParams plugin below.
METRIC_ACCURACY = 'accuracy'

# for tf2 use tf.summary.create_file_writer().as_default()

# Register the hyperparameters and the metric for the HParams dashboard.
# NOTE(review): hp.hparams_config is called inside a TF1 FileWriter context here; whether
# the config is actually written through this writer under TF 1.15 is not evident from
# this notebook -- confirm in TensorBoard's HParams tab.
with tf.compat.v1.summary.FileWriter('trained classifiers/logs/hparam_tuning'):
    hp.hparams_config(
        hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER, HP_L_RATE],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )

In [5]:
def _load_split_listing(listing_path):
    """Read one space-separated split listing and keep only filename and class.

    The listing file has no header row, so the four columns are named here
    before the two that are not needed downstream are dropped.
    """
    listing = pd.read_csv(listing_path, sep=" ", header=None)
    listing.columns = ['patient id', 'filename', 'class', 'data source']
    return listing.drop(columns=['patient id', 'data source'])


# train and test listings share the same layout, so both go through the same helper
train_df = _load_split_listing('covidx-cxr2/train_COVIDx9B.txt')
test_df = _load_split_listing('covidx-cxr2/test_COVIDx9B.txt')

In [6]:
# report how many rows each class contributes to the train and test listings
for heading, split_df in (("Train class counts:", train_df),
                          ("\nTest class counts:", test_df)):
    print(heading)
    print(split_df['class'].value_counts())

Train class counts:
positive    16490
negative    13992
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [7]:
from sklearn.utils import resample
from sklearn.utils import shuffle

# split the training listing by label
negative = train_df[train_df['class'] == 'negative']   # normal values in class column
positive = train_df[train_df['class'] == 'positive']   # COVID-19 values in class column

# Downsample to the same number of rows per class (DATASET_SIZE // 2 each, capped at the
# rows actually available) to reduce class bias and training time.
# replace=False: every image is drawn at most once. Sampling with replacement would let
# the same image appear several times, and the later K-fold split could then place copies
# of one image in both the training and the validation part of a fold.
samples_per_class = min(DATASET_SIZE // 2, len(negative), len(positive))
df_negative_downsampled = resample(negative, replace=False, n_samples=samples_per_class)
df_positive_downsampled = resample(positive, replace=False, n_samples=samples_per_class)

# concatenate the two balanced halves
train_df = pd.concat([df_negative_downsampled, df_positive_downsampled])

# shuffle so that the rows are in no particular class order
train_df = shuffle(train_df)
print("Train class counts:")
print(train_df['class'].value_counts())

Train class counts:
positive    750
negative    750
Name: class, dtype: int64


In [8]:
# class balance of both splits after the training listing has been resampled
for heading, split_df in [("Train class counts:", train_df),
                          ("\nTest class counts:", test_df)]:
    class_counts = split_df['class'].value_counts()
    print(heading)
    print(class_counts)

Train class counts:
positive    750
negative    750
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [9]:
# Test images only receive the Xception input preprocessing; no augmentation is applied.
test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.xception.preprocess_input
)

# shuffle=False: the evaluation cells compare predictions against test_df['class']
# row by row, which relies on the generator not reordering the samples.
test_gen = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="covidx-cxr2/test",
    x_col='filename',
    y_col='class',
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    color_mode='rgb',
    class_mode='binary',
    shuffle=False,
)

Found 400 validated image filenames belonging to 2 classes.


In [10]:
def preprocess_images_cv(train_index, val_index):
    """Build the image generators for one cross-validation fold.

    Args:
        train_index: positional row indices of ``train_df`` used for training.
        val_index: positional row indices of ``train_df`` used for validation.

    Returns:
        ``(train_gen, valid_gen)``. The training generator applies random
        augmentation; the validation generator reuses the module-level
        ``test_datagen`` and therefore only applies the Xception preprocessing.
    """
    fold_train_rows = train_df.iloc[train_index]
    fold_valid_rows = train_df.iloc[val_index]

    # augmentation is applied to the training part of the fold only
    augmenting_datagen = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.xception.preprocess_input,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
    )

    # both generators read the files named in the dataframe from the training directory
    flow_options = dict(
        directory="covidx-cxr2/train",
        x_col='filename',
        y_col='class',
        target_size=(IMAGE_SIZE, IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        color_mode='rgb',
        class_mode='binary',
    )
    train_gen = augmenting_datagen.flow_from_dataframe(dataframe=fold_train_rows, **flow_options)
    valid_gen = test_datagen.flow_from_dataframe(dataframe=fold_valid_rows, **flow_options)
    return train_gen, valid_gen


## Xception

In [11]:
# required libraries for the Xception model
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [12]:
def create_model(hparams):
    """Build and compile the Xception-based binary classifier.

    Args:
        hparams: mapping keyed by HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER and
            HP_L_RATE that supplies the dense width, dropout rate, optimizer
            name ("adam" or "sgd") and learning rate.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and the accuracy metric.

    Raises:
        ValueError: if the optimizer name is neither "adam" nor "sgd".
    """
    # ImageNet-pretrained convolutional base without its classification top
    backbone = keras.applications.Xception(
        weights='imagenet',
        input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
        include_top=False,
    )

    # classification head stacked on top of the pretrained base
    pooling = keras.layers.GlobalAveragePooling2D()
    hidden = keras.layers.Dense(
        hparams[HP_NUM_UNITS],
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
        bias_regularizer=regularizers.l2(1e-4),
        activity_regularizer=regularizers.l2(1e-5),
    )
    normalization = keras.layers.BatchNormalization()
    dropout = keras.layers.Dropout(hparams[HP_DROPOUT])
    output = keras.layers.Dense(1, activation='sigmoid')
    model = keras.Sequential([backbone, pooling, hidden, normalization, dropout, output])

    optimizer_name = hparams[HP_OPTIMIZER]
    learning_rate = hparams[HP_L_RATE]
    if optimizer_name != "adam" and optimizer_name != "sgd":
        raise ValueError("unexpected optimizer name: %r" % (optimizer_name,))
    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [13]:
def get_model_name(k):
    """Return a run name for fold ``k`` that ends with the current timestamp.

    The result has the form ``Xception-Binary-Fold-<k>-<YYYYmmdd-HHMMSS>``, so
    two calls made in different seconds produce different names.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return "-".join(["Xception-Binary", "Fold", str(k), timestamp])

In [14]:
# perform HP tuning and CV on the model on the data
Y = train_df['class']
trained_models = []

# clear resources before beginning training
#keras.backend.clear_session()
gc.collect()

for num_units in HP_NUM_UNITS.domain.values:
    for dropout_rate in HP_DROPOUT.domain.values:
        for optimizer in HP_OPTIMIZER.domain.values:
            for learning_rate in HP_L_RATE.domain.values:
                
                skf = StratifiedKFold(n_splits = K_FOLDS, shuffle = True) 
                k_i = 0
                k_fold_models = []
                elapsed_times = []
                hparams = {
                    HP_NUM_UNITS: num_units,
                    HP_DROPOUT: dropout_rate,
                    HP_OPTIMIZER: optimizer,
                    HP_L_RATE: learning_rate
                }
                print('HPARAMS:',{h.name: hparams[h] for h in hparams})
                
                for train_index, val_index in skf.split(np.zeros(len(Y)),Y):
                    print('--- Starting trial: %s' % k_i)
                    t = time.process_time()
                    train_gen, valid_gen = preprocess_images_cv(train_index, val_index)

                    logdir = os.path.join('trained classifiers','logs','fit', get_model_name(k_i))
                    hpdir = os.path.join('trained classifiers','logs','hparam_tuning', get_model_name(k_i))
                    modeldir = os.path.join('trained classifiers',str(get_model_name(k_i)+".h5"))
                    callbacks = [
                        keras.callbacks.ModelCheckpoint(modeldir, save_best_only=True, verbose = 0),
                        keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', verbose=1, restore_best_weights=True),
                        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1),
                        keras.callbacks.TensorBoard(log_dir=logdir, batch_size=BATCH_SIZE)
                      ]
                   
                    hp.hparams(hparams)  # record the values used in this trial
                    model = create_model(hparams)
                    model.fit(train_gen, validation_data=valid_gen, epochs=MAX_EPOCHS, callbacks=callbacks)
                    elapsed_times.append(time.process_time() - t)
                    k_fold_models.append(modeldir)
                    
                    # clear resources for each fold
                    del model, train_gen, valid_gen
                    keras.backend.clear_session()
                    k_i += 1
                    
                trained_models.append([hparams, k_fold_models, elapsed_times])

HPARAMS: {'num_units': 128, 'dropout': 0.1, 'optimizer': 'adam', 'learning_rate': 0.001}
--- Starting trial: 0
Found 1200 validated image filenames belonging to 2 classes.
Found 300 validated image filenames belonging to 2 classes.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/20
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00013: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 14/20
Epoch 00014: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 15/20

Epoch 00015: ReduceLRO

TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("Placeholder:0", shape=(?, ?, ?, ?), dtype=float32) is not an element of this graph.

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from statistics import mean

In [None]:
# Evaluate every hyperparameter combination on the held-out test set: each fold's
# checkpoint is loaded in turn and the per-fold metrics are averaged. This cell prints
# the averages and builds a one-row summary DataFrame `df`; it does not write a file.

# True labels in test_df row order, encoded with the generator's class indices.
# They do not depend on the model, so they are computed once.
y_test = [test_gen.class_indices[k] for k in test_df['class'].values.tolist()]

# Column names of the summary row (kept exactly as in the existing results layout,
# including the leading space in ' False Positives').
summary_columns = ['Model Names','num_units', 'dropout','optimizer','learning_rate',
                   'IMAGE_SIZE', 'BATCH_SIZE', 'DATASET_SIZE', 'MAX_EPOCHS',
                   'K_FOLDS','Accuracy', 'K_accuracy', 'Precision', 'Recall', 'f1 Score',
                   'True Negatives', ' False Positives', 'False Negatives', 'True Positives', 'Training Time']

for j in trained_models:
    # j is [hparams, checkpoint paths per fold, training time per fold]
    model = create_model(j[0])
    accuracies = []
    k_accuracies = []
    precisions = []
    recalls = []
    f1s = []
    tp = []
    fp = []
    tn = []
    fn = []
    for i in j[1]:
        model.load_weights(i)
        test_pred = model.predict(test_gen)
        # round the sigmoid outputs to hard 0/1 predictions
        y_pred = np.rint(test_pred).flatten()
        k_accuracies.append(model.evaluate(test_gen)[1])
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))
        tn_x, fp_x, fn_x, tp_x = confusion_matrix(y_test, y_pred).ravel()
        tp.append(tp_x)
        fp.append(fp_x)
        tn.append(tn_x)
        fn.append(fn_x)

    # np.mean is used for every average: statistics.mean converts its result back to the
    # input type, so the NumPy integer counts from confusion_matrix would lose their
    # fractional part.
    mean_accuracy = float(np.mean(accuracies))
    mean_k_accuracy = float(np.mean(k_accuracies))
    mean_precision = float(np.mean(precisions))
    mean_recall = float(np.mean(recalls))
    mean_f1 = float(np.mean(f1s))
    mean_training_time = float(np.mean(j[2]))

    hdict = {h.name: j[0][h] for h in j[0]}
    print("SCORE FOR HPARAMS:", hdict)
    print("Mean accuracy:", mean_accuracy)
    print("Mean k_accuracy;", mean_k_accuracy)
    print("Mean precision:", mean_precision)
    print("Mean recall:", mean_recall)
    print("Mean f1:", mean_f1)
    print("Mean training time:", mean_training_time)

    # A plain row list keeps numeric columns numeric; wrapping mixed values in np.array
    # would coerce every cell to a string.
    summary_row = [str(j[1]),
                   hdict.get('num_units'), hdict.get('dropout'), hdict.get('optimizer'),
                   hdict.get('learning_rate'), IMAGE_SIZE, BATCH_SIZE, DATASET_SIZE, MAX_EPOCHS, K_FOLDS,
                   mean_accuracy, mean_k_accuracy, mean_precision, mean_recall, mean_f1,
                   float(np.mean(tn)), float(np.mean(fp)), float(np.mean(fn)), float(np.mean(tp)),
                   mean_training_time]
    df = pd.DataFrame([summary_row], columns=summary_columns)
    

In [None]:
# Evaluate every hyperparameter combination on the held-out test set: each fold's
# checkpoint is loaded in turn, the per-fold metrics are averaged, and one summary row
# per combination is appended to 'results/Xception-improved-v2.csv'.

# True labels in test_df row order, encoded with the generator's class indices.
# They do not depend on the model, so they are computed once.
y_test = [test_gen.class_indices[k] for k in test_df['class'].values.tolist()]

# Column names of the summary row (kept exactly as in the existing results layout,
# including the leading space in ' False Positives', so appended rows match the header).
summary_columns = ['Model Names','num_units', 'dropout','optimizer','learning_rate',
                   'IMAGE_SIZE', 'BATCH_SIZE', 'DATASET_SIZE', 'MAX_EPOCHS',
                   'K_FOLDS','Accuracy', 'K_accuracy', 'Precision', 'Recall', 'f1 Score',
                   'True Negatives', ' False Positives', 'False Negatives', 'True Positives', 'Training Time']

for j in trained_models:
    # j is [hparams, checkpoint paths per fold, training time per fold]
    model = create_model(j[0])
    accuracies = []
    k_accuracies = []
    precisions = []
    recalls = []
    f1s = []
    tp = []
    fp = []
    tn = []
    fn = []
    for i in j[1]:
        model.load_weights(i)
        test_pred = model.predict(test_gen)
        # round the sigmoid outputs to hard 0/1 predictions
        y_pred = np.rint(test_pred).flatten()
        k_accuracies.append(model.evaluate(test_gen)[1])
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))
        tn_x, fp_x, fn_x, tp_x = confusion_matrix(y_test, y_pred).ravel()
        tp.append(tp_x)
        fp.append(fp_x)
        tn.append(tn_x)
        fn.append(fn_x)

    # np.mean is used for every average: statistics.mean converts its result back to the
    # input type, so the NumPy integer counts from confusion_matrix would lose their
    # fractional part.
    mean_accuracy = float(np.mean(accuracies))
    mean_k_accuracy = float(np.mean(k_accuracies))
    mean_precision = float(np.mean(precisions))
    mean_recall = float(np.mean(recalls))
    mean_f1 = float(np.mean(f1s))
    mean_training_time = float(np.mean(j[2]))

    hdict = {h.name: j[0][h] for h in j[0]}
    print("SCORE FOR HPARAMS:", hdict)
    print("Mean accuracy:", mean_accuracy)
    print("Mean k_accuracy;", mean_k_accuracy)
    print("Mean precision:", mean_precision)
    print("Mean recall:", mean_recall)
    print("Mean f1:", mean_f1)
    print("Mean training time:", mean_training_time)

    # A plain row list keeps numeric columns numeric; wrapping mixed values in np.array
    # would coerce every cell to a string.
    summary_row = [str(j[1]),
                   hdict.get('num_units'), hdict.get('dropout'), hdict.get('optimizer'),
                   hdict.get('learning_rate'), IMAGE_SIZE, BATCH_SIZE, DATASET_SIZE, MAX_EPOCHS, K_FOLDS,
                   mean_accuracy, mean_k_accuracy, mean_precision, mean_recall, mean_f1,
                   float(np.mean(tn)), float(np.mean(fp)), float(np.mean(fn)), float(np.mean(tp)),
                   mean_training_time]
    df = pd.DataFrame([summary_row], columns=summary_columns)

    # open() in append mode does not create missing directories
    os.makedirs('results', exist_ok=True)
    with open('results/Xception-improved-v2.csv', 'a') as f:
        # write the header only when the file is still empty
        df.to_csv(f, header=f.tell()==0, index = False)

NOTE: Accuracy and loss graphs can be viewed in TensorBoard. The training cell writes its logs under "trained classifiers/logs", so run `%tensorboard --logdir "trained classifiers/logs" --host 0.0.0.0` to open it.

## Save Weights and Results to Bucket

In [None]:
#!gsutil -m cp -r trained-classifiers gs://trained-classifiers/

In [None]:
#!gsutil -m cp -r model-checkpoints gs://trained-classifiers/