## Dataset Download

In [None]:
# Recursively copy gs://covidx-bucket/covidx-cxr2/ into the current working directory
# (-m lets gsutil run the transfers in parallel). Later cells read from ./covidx-cxr2/.
!gsutil -m cp -r gs://covidx-bucket/covidx-cxr2/ ./

## Model Variables

In [1]:
# Side length in pixels; every generator resizes images to (IMAGE_SIZE, IMAGE_SIZE).
IMAGE_SIZE = 224
# Batch size passed to every image generator.
BATCH_SIZE = 64
# Total number of training rows kept after class balancing (DATASET_SIZE // 2 per class).
DATASET_SIZE = 27000
# Upper bound on epochs per fold; the early-stopping callback may end a fit sooner.
MAX_EPOCHS = 20
# Number of stratified cross-validation folds.
K_FOLDS = 5

## Data Preparation

In [3]:
# import required libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
from sklearn.metrics import confusion_matrix, classification_report
from datetime import datetime
from packaging import version
%reload_ext tensorboard
import tensorboard
from sklearn.model_selection import StratifiedKFold
from tensorboard.plugins.hparams import api as hp
import time
from pathlib import Path

# Report library versions and visible GPUs so the run environment is recorded with the outputs.
print(f"Tensorflow Version: {tf.__version__}")
print(f"Tensorboard Version: {tensorboard.__version__}")
print(f"GPUs Found: {tf.config.list_physical_devices('GPU')}")

Tensorflow Version: 2.8.0
Tensorboard Version: 2.8.0
GPUs Found: []


In [14]:
# Hyper-parameter search space. Each HParam has a discrete domain; the training cell
# iterates over every combination of these values.
# Width of the third fully connected layer in create_model.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([512,1024]))
# Dropout rate applied after each fully connected layer in create_model.
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.2, 0.4]))
# Optimizer name; create_model accepts exactly 'adam' and 'sgd'.
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))
# Learning rate handed to the chosen optimizer.
HP_L_RATE= hp.HParam('learning_rate', hp.Discrete([0.0005, 0.001]))

# Name of the metric registered with the HParams plugin below.
METRIC_ACCURACY = 'accuracy'

# Register the hyper-parameters and the metric once, in the top-level hparam_tuning log directory.
with tf.summary.create_file_writer('gs://trained-classifiers/logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER, HP_L_RATE],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )

In [12]:
def _read_split_labels(labels_path):
    """Load one COVIDx label file and keep only the columns the notebook uses.

    The file is space-separated with no header row and four columns, which are
    named 'patient id', 'filename', 'class' and 'data source'. The patient id
    and data source columns are dropped because nothing downstream reads them.

    Returns a DataFrame with the columns 'filename' and 'class'.
    """
    labels = pd.read_csv(labels_path, sep=" ", header=None)
    labels.columns = ['patient id', 'filename', 'class', 'data source']
    return labels.drop(['patient id', 'data source'], axis=1)


# training labels
train_df = _read_split_labels('covidx-cxr2/train_COVIDx9B.txt')

# held-out test labels
test_df = _read_split_labels('covidx-cxr2/test_COVIDx9B.txt')

In [None]:
# Show the class balance of both splits as loaded from the label files.
for heading, frame in (("Train class counts:", train_df), ("\nTest class counts:", test_df)):
    print(heading)
    print(frame['class'].value_counts())

Train class counts:
positive    16490
negative    13992
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [6]:
from sklearn.utils import resample, shuffle

# Fixed seed so the balanced subset (and its row order) is the same on every run.
SAMPLING_SEED = 42
samples_per_class = DATASET_SIZE // 2

negative = train_df[train_df['class'] == 'negative']  # normal values in class column
positive = train_df[train_df['class'] == 'positive']  # COVID-19 values in class column

# Sampling is done WITHOUT replacement: drawing with replacement duplicates rows, and a
# duplicated image can then end up in both the training and the validation fold of the
# cross-validation split, which inflates the validation scores.
for class_name, class_rows in (('negative', negative), ('positive', positive)):
    if len(class_rows) < samples_per_class:
        raise ValueError(
            "DATASET_SIZE // 2 = %d exceeds the %d available %r rows; lower DATASET_SIZE"
            % (samples_per_class, len(class_rows), class_name)
        )

# downsample training data to DATASET_SIZE // 2 rows of each class, to reduce class bias
# and reduce training time
df_negative_downsampled = resample(negative, replace=False, n_samples=samples_per_class,
                                   random_state=SAMPLING_SEED)
df_positive_downsampled = resample(positive, replace=False, n_samples=samples_per_class,
                                   random_state=SAMPLING_SEED)

# concatenate, then shuffle so that the rows are in no particular class order
train_df = pd.concat([df_negative_downsampled, df_positive_downsampled])
train_df = shuffle(train_df, random_state=SAMPLING_SEED)

print("Train class counts:")
print(train_df['class'].value_counts())

Train class counts:
positive    750
negative    750
Name: class, dtype: int64


In [7]:
# Show the class balance again now that the training split has been resampled.
for heading, frame in (("Train class counts:", train_df), ("\nTest class counts:", test_df)):
    print(heading)
    print(frame['class'].value_counts())

Train class counts:
positive    750
negative    750
Name: class, dtype: int64

Test class counts:
positive    200
negative    200
Name: class, dtype: int64


In [8]:
# Generator for the held-out test images: no augmentation, and shuffle=False so that
# predictions come back in the same order as the rows of test_df.
test_datagen = ImageDataGenerator()

test_gen = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="covidx-cxr2/test",
    x_col='filename',
    y_col='class',
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    color_mode='rgb',
    class_mode='binary',
    shuffle=False,
)

Found 400 validated image filenames belonging to 2 classes.


In [9]:
def preprocess_images_cv(train_index, val_index):
    """Build the image generators for one cross-validation fold.

    Rows are taken positionally from the notebook-level ``train_df``. The
    training generator applies random augmentation (rotation, shifts, shear,
    zoom, horizontal flip); the validation generator reuses the notebook-level
    ``test_datagen`` and therefore applies none.

    Parameters
    ----------
    train_index, val_index
        Positional row indices into ``train_df`` for the training and
        validation parts of the fold.

    Returns
    -------
    tuple
        ``(train_gen, valid_gen)``, both reading images from "covidx-cxr2/train".
    """
    fold_train_rows = train_df.iloc[train_index]
    fold_valid_rows = train_df.iloc[val_index]

    augmenting_datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.1,
        horizontal_flip=True,
    )

    # Both generators read the same directory and columns; only the rows and the
    # augmentation differ.
    flow_options = dict(
        directory="covidx-cxr2/train",
        x_col='filename',
        y_col='class',
        target_size=(IMAGE_SIZE, IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        color_mode='rgb',
        class_mode='binary',
    )

    train_gen = augmenting_datagen.flow_from_dataframe(dataframe=fold_train_rows, **flow_options)
    valid_gen = test_datagen.flow_from_dataframe(dataframe=fold_valid_rows, **flow_options)
    return train_gen, valid_gen


## AlexNet

In [10]:
# required libraries for AlexNet
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization

In [11]:
def create_model(hparams):
    """Build and compile an AlexNet-style binary classifier.

    Parameters
    ----------
    hparams : dict
        Mapping keyed by the HParam objects HP_NUM_UNITS (width of the third
        fully connected layer), HP_DROPOUT (dropout rate after each fully
        connected layer), HP_OPTIMIZER ('adam' or 'sgd') and HP_L_RATE
        (optimizer learning rate).

    Returns
    -------
    A compiled ``Sequential`` model taking (IMAGE_SIZE, IMAGE_SIZE, 3) inputs and
    producing one sigmoid probability per image, trained with binary
    cross-entropy and reporting accuracy.

    Raises
    ------
    ValueError
        If the optimizer name is neither 'adam' nor 'sgd'.
    """
    #Instantiation
    AlexNet = Sequential()

    #1st Convolutional Layer
    AlexNet.add(Conv2D(filters=96, input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), kernel_size=(11, 11),
                       strides=(4, 4), padding='same'))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    #2nd Convolutional Layer
    AlexNet.add(Conv2D(filters=256, kernel_size=(5, 5), strides=(1, 1), padding='same'))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    #3rd Convolutional Layer
    AlexNet.add(Conv2D(filters=384, kernel_size=(3, 3), strides=(1, 1), padding='same'))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))

    #4th Convolutional Layer
    AlexNet.add(Conv2D(filters=384, kernel_size=(3, 3), strides=(1, 1), padding='same'))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))

    #5th Convolutional Layer
    AlexNet.add(Conv2D(filters=256, kernel_size=(3, 3), strides=(1, 1), padding='same'))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))

    #Passing it to a Fully Connected layer
    AlexNet.add(Flatten())
    # 1st Fully Connected Layer (its input size is inferred from the Flatten output, so no
    # input_shape is given here)
    AlexNet.add(Dense(4096))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    # Add Dropout to prevent overfitting
    AlexNet.add(Dropout(hparams[HP_DROPOUT]))

    #2nd Fully Connected Layer
    AlexNet.add(Dense(4096))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    #Add Dropout
    AlexNet.add(Dropout(hparams[HP_DROPOUT]))

    #3rd Fully Connected Layer
    AlexNet.add(Dense(hparams[HP_NUM_UNITS]))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    #Add Dropout
    AlexNet.add(Dropout(hparams[HP_DROPOUT]))

    #Output Layer
    # A single unit needs a sigmoid: softmax normalises across units, so with one unit it
    # always outputs 1.0 and the model would predict the same class for every image.
    AlexNet.add(Dense(1))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('sigmoid'))

    optimizer_name = hparams[HP_OPTIMIZER]
    learning_rate = hparams[HP_L_RATE]
    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        raise ValueError("unexpected optimizer name: %r" % (optimizer_name,))

    AlexNet.compile(optimizer=optimizer,
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

    return AlexNet


In [12]:
def get_model_name(k):
    """Return a run name for fold ``k``: "AlexNet-Binary-Fold-<k>-<YYYYmmdd-HHMMSS>".

    The timestamp is taken from the current local time on every call, so two
    calls made in different seconds return different names.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return f"AlexNet-Binary-Fold-{k}-{timestamp}"

In [13]:
# Perform hyper-parameter tuning with stratified K-fold cross-validation.
#
# For every combination of the HParam domains, K_FOLDS models are trained. Each entry
# appended to trained_models is [hparams, checkpoint paths per fold, elapsed seconds per fold].
Y = train_df['class']
trained_models = []

# Fixed seed so that every hyper-parameter combination is scored on the same folds.
CV_SEED = 42

for num_units in HP_NUM_UNITS.domain.values:
    for dropout_rate in HP_DROPOUT.domain.values:
        for optimizer in HP_OPTIMIZER.domain.values:
            for learning_rate in HP_L_RATE.domain.values:

                skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=CV_SEED)
                k_fold_models = []
                elapsed_times = []
                hparams = {
                    HP_NUM_UNITS: num_units,
                    HP_DROPOUT: dropout_rate,
                    HP_OPTIMIZER: optimizer,
                    HP_L_RATE: learning_rate
                }
                print('HPARAMS:', {h.name: hparams[h] for h in hparams})

                # Only the labels matter for a stratified split, hence the dummy feature array.
                for k_i, (train_index, val_index) in enumerate(skf.split(np.zeros(len(Y)), Y)):
                    print('--- Starting trial: %s' % k_i)
                    # Wall-clock timer: process_time() counts CPU seconds, not elapsed time.
                    t = time.perf_counter()
                    train_gen, valid_gen = preprocess_images_cv(train_index, val_index)

                    # Build the run name once so the log, hparam and checkpoint paths share
                    # the same timestamp.
                    model_name = get_model_name(k_i)
                    logdir = os.path.join('gs://trained-classifiers', 'logs', 'fit', model_name)
                    hpdir = os.path.join('gs://trained-classifiers', 'logs', 'hparam_tuning', model_name)
                    # NOTE(review): confirm the HDF5 checkpoint writer can target a gs:// URI.
                    modeldir = os.path.join('gs://trained-classifiers', 'trained classifiers',
                                            model_name + ".h5")
                    callbacks = [
                        keras.callbacks.ModelCheckpoint(modeldir, save_best_only=True, verbose=0),
                        keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', verbose=1,
                                                      restore_best_weights=True),
                        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1,
                                                          verbose=1),
                        keras.callbacks.TensorBoard(log_dir=logdir)
                    ]

                    model = create_model(hparams)
                    history = model.fit(train_gen, validation_data=valid_gen, epochs=MAX_EPOCHS,
                                        callbacks=callbacks)
                    elapsed_times.append(time.perf_counter() - t)
                    k_fold_models.append(modeldir)

                    # Report the validation accuracy of the epoch with the lowest validation
                    # loss, i.e. the epoch whose weights were checkpointed and restored.
                    best_epoch = int(np.argmin(history.history['val_loss']))
                    best_val_accuracy = history.history['val_accuracy'][best_epoch]
                    # hp.hparams() only records anything inside a default summary writer.
                    with tf.summary.create_file_writer(hpdir).as_default():
                        hp.hparams(hparams)  # record the values used in this trial
                        tf.summary.scalar(METRIC_ACCURACY, best_val_accuracy, step=1)

                    # Free the graph and generators before the next fold to limit memory growth.
                    del model, train_gen, valid_gen
                    keras.backend.clear_session()

                trained_models.append([hparams, k_fold_models, elapsed_times])

HPARAMS: {'num_units': 512, 'dropout': 0.2, 'optimizer': 'adam', 'learning_rate': 0.0005}
--- Starting trial: 0
Found 750 non-validated image filenames belonging to 2 classes.
Found 750 non-validated image filenames belonging to 2 classes.


2022-03-04 14:49:28.832882: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 205520896 exceeds 10% of free system memory.
2022-03-04 14:49:29.020817: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 205520896 exceeds 10% of free system memory.
2022-03-04 14:49:29.104531: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 205520896 exceeds 10% of free system memory.


Epoch 1/20


2022-03-04 14:49:32.309261: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 205520896 exceeds 10% of free system memory.
2022-03-04 14:49:32.444598: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 205520896 exceeds 10% of free system memory.




KeyboardInterrupt: 

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statistics import mean

In [None]:
# Evaluate every hyper-parameter combination on the held-out test set and append one
# summary row per combination to the results CSV.
RESULTS_CSV = 'gs://trained-classifiers/results.csv'

# Ground-truth label indices in test_df row order (test_gen was built with shuffle=False).
# They do not depend on the model, so they are computed once.
y_test = [test_gen.class_indices[label] for label in test_df['class'].values.tolist()]

for hparams, fold_model_paths, fold_times in trained_models:
    if not fold_model_paths:
        # Nothing was trained for this combination; mean() of an empty list would raise.
        continue

    model = create_model(hparams)
    accuracies = []
    k_accuracies = []
    precisions = []
    recalls = []
    f1s = []
    for weights_path in fold_model_paths:
        model.load_weights(weights_path)
        test_pred = model.predict(test_gen)
        # Round each predicted probability to the nearest class index (0 or 1).
        y_pred = np.rint(test_pred).flatten()
        k_accuracies.append(model.evaluate(test_gen)[1])
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred, zero_division=0))
        f1s.append(f1_score(y_test, y_pred, zero_division=0))

    hdict = {h.name: hparams[h] for h in hparams}
    print("SCORE FOR HPARAMS:", hdict)
    print("Mean accuracy:", mean(accuracies))
    print("Mean k_accuracy;", mean(k_accuracies))
    print("Mean precision:", mean(precisions))
    print("Mean recall:", mean(recalls))
    print("Mean f1:", mean(f1s))
    print("Mean training time:", mean(fold_times))

    # Build the row from a dict so each column keeps its own dtype (a mixed np.array
    # would coerce every value to a string).
    df = pd.DataFrame([{
        'Model Names': str(fold_model_paths),
        'num_units': hdict.get('num_units'),
        'dropout': hdict.get('dropout'),
        'optimizer': hdict.get('optimizer'),
        'learning_rate': hdict.get('learning_rate'),
        'IMAGE_SIZE': IMAGE_SIZE,
        'BATCH_SIZE': BATCH_SIZE,
        'DATASET_SIZE': DATASET_SIZE,
        'MAX_EPOCHS': MAX_EPOCHS,
        'K_FOLDS': K_FOLDS,
        'Accuracy': mean(accuracies),
        'K_accuracy': mean(k_accuracies),
        'Precision': mean(precisions),
        'Recall': mean(recalls),
        'f1 Score': mean(f1s),
        'Training Time': mean(fold_times),
    }])

    # The builtin open() cannot open gs:// URIs, so go through TensorFlow's file API.
    # The header is written only when the file does not exist yet.
    write_header = not tf.io.gfile.exists(RESULTS_CSV)
    with tf.io.gfile.GFile(RESULTS_CSV, 'a') as f:
        f.write(df.to_csv(header=write_header, index=False))

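NOTE: Accuracy and loss graphs can be viewed in TensorBoard. The training cell writes its logs under gs://trained-classifiers/logs, so run "%tensorboard --logdir gs://trained-classifiers/logs --host 0.0.0.0" to open it.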