In [27]:
import sys
import sklearn
import os
import shutil
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import random as python_random
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

np.random.seed(42) # note that you must use the same seed to ensure consistentcy in your training/validation/testing
tf.random.set_seed(42)

# RESNET Round 3, Part 2
This model has already been trained for x-ray data, now we retrain on data that has been upsapmpled (for small classes) and downsampled (for large classes) to balance classes.

I've decided to get 5,000 records of each

## Upsample the data

In [12]:
# if the data above isn't run, run this

data_path = "../dl_data/"
class_names = os.listdir(data_path)
class_dist = {} # get the originial distribution of each class
f_names = {} # get list of file paths per class
for c in class_names:
    class_dist[c] = len(os.listdir(data_path + c))
    f_names[c] = os.listdir(data_path + c)
class_dist

{'Covid_img': 3250, 'Viral_img': 1211, 'Normal_img': 9174}

In [15]:
f_names['Covid_img'][:10]

['COVID-2402.png',
 'COVID-1270.png',
 'COVID-3070.png',
 'COVID-2019.png',
 'COVID-2463.png',
 'COVID-396.png',
 'COVID-3605.png',
 'COVID-256.png',
 'COVID-1215.png',
 'COVID-1649.png']

In [19]:
np.random.choice(f_names['Covid_img'], size=20, replace=True, p=None)

array(['COVID-2580.png', 'COVID-3026.png', 'COVID-118.png',
       'COVID-1079.png', 'COVID-1944.png', 'COVID-3587.png',
       'COVID-2915.png', 'COVID-3351.png', 'COVID-978.png',
       'COVID-986.png', 'COVID-3111.png', 'COVID-603.png',
       'COVID-359.png', 'COVID-2141.png', 'COVID-330.png',
       'COVID-1380.png', 'COVID-1447.png', 'COVID-3129.png',
       'COVID-2817.png', 'COVID-2560.png'], dtype='<U18')

In [20]:
# get samples
sample_paths = {}
for c in f_names:
    sample_paths[c] = np.random.choice(f_names[c], size=5000, replace=True, p=None)
sample_paths

{'Covid_img': array(['COVID-2382.png', 'COVID-3052.png', 'COVID-2420.png', ...,
        'COVID-1967.png', 'COVID-3399.png', 'COVID-2705.png'], dtype='<U18'),
 'Viral_img': array(['Viral Pneumonia-763.png', 'Viral Pneumonia-380.png',
        'Viral Pneumonia-42.png', ..., 'Viral Pneumonia-417.png',
        'Viral Pneumonia-548.png', 'Viral Pneumonia-265.png'], dtype='<U24'),
 'Normal_img': array(['Normal-837.png', 'Normal-8169.png', 'Normal-5229.png', ...,
        'Normal-1890.png', 'Normal-5930.png', 'Normal-9437.png'],
       dtype='<U18')}

In [21]:
for c in sample_paths:
    print(len(sample_paths[c]))

5000
5000
5000


In [23]:
# if you need to, use this to make a new directory

os.mkdir('../sample_data')
for c in class_names:
    os.mkdir('../sample_data/' + c)

In [48]:
# copy sampled files over
new_path = '../sample_data/'

for c in sample_paths:
    print(data_path + c)
    for i, p in enumerate(sample_paths[c]):
        shutil.copyfile(data_path + c + '/' + p, new_path + c + '/' + str(i) + '_' + p)

../dl_data/Covid_img
../dl_data/Viral_img
../dl_data/Normal_img


In [49]:
# Check that the new data has arrived
new_class_dist = {} # get the originial distribution of each class
for c in class_names:
    new_class_dist[c] = len(os.listdir(new_path + c))
new_class_dist

{'Covid_img': 5000, 'Viral_img': 5000, 'Normal_img': 5000}

In [46]:
# # if you need to start over
# for c in class_names:
#     all_files = os.listdir(new_path + c)
#     for f in all_files:
#         os.remove(new_path + c+ '/' + f)

## Load the data

In [51]:
from sklearn.datasets import load_files 
from keras.utils import np_utils

from keras.preprocessing import image



# directories
data_dir = "../sample_data/"

#### save out augmented data for visualization

# ## first delete any existing files
# aug_dir = '../augmented_data'
# aug_files = os.listdir(aug_dir)
# for f in aug_files:
#     os.remove(aug_dir + '/' + f)

    
batch_size = 32;
# IMPORTANT: Depends on what pre-trained model you choose, you will need to change these dimensions accordingly
img_height = 224; 
img_width = 224;
    
    
# Test Dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "training",
    seed = 42,
    image_size= (img_height, img_width),
    batch_size = batch_size
)

# Test Dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "validation",
    seed = 42,
    image_size= (img_height, img_width),
    batch_size = batch_size
)

# # data augmentation (for training only)
# train_data_gen = ImageDataGenerator(rescale=1./255,
#                                     zoom_range= 0.3, 
#                                     horizontal_flip= True, 
#                                     shear_range= 0.2,
#                                     rotation_range = 30,
#                                     validation_split=0.2

#                                     )



# train_ds = train_data_gen.flow_from_directory(
#     directory = data_dir,
#     target_size=(img_height, img_width),
#     color_mode='rgb',
#     classes=None,
#     class_mode='categorical',
#     batch_size=batch_size,
#     shuffle=False,
#     seed=42,
# #     save_to_dir=aug_dir,
# #     save_prefix='aug',
# #     save_format='png',
#     follow_links=False,
#     subset='training',
#     interpolation='nearest'
# )

# validation_ds = train_data_gen.flow_from_directory(
#     directory=data_dir,  # same directory because we are splitting the data here
#     follow_links=False,
#     subset='validation',
#     interpolation='nearest',
#     target_size=(img_height, img_width), 
#     class_mode='categorical',
#     shuffle=False,
#     seed=42,
#     batch_size=batch_size
# )

# class_ind = (train_ds.class_indices)

# test_data_gen = ImageDataGenerator(rescale=1./255)


# # holdout data
# HOLD_ds = test_data_gen.flow_from_directory(directory=HOLD_dir, 
#                                          target_size=(img_height, img_width), 
#                                          class_mode='categorical',
#                                          shuffle=False,
#                                          seed=42,
#                                          batch_size=batch_size)

Found 15000 files belonging to 3 classes.
Using 12000 files for training.
Found 15000 files belonging to 3 classes.
Using 3000 files for validation.


In [None]:
# class_ind

# scikitlearn funciton for recall/precision etc. scikitlearn.metrics
#train on accuracy

In [None]:
## this is just a bug fix, hopefully I won't need to use it again.

# fi = os.listdir(aug_dir + '/' + os.listdir(aug_dir)[0])
# for f in fi:
#     os.remove(aug_dir + '/' + os.listdir(aug_dir)[0] + '/' + f)

# os.rmdir(aug_dir + '/' + os.listdir(aug_dir)[0])

In [53]:
# set checkpoint to resume training if it stops unexpectedly
checkpoint_path = "../checkpoints/training_ROUND3_part2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [58]:
# ds_size_1 = (224, 224)
# # train_ds_1 = train_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))
# # validation_ds_1 = validation_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))

# train_ds_1 = train_ds
# validation_ds_1 = validation_ds


# base_model_2 = tf.keras.models.load_model('./saved_models/model_ROUND3')
# n_classes = 3

# callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# # Rebuild top
# # x = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(base_model_2.output)
# # x = tf.keras.layers.BatchNormalization()(x)

# top_dropout_rate = 0.2
# x = tf.keras.layers.Dropout(top_dropout_rate, name="top_dropout")#(x)
# # x = tf.keras.layers.Flatten()(x)
# outputs = tf.keras.layers.Dense(3, activation="softmax", name="pred")(x) # match number of classes

# model_2 = keras.models.Model(inputs=base_model_2.input,
#                            outputs=outputs)

TypeError: Inputs to a layer should be tensors. Got: <keras.layers.core.dropout.Dropout object at 0x7f3852082a60>

In [62]:
# train up the top layer first

model_2 = tf.keras.models.load_model('./saved_models/model_ROUND3')

# for layer in model_2.layers:
#     layer.trainable = False



# recall = tf.keras.metrics.Recall()
optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=0.01)
model_2.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model_2.fit(train_ds_1,
                    validation_data=validation_ds_1,
#                     class_weight=class_weights,
                    epochs=20, callbacks=[callback,cp_callback])


Epoch 1/20
Epoch 1: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 2/20
Epoch 2: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 3/20
Epoch 3: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 4/20
Epoch 4: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 5/20
Epoch 5: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 6/20
Epoch 6: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 7/20
Epoch 7: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 8/20
Epoch 8: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 9/20
Epoch 9: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 10/20
Epoch 10: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 11/20
Epoch 11: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 12/20
Epoch 12: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt
Epoch 13/20
Epoch 13: 

NameError: name 'loaded_model' is not defined

In [64]:
history = model_2.fit(train_ds_1,
                    validation_data=validation_ds_1,
#                     class_weight=class_weights,
                    epochs=1, callbacks=[callback,cp_callback])

Epoch 1: saving model to ../checkpoints/training_ROUND3_part2/cp.ckpt


In [10]:
# opt = tf.keras.optimizers.Adam(0.1)
# net = Net()
# dataset = toy_dataset()
# iterator = iter(dataset)
# ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=net, iterator=iterator)
# manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)

# train_and_checkpoint(net, manager)

In [None]:
# # train all the layers together for a bit with a much lower learning rate

# for layer in base_model_2.layers[-20:]:
#     if not isinstance(layer, tf.keras.layers.BatchNormalization):
#         layer.trainable = True

# recall = tf.keras.metrics.Recall()
# optimizer = keras.optimizers.Adam(learning_rate=0.0004, decay=0.001)
# model_2.compile(loss="categorical_crossentropy", optimizer=optimizer,
#               metrics=["accuracy"])
# history = model_2.fit(train_ds_1,
#                     validation_data=validation_ds_1,
# #                     class_weight=class_weights,
#                     epochs=50, callbacks=[callback,cp_callback])

In [None]:
# # train all the layers together for a bit with a much lower learning rate

# for layer in base_model_2.layers[-20:]:
#     if not isinstance(layer, tf.keras.layers.BatchNormalization):
#         layer.trainable = True

# recall = tf.keras.metrics.Recall()
# optimizer = keras.optimizers.Adam(learning_rate=0.0004, decay=0.001)
# model_2.compile(loss="categorical_crossentropy", optimizer=optimizer,
#               metrics=["accuracy"])
# history = model_2.fit(train_ds_1,
#                     validation_data=validation_ds_1,
# #                     class_weight=class_weights,
#                     epochs=1, callbacks=[callback,cp_callback])

In [65]:
# save the model
model_2.save('saved_models/model_ROUND3_part2') # change this path to save a new version

INFO:tensorflow:Assets written to: saved_models/model_ROUND3_part2/assets


In [None]:
# # if you need to use the checkpoint, use this code
# # source: https://www.tensorflow.org/tutorials/keras/save_and_load#checkpoint_callback_options

# latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest

# # Create a new model instance
# model_2 = create_model()

# # Load the previously saved weights
# model_2.load_weights(latest)

# # Re-evaluate the model
# loss, acc = model_2.evaluate(validation_ds_1 verbose=2)
# print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

In [None]:
! ls 

To Do:

Oversampling/Data Augmentation:

1. start a new file with clear labels, resampling, augmented data
2. Train the model the same way
3. Save model and create confusion matrix in this file (or seperate file)

Prediction weights
1. When predicting classes, change wieghts until we get 100% for covid cases
2. Change to proportional CM instead of just numeric?

Recall and F-score as metric?