In [8]:
import sys
import sklearn
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import random as python_random
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

# Seed every random number generator this notebook imports (Python's `random`, NumPy
# and TensorFlow). The same seed must be reused on every run to keep the
# training/validation/holdout split and the training results consistent.
python_random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)



#### Questions:
# How does tf (under this method) store labels? Can I access them?
#    Look into how the resizing line is done
# What happens if I don't pass the y value into imageDataGenerator().flow()?
# Is this enough work? (oversampling, data augmentation, adjusting the prediction weights)
# multiple expert -  3 different model (majority voting at the end to make predictions) - cost trade off for computing
# - talk about costs (training time, different work, etc)
# - accuracy vs tradeoffs (latency of making one prediction)

## Class Analysis

In [9]:
!ls

README.md	  jordan_cnn-manual-split.ipynb  resnet-jordan.ipynb
checkpoints	  jordan_cnn.ipynb		 resnet.ipynb
dl_load.ipynb	  matrix1.png			 resnet_v2.ipynb
evaluation.ipynb  matrix2.png			 saved_models


In [10]:
# Count the images available for each class: every entry of `data_path` is treated
# as one class folder, and the number of entries inside it is that class's size.
data_path = '../dl_data/'
class_names = os.listdir(data_path)
class_dist = {name: len(os.listdir(data_path + name)) for name in class_names}

# Print the class name followed by its image count, one pair per class.
for name, count in class_dist.items():
    print(name)
    print(count)

Covid_img
3611
Viral_img
1345
Normal_img
10193


In [11]:
import math

def pull_val(dir_path, val_path, portion):
    """Move a random share of the files in `dir_path` into `val_path`.

    Parameters
    ----------
    dir_path : str
        Directory the files are taken from.
    val_path : str
        Directory the selected files are moved to. It is created if missing.
    portion : float
        Fraction (between 0 and 1) of the files in `dir_path` to move;
        the count is rounded down.

    Raises
    ------
    ValueError
        If `portion` is outside the range [0, 1].

    Notes
    -----
    The selection uses NumPy's global random state, so call `np.random.seed`
    beforehand for a repeatable split. The files are MOVED, not copied: calling
    this twice on the same directory removes a second share from it.
    """
    if not 0 <= portion <= 1:
        raise ValueError("portion must be between 0 and 1, got {!r}".format(portion))

    # os.listdir returns entries in arbitrary (filesystem-dependent) order; sort them
    # so that a fixed seed always selects the same files.
    files = sorted(os.listdir(dir_path))
    num_files = math.floor(len(files) * portion)
    val_files = np.random.choice(files, size=num_files, replace=False)

    # os.rename fails when the destination directory is missing, so create it first.
    os.makedirs(val_path, exist_ok=True)

    # move files
    for f in val_files:
        os.rename(os.path.join(dir_path, f), os.path.join(val_path, f))

In [12]:
# if you need to, use this to make a new directory

# os.mkdir('../HOLD_data')
# os.mkdir('../test_data')
# for c in class_names:
#     os.mkdir('../HOLD_data/' + c)
#     os.mkdir('../test_data/' + c)

In [13]:
# save out holdout data: move a fixed share of every class folder into ../HOLD_data

portion = .10 # portion of data set aside for HOLDOUT

for c in class_names:
    dir_path = '../dl_data/' + c
    val_path = '../HOLD_data/' + c
    # pull_val MOVES files, so running this cell a second time would shift another
    # 10% of the remaining training images into the holdout set. Skip any class
    # whose holdout folder is already populated to keep the cell safe to re-run.
    if os.path.isdir(val_path) and os.listdir(val_path):
        print('Holdout already populated for ' + c + ', skipping')
        continue
    pull_val(dir_path, val_path, portion)
    
##### Commented out because we will do validation split with the image gen function    
# portion = .20 # portion of data set aside for TESTING
    
# for c in class_names:
#     dir_path = '../dl_data/' + c
#     val_path = '../test_data/' + c
#     pull_val(dir_path, val_path, portion)

In [14]:
# to reset holdout

# for c in class_names:
#     val_files = os.listdir('../HOLD_data/' + c)
#     for i in val_files:
#         os.rename('../HOLD_data/' + c + '/' + i, '../dl_data/' + c + '/' + i)
        
# for c in class_names:
#     val_files = os.listdir('../test_data/' + c)
#     for i in val_files:
#         os.rename('../test_data/' + c + '/' + i, '../dl_data/' + c + '/' + i)

In [15]:
# Report how many images each class has in every split directory, followed by the
# per-class total across all splits.
split_dirs = [
    ('Train', '../dl_data/'),
    ('Test', '../test_data/'),
    ('Holdout', '../HOLD_data/'),
]


def _count_files(folder):
    """Return the number of entries in `folder`, or 0 if the folder does not exist.

    The test split is optional (the cell that creates its folders is commented out),
    so a missing folder is reported as empty instead of raising FileNotFoundError.
    """
    return len(os.listdir(folder)) if os.path.isdir(folder) else 0


for split_name, split_dir in split_dirs:
    print('---' + split_name + ' set-------------------')
    for c in class_names:
        print(c)
        print(_count_files(split_dir + c))

print('---Total-------------------')
for c in class_names:
    print(c)
    print(sum(_count_files(split_dir + c) for _, split_dir in split_dirs))

---Train set-------------------
Covid_img
3250
Viral_img
1211
Normal_img
9174
---Teset set-------------------
Covid_img
0
Viral_img
0
Normal_img
0
---Holdout set-------------------
Covid_img
361
Viral_img
134
Normal_img
1019
---Total-------------------
Covid_img
3611
Viral_img
1345
Normal_img
10193


## Load Data

In [16]:
# Standalone re-computation of the class list and per-class image counts, for
# sessions where the class-analysis cells above were not executed.

data_path = "../dl_data/"
class_names = os.listdir(data_path)
class_dist = dict(
    (class_name, len(os.listdir(data_path + class_name)))
    for class_name in class_names
)

In [17]:
class_names

['Covid_img', 'Viral_img', 'Normal_img']

In [23]:
from sklearn.datasets import load_files 
from keras.utils import np_utils

from keras.preprocessing import image

#### calculate class weights

# Balanced weighting: weight_c = total / (n_classes * count_c). Rare classes get
# larger weights, and the sum of the weights over all examples stays equal to `total`,
# which keeps the loss at a similar magnitude. (The earlier `total / 2.0` scaling is
# the two-class form of this formula and does not hold for three classes.)
total = sum(class_dist.values())

# The keys must be the label indices Keras uses. flow_from_directory numbers the
# classes by the alphanumeric order of the sub-directory names, which is NOT the
# order os.listdir returned them in, so the names are sorted before being indexed.
class_weights = {
    index: (1 / class_dist[name]) * (total / len(class_dist))
    for index, name in enumerate(sorted(class_dist))
}

# Individual names kept for backward compatibility with the original cell.
weight_for_0 = class_weights[0]
weight_for_1 = class_weights[1]
weight_for_2 = class_weights[2]

# Image folders read by the data generators.
HOLD_dir = "../HOLD_data"
data_dir = "../dl_data"
# test_dir = "../test_data"


#### save out augmented data for visualization

# ## first delete any existing files
# aug_dir = '../augmented_data'
# aug_files = os.listdir(aug_dir)
# for f in aug_files:
#     os.remove(aug_dir + '/' + f)


# IMPORTANT: the image dimensions depend on the pre-trained model you choose;
# change them accordingly when switching models.
img_height = img_width = 224
batch_size = 32
    

# Augmenting generator for the training images: pixel values are multiplied by 1/255
# and each image is randomly rotated (up to 30 degrees), sheared, zoomed and
# horizontally flipped. validation_split reserves 20% of the files so the directory
# can be read as separate 'training' and 'validation' subsets.
# Every ImageDataGenerator option not listed here is not passed and keeps its default.
train_data_gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=30,
    shear_range=0.2,
    zoom_range=0.3,
    horizontal_flip=True,
)


# Batches of augmented training images with one-hot ('categorical') labels, read from
# the 'training' subset of data_dir and resized to (img_height, img_width).
train_ds = train_data_gen.flow_from_directory(
    directory=data_dir,
    subset='training',
    classes=None,
    class_mode='categorical',
    color_mode='rgb',
    target_size=(img_height, img_width),
    interpolation='nearest',
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    follow_links=False,
    # Uncomment to write the augmented images to disk for visual inspection:
#     save_to_dir=aug_dir,
#     save_prefix='aug',
#     save_format='png',
)

# Validation images must not be augmented, otherwise validation metrics are measured
# on randomly distorted inputs. Use a generator that only rescales; its
# validation_split has to match the training generator (0.2) so that the 'validation'
# subset is exactly the share of files the 'training' subset leaves out.
val_data_gen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

validation_ds = val_data_gen.flow_from_directory(
    directory=data_dir,  # same directory because we are splitting the data here
    follow_links=False,
    subset='validation',
    interpolation='nearest',
    target_size=(img_height, img_width), 
    class_mode='categorical',
    shuffle=True,
    seed=42,
    batch_size=batch_size
)


# Evaluation-time generator: rescaling only, no augmentation.
test_data_gen = ImageDataGenerator(rescale=1./255)


# holdout data
# shuffle=False keeps the batches in file order, so the rows returned by
# model.predict(HOLD_ds) line up with HOLD_ds.classes / HOLD_ds.filenames when
# building a confusion matrix. model.evaluate results do not depend on the order.
HOLD_ds = test_data_gen.flow_from_directory(directory=HOLD_dir, 
                                         target_size=(img_height, img_width), 
                                         class_mode='categorical',
                                         shuffle=False,
                                         batch_size=batch_size)

Found 10912 images belonging to 3 classes.
Found 2726 images belonging to 3 classes.
Found 1514 images belonging to 3 classes.


In [24]:
## this is just a bug fix, hopefully I won't need to use it again.

# fi = os.listdir(aug_dir + '/' + os.listdir(aug_dir)[0])
# for f in fi:
#     os.remove(aug_dir + '/' + os.listdir(aug_dir)[0] + '/' + f)

# os.rmdir(aug_dir + '/' + os.listdir(aug_dir)[0])

In [26]:
# Checkpointing, so training can be resumed if it stops unexpectedly.
checkpoint_path = "../checkpoints/training_2/cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

# Callback that writes only the model's weights (not the full model) to
# checkpoint_path and logs each save.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [27]:
ds_size_1 = (224, 224)
# train_ds_1 = train_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))
# validation_ds_1 = validation_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))

# The directory iterators already resize images via target_size, so they are used as-is.
train_ds_1 = train_ds
validation_ds_1 = validation_ds


# ImageNet-pretrained ResNet50 without its classification head; a new head is added below.
# NOTE(review): the data generators rescale pixels by 1/255, whereas Keras documents
# keras.applications.resnet50.preprocess_input as the preprocessing for these weights.
# Confirm which input scaling is intended.
base_model_2 = keras.applications.ResNet50(weights='imagenet', include_top=False)
n_classes = 3

# Stop when the validation loss has not improved for 5 epochs. Validation data is
# passed to every fit() call, and monitoring the training loss would not detect
# overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Rebuild top
x = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(base_model_2.output)
x = tf.keras.layers.BatchNormalization()(x)

top_dropout_rate = 0.2
x = tf.keras.layers.Dropout(top_dropout_rate, name="top_dropout")(x)
# x = tf.keras.layers.Flatten()(x)
# One softmax output per class; driven by n_classes rather than a hard-coded 3.
outputs = tf.keras.layers.Dense(n_classes, activation="softmax", name="pred")(x)

model_2 = keras.models.Model(inputs=base_model_2.input,
                           outputs=outputs)

In [29]:
# Phase 1: train only the new classification head while the ResNet50 base is frozen.

for layer in base_model_2.layers:
    layer.trainable = False

# Keras expects metric instances (or metric names) in `metrics`. Passing the Recall
# class itself does not create a Recall metric, so instantiate it. The explicit name
# keeps the history keys stable ('recall' / 'val_recall').
recall = tf.keras.metrics.Recall(name="recall")
optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=0.01)
model_2.compile(loss="categorical_crossentropy", optimizer=optimizer,
              metrics=[recall])
history = model_2.fit(train_ds_1,
                    validation_data=validation_ds_1,
#                     class_weight=class_weights,
                    epochs=20, callbacks=[callback,cp_callback])

Epoch 1/20
Epoch 1: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 2/20
Epoch 2: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 3/20
Epoch 3: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 4/20
Epoch 4: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 5/20
Epoch 5: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 6/20
Epoch 6: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 7/20
Epoch 7: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 8/20
Epoch 8: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 9/20
Epoch 9: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 10/20
Epoch 10: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 11/20
Epoch 11: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 12/20
Epoch 12: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 13/20
Epoch 13: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 14/20
Epoch 14: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 15/2

In [30]:
# opt = tf.keras.optimizers.Adam(0.1)
# net = Net()
# dataset = toy_dataset()
# iterator = iter(dataset)
# ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=net, iterator=iterator)
# manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)

# train_and_checkpoint(net, manager)

In [None]:
# Phase 2: fine-tune with a much lower learning rate. Only the last 20 layers of the
# ResNet50 base are unfrozen (not the whole network), and BatchNormalization layers
# among them stay frozen.

for layer in base_model_2.layers[-20:]:
    if not isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = True

# A fresh metric instance for the re-compiled model. Keras expects an instance (or a
# name) in `metrics`, not the Recall class itself. The model is re-compiled here so
# the changed `trainable` flags take effect.
recall = tf.keras.metrics.Recall(name="recall")
optimizer = keras.optimizers.Adam(learning_rate=0.0004, decay=0.001)
model_2.compile(loss="categorical_crossentropy", optimizer=optimizer,
              metrics=[recall])
history = model_2.fit(train_ds_1,
                    validation_data=validation_ds_1,
#                     class_weight=class_weights,
                    epochs=50, callbacks=[callback,cp_callback])

Epoch 1/50
Epoch 1: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 2/50
Epoch 2: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 3/50
Epoch 3: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 4/50
Epoch 4: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 5/50
Epoch 5: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 6/50
Epoch 6: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 7/50
Epoch 7: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 8/50
Epoch 8: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 9/50
Epoch 9: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 10/50
Epoch 10: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 11/50
Epoch 11: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 12/50
Epoch 12: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 13/50
Epoch 13: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 14/50
Epoch 14: saving model to ../checkpoints/training_2/cp.ckpt
Epoch 15/5

In [1]:
# Persist the trained model to disk. `model_2` only exists after the model-building and
# training cells have run in the current kernel session.
# Change the path below to save a new version instead of overwriting this one.
model_2.save(filepath='saved_models/model_RECALL')

NameError: name 'model_2' is not defined

In [None]:
# # if you need to use the checkpoint, use this code
# # source: https://www.tensorflow.org/tutorials/keras/save_and_load#checkpoint_callback_options

# latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest

# # Create a new model instance
# model_2 = create_model()

# # Load the previously saved weights
# model_2.load_weights(latest)

# # Re-evaluate the model
# # loss, acc = model_2.evaluate(validation_ds_1, verbose=2)
# print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

In [None]:
! ls 

To Do:

Oversampling/Data Augmentation:

1. start a new file with clear labels, resampling, augmented data
2. Train the model the same way
3. Save model and create confusion matrix in this file (or separate file)

Prediction weights
1. When predicting classes, change weights until we get 100% for covid cases
2. Change to proportional CM instead of just numeric?

Recall and F-score as metric?