In [41]:
import sys
import sklearn
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import random as python_random
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

# Seed every random number generator the pipeline can touch (Python's `random`,
# NumPy and TensorFlow). The same seed must be used on every run to ensure
# consistency of the training/validation/testing splits.
python_random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)



#### Questions:
# How does tf (under this method) store labels? Can I access them?
#    Look into how the resizing line is done
# What happens if I don't pass the y value into imageDataGenerator().flow()?
# is this enough work? (oversampling, data augmentation, adjusting the prediction weights)
# multiple experts - 3 different models (majority voting at the end to make predictions) - cost trade-off for computing
# - talk about costs (training time, different work, etc)
# - accuracy vs tradeoffs (latency of making one prediction)

## Class Analysis

In [2]:
# List the working directory to confirm the dataset folders and notebooks are present.
!ls

 covid-xray-classification   dl_data_2		     resnet_v2.ipynb
 dl_data		     dl_load.ipynb	     saved_models
 dl_data.zip		    'jordan_cnn (2).ipynb'


In [7]:
# Print the working directory; the relative dataset paths used below are resolved against it.
!pwd

/sfs/qumulo/qhome/hso6b/DL


In [11]:
# Source folders for the three image classes, relative to the working directory.
covidDataPath, pneumoniaDataPath, normalDataPath = (
    'dl_data_2/dl_data/Covid_img',
    'dl_data_2/dl_data/Viral_img',
    'dl_data_2/dl_data/Normal_img',
)

In [12]:
def _list_files(root_dir):
    """Return the path of every file found under ``root_dir``, walking it recursively."""
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(root_dir)
        for filename in filenames
    ]


# One list of image file paths per class folder.
listCovidPaths = _list_files(covidDataPath)
listViralPaths = _list_files(pneumoniaDataPath)
listNormalPaths = _list_files(normalDataPath)

In [42]:
# Root folder that holds one sub-folder per class.
data_path = 'dl_data_2/dl_data/'
# os.listdir returns every entry in arbitrary order, hidden ones included
# (the output below shows '.ipynb_checkpoints'), so this raw listing still
# needs to be filtered before it can be used as the list of classes.
class_names = os.listdir(data_path)

In [51]:
# Inspect the raw directory listing.
class_names

['Covid_img', '.ipynb_checkpoints', 'Viral_img', 'Normal_img']

In [52]:
# Check which entry sits at index 1 of the raw listing.
class_names[1]

'.ipynb_checkpoints'

In [34]:
# Keep only the real class folders. Filter by name rather than slicing by
# position: os.listdir order is arbitrary, and with the listing shown above
# `class_names[-3:]` would keep '.ipynb_checkpoints' and drop 'Covid_img'.
# The filter is also idempotent, so re-running this cell is harmless.
class_names = [name for name in class_names if not name.startswith('.')]
class_names

['Covid_img', 'Viral_img', 'Normal_img']

In [35]:
# Count the entries in every class folder to see how imbalanced the classes are.
class_dist = {label: len(os.listdir(data_path + label)) for label in class_names}
for label, n_entries in class_dist.items():
    print(label)
    print(n_entries)

Covid_img
3610
Viral_img
1345
Normal_img
10192


In [31]:
import math

def pull_val(dir_path, val_path, portion):
    """Move a random fraction of the files in ``dir_path`` into ``val_path``.

    Only regular, non-hidden files are candidates, so sub-directories and
    entries such as ``.ipynb_checkpoints`` are neither counted nor moved.
    Files are physically moved (not copied): calling this twice on the same
    folder holds out a further ``portion`` of what is left.

    Args:
        dir_path: folder to take files from.
        val_path: folder to move the selected files into; created if missing.
        portion: fraction of the candidate files to move, between 0 and 1.

    Returns:
        The names of the files that were moved (empty list if none).

    Raises:
        ValueError: if ``portion`` is outside ``[0, 1]``.
    """
    if not 0 <= portion <= 1:
        raise ValueError('portion must be between 0 and 1, got %r' % (portion,))

    # Sort the candidates: os.listdir order is arbitrary, so sampling from the
    # raw listing would not be reproducible even with a fixed NumPy seed.
    files = sorted(
        f for f in os.listdir(dir_path)
        if not f.startswith('.') and os.path.isfile(os.path.join(dir_path, f))
    )
    num_files = math.floor(len(files) * portion)
    if num_files == 0:
        return []

    os.makedirs(val_path, exist_ok=True)
    val_files = np.random.choice(files, size=num_files, replace=False)

    # move files
    moved = []
    for f in val_files:
        os.rename(os.path.join(dir_path, f), os.path.join(val_path, f))
        moved.append(str(f))
    return moved

In [36]:
# Create the holdout and test folder trees, with one sub-folder per class.
# os.makedirs(..., exist_ok=True) keeps this cell safe to re-run; os.mkdir
# raises FileExistsError as soon as one of the folders already exists.
for split_root in ('dl_data_2/HOLD_data', 'dl_data_2/test_data'):
    os.makedirs(split_root, exist_ok=True)
    for c in class_names:
        os.makedirs(os.path.join(split_root, c), exist_ok=True)

In [37]:
# save out holdout and testing data

portion = .10 # portion of data set aside for HOLDOUT

for c in class_names:
    dir_path = 'dl_data_2/dl_data/' + c
    val_path = 'dl_data_2/HOLD_data/' + c
    # pull_val physically moves files, so running this cell a second time would
    # strip another 10% out of the training data. Skip any class whose holdout
    # folder already contains files.
    if os.path.isdir(val_path) and os.listdir(val_path):
        print('Holdout for ' + c + ' already populated - skipping')
        continue
    pull_val(dir_path, val_path, portion)
    
##### Commented out because we will do validation split with the image gen function    
# portion = .20 # portion of data set aside for TESTING
    
# for c in class_names:
#     dir_path = '../dl_data/' + c
#     val_path = '../test_data/' + c
#     pull_val(dir_path, val_path, portion)

## Load Data

In [53]:
from sklearn.datasets import load_files 
from keras.utils import np_utils

from keras.preprocessing import image

#### calculate class weights

# Balanced weighting: weight_c = total / (num_classes * count_c). Rarer classes
# get larger weights, and the summed weight over all examples stays equal to
# `total`, which keeps the loss at a similar magnitude.
# The dictionary keys must be the integer labels Keras assigns to the classes.
# flow_from_directory numbers the class folders in alphanumeric order, so the
# names are sorted before being enumerated rather than taken in listing order.
total = sum(class_dist.values())
num_classes = len(class_dist)
class_weights = {
    index: total / (num_classes * class_dist[name])
    for index, name in enumerate(sorted(class_dist))
}

# Dataset locations: the training/validation images and the held-out set.
data_dir, HOLD_dir = "dl_data_2/dl_data", "dl_data_2/HOLD_data"
# test_dir = "../test_data"


#### save out augmented data for visualization

# ## first delete any existing files
# aug_dir = '../augmented_data'
# aug_files = os.listdir(aug_dir)
# for f in aug_files:
#     os.remove(aug_dir + '/' + f)

    
# Mini-batch size shared by every generator below.
batch_size = 32
# IMPORTANT: these dimensions depend on the pre-trained model you choose,
# so change them accordingly when switching architectures.
img_height, img_width = 224, 224
    

# Generator used for the training pipeline: multiplies pixel values by 1/255,
# applies random rotation/shear/zoom/horizontal-flip augmentation, and sets
# aside a 0.2 fraction of the data for the 'validation' subset.
train_data_gen = ImageDataGenerator(
    rescale=1. / 255,
    validation_split=0.2,
    rotation_range=30,
    shear_range=0.2,
    zoom_range=0.3,
    horizontal_flip=True,
    # Every other ImageDataGenerator option is left at its library default.
)


# Training subset: batches streamed from the class folders under data_dir.
train_ds = train_data_gen.flow_from_directory(
    directory=data_dir,
    subset='training',
    target_size=(img_height, img_width),
    interpolation='nearest',
    color_mode='rgb',
    classes=None,
    class_mode='categorical',
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    follow_links=False,
    # Uncomment to write the augmented images to disk for inspection:
    # save_to_dir=aug_dir,
    # save_prefix='aug',
    # save_format='png',
)

# Validation images must not be augmented, otherwise the validation metrics are
# measured on randomly rotated/zoomed/flipped images. Use a rescale-only
# generator that declares the same validation_split as train_data_gen: Keras
# takes the split by file position inside each class folder, so the
# 'training' and 'validation' subsets of the two generators stay disjoint.
# Keep this fraction in sync with train_data_gen.
val_data_gen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

validation_ds = val_data_gen.flow_from_directory(
    directory=data_dir,  # same directory because we are splitting the data here
    follow_links=False,
    subset='validation',
    interpolation='nearest',
    target_size=(img_height, img_width),
    class_mode='categorical',
    shuffle=True,
    seed=42,
    batch_size=batch_size
)


# Holdout pipeline: rescaling only, no augmentation.
test_data_gen = ImageDataGenerator(rescale=1. / 255)

# holdout data
# NOTE(review): shuffle=True is kept from the original; if predictions are later
# compared against HOLD_ds.classes, confirm that the ordering still lines up.
HOLD_ds = test_data_gen.flow_from_directory(
    directory=HOLD_dir,
    target_size=(img_height, img_width),
    class_mode='categorical',
    batch_size=batch_size,
    shuffle=True,
    seed=42,
)

Found 10908 images belonging to 3 classes.
Found 2725 images belonging to 3 classes.
Found 1514 images belonging to 3 classes.


In [6]:
# Checkpointing, so training can be resumed if it stops unexpectedly.
checkpoint_path = "../checkpoints/training_2/cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

# Callback that saves only the model's weights to checkpoint_path.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

## ResNet

In [55]:
ds_size_1 = (224, 224)
# train_ds_1 = train_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))
# validation_ds_1 = validation_ds.map(lambda image, label: (tf.image.resize(image, ds_size_1), label))

# The generators already resize to the target size, so they are used as-is.
train_ds_1 = train_ds
validation_ds_1 = validation_ds


# ImageNet-pretrained ResNet50 without its classification head.
base_model_2 = keras.applications.ResNet50(weights='imagenet', include_top=False)
n_classes = 3

# Stop on the validation loss, not the training loss: validation data is
# passed to fit(), and the training loss keeps falling while a model overfits.
# restore_best_weights rolls back to the best epoch seen once patience runs out.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                            restore_best_weights=True)

# Rebuild top
x = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(base_model_2.output)
x = tf.keras.layers.BatchNormalization()(x)

top_dropout_rate = 0.2
x = tf.keras.layers.Dropout(top_dropout_rate, name="top_dropout")(x)
# x = tf.keras.layers.Flatten()(x)
# One softmax unit per class; driven by n_classes instead of a repeated literal.
outputs = tf.keras.layers.Dense(n_classes, activation="softmax", name="pred")(x)

model_2 = keras.models.Model(inputs=base_model_2.input,
                           outputs=outputs)

2022-04-22 13:18:33.884781: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-22 13:18:36.317212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10788 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7


In [56]:
# Stage 1: train only the new top layers; every layer of the ResNet base is frozen.
for backbone_layer in base_model_2.layers:
    backbone_layer.trainable = False

optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=0.01)
model_2.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model_2.fit(
    train_ds_1,
    epochs=20,
    validation_data=validation_ds_1,
    class_weight=class_weights,
    callbacks=[callback, cp_callback],
)

NameError: name 'cp_callback' is not defined

## VGGNet

In [58]:
# ImageNet-pretrained VGG19 without its classification head.
n_classes = 3
base_model_1 = keras.applications.VGG19(weights='imagenet', include_top=False)

# New head: global average pooling -> batch norm -> dropout -> softmax.
top_dropout_rate = 0.2
pooled = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(base_model_1.output)
normalized = tf.keras.layers.BatchNormalization()(pooled)
x = tf.keras.layers.Dropout(top_dropout_rate, name="top_dropout")(normalized)
# One softmax unit per class.
outputs = tf.keras.layers.Dense(n_classes, activation="softmax", name="pred")(x)

model_1 = keras.models.Model(
    inputs=base_model_1.input,
    outputs=outputs,
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


In [59]:
# Stage 1: train only the new top layers; every layer of the VGG19 base is frozen.
for backbone_layer in base_model_1.layers:
    backbone_layer.trainable = False

optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=0.01)
model_1.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model_1.fit(
    train_ds_1,
    epochs=20,
    validation_data=validation_ds_1,
    class_weight=class_weights,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [62]:
# Stage 2: fine-tune with a much lower learning rate. Only the last 20 layers of
# the VGG19 base are unfrozen, and any BatchNormalization layer among them is
# left frozen.
for candidate in base_model_1.layers[-20:]:
    if isinstance(candidate, tf.keras.layers.BatchNormalization):
        continue
    candidate.trainable = True

optimizer = keras.optimizers.Adam(learning_rate=0.0005, decay=0.001)
# Recompile so the changed trainable flags take effect.
model_1.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model_1.fit(
    train_ds_1,
    epochs=20,
    validation_data=validation_ds_1,
    class_weight=class_weights,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [63]:
# save the model
# Persists the fine-tuned VGG model under saved_models/ (the log below shows
# assets being written to saved_models/model_VGG/assets).
model_1.save('saved_models/model_VGG') # change this path to save a new version

2022-04-22 17:36:09.807271: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: saved_models/model_VGG/assets
