In [1]:
import sys
import sklearn
import os
import shutil
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import random as python_random
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

np.random.seed(42) # note that you must use the same seed to ensure consistentcy in your training/validation/testing
tf.random.set_seed(42)

# RESNET Round 3, Part 2
This model has already been trained for x-ray data, now we retrain on data that has been upsapmpled (for small classes) and downsampled (for large classes) to balance classes.

I've decided to get 5,000 records of each

## Upsample the data

In [38]:
data_path = "../dl_data/"
class_names = os.listdir(data_path)
class_dist = {} # get the originial distribution of each class
f_names = {} # get list of file paths per class
for c in class_names:
    class_dist[c] = len(os.listdir(data_path + c))
    f_names[c] = os.listdir(data_path + c)
class_dist

{'Covid_img': 3249, 'Viral_img': 1211, 'Normal_img': 9174}

In [39]:
f_names['Covid_img'][:10]

['COVID-2402.png',
 'COVID-1270.png',
 'COVID-3070.png',
 'COVID-2019.png',
 'COVID-2463.png',
 'COVID-396.png',
 'COVID-3605.png',
 'COVID-256.png',
 'COVID-1215.png',
 'COVID-1649.png']

In [40]:
np.random.choice(f_names['Covid_img'], size=20, replace=True, p=None)

array(['COVID-1113.png', 'COVID-2058.png', 'COVID-3289.png',
       'COVID-564.png', 'COVID-1134.png', 'COVID-3547.png',
       'COVID-3072.png', 'COVID-348.png', 'COVID-47.png',
       'COVID-1730.png', 'COVID-538.png', 'COVID-2400.png',
       'COVID-1660.png', 'COVID-1077.png', 'COVID-282.png',
       'COVID-2015.png', 'COVID-1445.png', 'COVID-2920.png',
       'COVID-189.png', 'COVID-308.png'], dtype='<U14')

In [41]:
import math
import random
# split train and test data

test_names = {}
train_names = {}
for c in class_names:
    length = len(f_names[c])
    samp_len = math.floor(length*.20)
    test = random.sample(f_names[c],samp_len)
    
    test_names[c] = test
    train_names[c] = [x for x in f_names[c] if x not in test]

In [42]:
# get samples
sample_paths_test = {}
for c in test_names:
    sample_paths_test[c] = np.random.choice(test_names[c], size=1000, replace=True, p=None)
sample_paths_test

sample_paths_train = {}
for c in train_names:
    sample_paths_train[c] = np.random.choice(train_names[c], size=5000, replace=True, p=None)
sample_paths_train

{'Covid_img': array(['COVID-3271.png', 'COVID-290.png', 'COVID-423.png', ...,
        'COVID-1258.png', 'COVID-837.png', 'COVID-2096.png'], dtype='<U14'),
 'Viral_img': array(['Viral Pneumonia-86.png', 'Viral Pneumonia-688.png',
        'Viral Pneumonia-584.png', ..., 'Viral Pneumonia-1200.png',
        'Viral Pneumonia-862.png', 'Viral Pneumonia-1210.png'],
       dtype='<U24'),
 'Normal_img': array(['Normal-8991.png', 'Normal-3846.png', 'Normal-3714.png', ...,
        'Normal-4941.png', 'Normal-1147.png', 'Normal-5964.png'],
       dtype='<U18')}

In [43]:
for c in sample_paths_test:
    print(len(sample_paths_test[c]))

1000
1000
1000


In [44]:
for c in sample_paths_train:
    print(len(sample_paths_train[c]))

5000
5000
5000


In [45]:
# if you need to, use this to make a new directory

# os.mkdir('../sample_data_train')
# for c in class_names:
#     os.mkdir('../sample_data_train/' + c)
# os.mkdir('../sample_data_test')
# for c in class_names:
#     os.mkdir('../sample_data_test/' + c)

In [46]:
# copy sampled files over
new_path_train = '../sample_data_train/'

for c in sample_paths_train:
    print(data_path + c)
    for i, p in enumerate(sample_paths_train[c]):
        shutil.copyfile(data_path + c + '/' + p, new_path_train + c + '/' + str(i) + '_' + p)

new_path_test = '../sample_data_test/'

for c in sample_paths_test:
    print(data_path + c)
    for i, p in enumerate(sample_paths_test[c]):
        shutil.copyfile(data_path + c + '/' + p, new_path_test + c + '/' + str(i) + '_' + p)

../dl_data/Covid_img
../dl_data/Viral_img
../dl_data/Normal_img
../dl_data/Covid_img
../dl_data/Viral_img
../dl_data/Normal_img


In [47]:
# Check that the new data has arrived
new_class_dist_train = {} # get the originial distribution of each class
for c in class_names:
    new_class_dist_train[c] = len(os.listdir(new_path_train + c))
print(new_class_dist_train)

new_class_dist_test = {} # get the originial distribution of each class
for c in class_names:
    new_class_dist_test[c] = len(os.listdir(new_path_test + c))
print(new_class_dist_test)

{'Covid_img': 5000, 'Viral_img': 5000, 'Normal_img': 5000}
{'Covid_img': 1000, 'Viral_img': 1000, 'Normal_img': 1000}


In [34]:
# # if you need to start over
# for c in class_names:
#     all_files = os.listdir(new_path_train + c)
#     for f in all_files:
#         os.remove(new_path_train + c+ '/' + f)
        
# # if you need to start over
# for c in class_names:
#     all_files = os.listdir(new_path_test + c)
#     for f in all_files:
#         os.remove(new_path_test + c+ '/' + f)

In [30]:
# if for some reason there are checkpoint files in there

# for i in os.listdir('../dl_data/Covid_img/.ipynb_checkpoints'):
#     os.remove('../dl_data/Covid_img/.ipynb_checkpoints/'+i)

# os.listdir('../dl_data/Covid_img/.ipynb_checkpoints')

[]

In [32]:
# os.rmdir('../dl_data/Covid_img/.ipynb_checkpoints')

## Load the data

In [48]:
from sklearn.datasets import load_files 
from keras.utils import np_utils

from keras.preprocessing import image



# directories
data_dir_test = "../sample_data_test/"
data_dir_train = "../sample_data_train/"

#### save out augmented data for visualization

# ## first delete any existing files
# aug_dir = '../augmented_data'
# aug_files = os.listdir(aug_dir)
# for f in aug_files:
#     os.remove(aug_dir + '/' + f)

    
batch_size = 32;
# IMPORTANT: Depends on what pre-trained model you choose, you will need to change these dimensions accordingly
img_height = 224; 
img_width = 224;
    
    
# Train Dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir_train,
    seed = 42,
    image_size= (img_height, img_width),
    batch_size = batch_size
)

# Test Dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir_test,
    seed = 42,
    image_size= (img_height, img_width),
    batch_size = batch_size
)


Found 15000 files belonging to 3 classes.


2022-05-02 19:25:19.280050: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 19:25:21.083216: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38397 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0


Found 3000 files belonging to 3 classes.


In [51]:
# set checkpoint to resume training if it stops unexpectedly
checkpoint_path = "../checkpoints/training_ROUND3_part2_fixed/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [53]:
# train up the top layer first

model_2 = tf.keras.models.load_model('./saved_models/model_ROUND3')
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# recall = tf.keras.metrics.Recall()
optimizer = keras.optimizers.Adam(learning_rate=0.01, decay=0.01)
model_2.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model_2.fit(train_ds,
                    validation_data=validation_ds,
#                     class_weight=class_weights,
                    epochs=20, callbacks=[callback,cp_callback])


Epoch 1/20


2022-05-02 19:28:23.844293: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-02 19:28:29.086838: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 1: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 2/20
Epoch 2: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 3/20
Epoch 3: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 4/20
Epoch 4: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 5/20
Epoch 5: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 6/20
Epoch 6: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 7/20
Epoch 7: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 8/20
Epoch 8: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 9/20
Epoch 9: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 10/20
Epoch 10: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 11/20
Epoch 11: saving model to ../checkpoints/training_ROUND3_part2_fixed/cp.ckpt
Epoch 12/20
Epoch 12: saving model to ../checkpoin

In [54]:
# history = model_2.fit(train_ds_1,
#                     validation_data=validation_ds_1,
# #                     class_weight=class_weights,
#                     epochs=1, callbacks=[callback,cp_callback])

In [55]:
# save the model
model_2.save('saved_models/model_ROUND3_part2') # change this path to save a new version

2022-05-02 19:39:46.177051: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: saved_models/model_ROUND3_part2/assets
