# Training notebook

* This notebook is used for building, training and evaluating CNN models on the MURA dataset.  
* For the purpose of experimentation, the CNN model, the dataset type (original / preprocessed offline), and the online augmentation and preprocessing can be easily changed through the custom loading functions.
* More information on the functions used can be found mainly in the Python files ***utils.py*** and ***image_preprocessing.py***.
* Some cells are specific to the Google Colab environment and might not work correctly in other environments.

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import pandas as pd
import pickle
import numpy as np
import cv2
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt
%matplotlib inline

# Google Colab does not have tensorflow_addons installed by default
!pip install tensorflow-addons
from tensorflow_addons.metrics import CohenKappa

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 28.1 MB/s eta 0:00:01[K     |▋                               | 20 kB 19.8 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.6 MB/s eta 0:00:01[K     |█▏                              | 40 kB 7.8 MB/s eta 0:00:01[K     |█▌                              | 51 kB 6.5 MB/s eta 0:00:01[K     |█▊                              | 61 kB 7.7 MB/s eta 0:00:01[K     |██                              | 71 kB 8.4 MB/s eta 0:00:01[K     |██▍                             | 81 kB 7.0 MB/s eta 0:00:01[K     |██▋                             | 92 kB 7.8 MB/s eta 0:00:01[K     |███                             | 102 kB 8.6 MB/s eta 0:00:01[K     |███▏                            | 112 kB 8.6 MB/s eta 0:00:01[K     |███▌                            | 122 kB 8.6 MB/s eta 0:00:01[K     |███

# FILEPATHS


In [2]:
# Root directory in the (personal) Google Drive with all source files and zipped datasets.
# It ends with '/', so everything derived from it is appended WITHOUT a leading slash.
DRIVE_DIR = '/content/drive/MyDrive/MURA/'
MURA_DIR = '/content/original/' # Directory with original MURA dataset
CLAHE_2_DIR = '/content/clahe_2/' # Directory for CLAHE preprocessed dataset with clipLimit=2
CLAHE_10_DIR = '/content/clahe_10/' # Directory for CLAHE preprocessed dataset with clipLimit=10
CROPPED_DIR = '/content/cropped/' # Directory for custom cropping preprocessed dataset
DATAFRAME_PATH = DRIVE_DIR + 'tvt_detailed_paths.csv' # Path to csv file with dataset information (train-valid-test split)
CHECKPOINT_DIR = DRIVE_DIR + 'models/' # Root directory for storing models, derived from DRIVE_DIR so both move together

# Google Colab specific section

## Mount Google Drive

In [3]:
# Google Colab only: mount Google Drive at /content/drive so that the paths
# built from DRIVE_DIR become accessible
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Google colab functions

The next cell contains function definitions that are needed or useful when running the notebook in Google Colab, such as copying a dataset from Google Drive, checking GPU utilization, and copying other files from Google Drive.

In [4]:
# This function is from Google Colab guide notebooks: https://colab.research.google.com/notebooks/pro.ipynb
def check_gpu():
  """
  Prints information about available GPU or message saying there isn't one
  """
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)


def init_file_struct():
  """
  Prepares the file structure in Google Colab by copying the necessary
  source files from Google Drive.

  Copies ``utils.py`` and ``image_preprocessing.py`` from ``{DRIVE_DIR}src/``
  into the current working directory.
  """
  !cp "{DRIVE_DIR}src/utils.py" .
  !cp "{DRIVE_DIR}src/image_preprocessing.py" .


def copy_dataset(filename):
  """
  Copies and unzips dataset from Google Drive and deletes the zipped file

  Parameters
  ----------
  filename : str
    Dataset filename, without its path, the path will be added from constants
  """
  !cp {DRIVE_DIR}{filename} /content/

  !unzip -q /content/{filename}

  !rm /content/{filename}

## Import Python files after copying them from Google Drive

In [5]:
# Copy utils.py and image_preprocessing.py into the working directory so they can be imported
init_file_struct()
# NOTE(review): the star imports hide where names come from; presumably helpers used later
# (get_dataframe, create_generators, create_dataframe_flows, build_model, get_class_weights,
# rescale) are defined in these two modules — TODO confirm and import them explicitly
from utils import *
from image_preprocessing import *

## Copy desired dataset to be used

In [6]:
# Copy and unzip the dataset archives needed for this run;
# uncomment the remaining lines to fetch the other preprocessed variants
copy_dataset('original.zip')
copy_dataset('clahe_2.zip')
# copy_dataset('clahe_10.zip')
# copy_dataset('cropped.zip')

# SETUP

In [None]:
# Functions used to preprocess images according to ImageNet 
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet50_preprocess
from tensorflow.keras.applications.densenet import preprocess_input as densenet_preprocess

In [7]:
# DATAFRAME
BODY_PART = 'SHOULDER' # Which body part to use, can be "ALL" for full dataset
# IMAGE AUGMENTATION
ROTATION_R = 20 # Rotation range
W_SHIFT_R = 0.05 # Width shift range
H_SHIFT_R = 0.05 # Height shift range
BRIGHTNESS_R = (0.9, 1.1) # Brightness range
ZOOM_R = 0.1 # Zoom range
H_FLIP = True # Flip the image horizontally
PRE_FUNC = rescale # Preprocessing function to be applied to every image before feeding it to the model
# DATAFRAME FLOW
FLOW_DIR = MURA_DIR # Directory containing dataset images, choose one from filepath constants or use your own
BATCH_SIZE = 32 # Number of images per batch
IMAGE_SIZE = (224, 224) # Resize all images to this size
# MODEL
BASE_NAME = 'DenseNet169' # Name corresponding to one of the architectures from Keras functional API
WEIGHTS = 'imagenet' # ImageNet pre-trained weights or a path to stored model weights
# Model input shape: IMAGE_SIZE followed by 3 (presumably the channel dimension).
# Derived from IMAGE_SIZE so the flow's target size and the model input cannot diverge.
INPUT_SHAPE = IMAGE_SIZE + (3,)
POOLING = 'avg' # Pooling layer used after the last convolutional layer, can be None
OPTIMIZER = Adam(learning_rate=0.0001) # Optimizer used during training
MODEL_NAME = BASE_NAME + '_tmp' # Name used for storing model weights during and after training
# TRAINING
EPOCHS = 10 # Number of epochs for training

# Additional preprocessing functions

# Create datagens and flows

In [8]:
# Load the train and validation parts of the train-valid-test split (in that order)
train_df, valid_df = (
    get_dataframe(body_part=BODY_PART, split=split_name, path=DATAFRAME_PATH)
    for split_name in ('train', 'valid')
)

# Build the ImageDataGenerators: train_gen applies the online augmentation
# configured in SETUP, valid_gen only preprocesses images
train_gen, valid_gen = create_generators(
    rotation_r=ROTATION_R,
    w_shift_r=W_SHIFT_R,
    h_shift_r=H_SHIFT_R,
    brightness_r=BRIGHTNESS_R,
    zoom_r=ZOOM_R,
    h_flip=H_FLIP,
    pre_func=PRE_FUNC,
)

# Filepaths stored in the dataframes are not absolute, so `directory`
# has to point at the root of the chosen dataset
train_flow, valid_flow = create_dataframe_flows(
    train_gen=train_gen,
    valid_gen=valid_gen,
    train_df=train_df,
    valid_df=valid_df,
    directory=FLOW_DIR,
    img_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
)

Found 8257 validated image filenames belonging to 2 classes.
Found 563 validated image filenames belonging to 2 classes.


# Build and compile model

In [9]:
# Assemble and compile the network from the SETUP constants.
# Cohen's kappa is tracked under the metric name 'kappa'.
model = build_model(
    base_name=BASE_NAME,
    weights=WEIGHTS,
    shape=INPUT_SHAPE,
    pooling=POOLING,
    optimizer=OPTIMIZER,
    name=MODEL_NAME,
    add_top=False,
    metrics=[CohenKappa(num_classes=2, name='kappa')],
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5


# Training

In [None]:
# Checkpoint: keep only the weights of the epoch with the highest validation kappa
best_path = f'{CHECKPOINT_DIR}{MODEL_NAME}_best.h5'
checkpoint_best = ModelCheckpoint(
    filepath=best_path,
    save_weights_only=True,
    monitor='val_kappa',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Optional - make checkpoint of weights every epoch in case Google Colab runtime disconnects
# last_path = CHECKPOINT_DIR + MODEL_NAME + '_last.h5'
# checkpoint_last = ModelCheckpoint(filepath=last_path,
#                                   save_weights_only=True,
#                                   verbose=0)

# Early stopping: stop after 5 epochs without a val_kappa improvement
# and restore the weights of the best epoch
early_stop = EarlyStopping(
    monitor='val_kappa',
    mode='max',
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Train with class weights computed from the training dataframe
hist = model.fit(
    x=train_flow,
    epochs=EPOCHS,
    validation_data=valid_flow,
    class_weight=get_class_weights(train_df),
    verbose=1,
    callbacks=[checkpoint_best, early_stop],
)

## Save last epoch weights and training history

In [57]:
# Store the model's current weights under the '_last' suffix
model.save_weights(f'{CHECKPOINT_DIR}{MODEL_NAME}_last.h5')

# Pickle the per-epoch metrics dictionary so the run can be visualized later
with open(f'{CHECKPOINT_DIR}{MODEL_NAME}_hist', 'wb') as file_pi:
    pickle.dump(hist.history, file_pi)

In [None]:
# Plot the history of the training run above. Previously `history` was never
# assigned, so this cell raised NameError on a fresh run.
history = hist.history
# Alternatively, uncomment to visualize a history saved by an earlier run instead
# (only unpickle files you created yourself - pickle can execute arbitrary code):
# history = pickle.load(open('/content/drive/MyDrive/MURA/models/ResNet50/ResNet50_avg_imagenet_eq_hist_hist', "rb"))

# Number of epochs actually recorded; can be lower than EPOCHS when early stopping triggers
epochs_run = len(history['kappa'])

plt.plot(history['kappa'], label='train_kappa')
plt.plot(history['val_kappa'], label='val_kappa')
plt.xlabel('Epoch')
plt.ylabel("Cohen's kappa")
plt.legend(loc='right')
# One tick per recorded epoch instead of a hard-coded 10
plt.xticks(np.arange(epochs_run))

plt.show()

# Evaluation
Evaluate the trained model on the test set. You can also load weights from previous runs and evaluate those instead.

In [None]:
# Load the test part of the train-valid-test split
test_df = get_dataframe(body_part=BODY_PART, split='test', path=DATAFRAME_PATH)

# The test flow reuses valid_gen, i.e. images are only preprocessed, not augmented
test_flow = valid_gen.flow_from_dataframe(
    dataframe=test_df,
    directory=FLOW_DIR,
    x_col='filepath',
    y_col='label',
    class_mode='binary',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    seed=27,
    shuffle=False,  # Keep dataframe order so images can be paired with their filepaths
)

# Optional load weights from previous training
# model.load_weights('/content/drive/MyDrive/MURA/models/DenseNet169_tmp_best.h5')
model.evaluate(x=test_flow, verbose=1)