<a href="https://colab.research.google.com/github/issondl/from-data-to-solution-2021/blob/main/3_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model

## Check for acceleration

In [None]:
!nvidia-smi

## Imports

In [None]:
import numpy as np
np.random.seed(2021)
import tensorflow as tf
tf.random.set_seed(2021)
import random
random.seed(2021)

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import math

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow.keras.layers as L
from keras.layers import Dense
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from tensorflow.compat.v1.logging import INFO, set_verbosity
from tqdm import tqdm

from IPython.display import YouTubeVideo

set_verbosity(INFO)

## Constants

In [None]:
# Root folder for everything the notebook downloads; created eagerly so the
# download cells below can write into it.
DATA_DIR = 'data/'
os.makedirs(DATA_DIR, exist_ok=True)

# All derived paths live directly under DATA_DIR.
CSV_FILE, IMAGES_ARCHIVE_FILE, IMAGES_DIR = [
    os.path.join(DATA_DIR, entry)
    for entry in (
        'nih_chest_xray_single_9c_bb_onehot.csv',
        'nih_chest_xray_single_9c_256x256.tar.gz',
        'images',
    )
]

## Download the prepared dataset

In [None]:
if not os.path.exists(CSV_FILE):
    ! gdown --id 1i7oUN9QTjOavTPGgvWKq22InrTFN6mYH -O $CSV_FILE
else:
    print('CSV file ({}) already exists.'.format(CSV_FILE))

In [None]:
if not os.path.exists(IMAGES_ARCHIVE_FILE):
    ! gdown --id 1Cg7dbE1tWSBvdTfGc0G272SA_j_XocOW -O $IMAGES_ARCHIVE_FILE
else:
    print('Images archive file ({}) already exists.'.format(IMAGES_ARCHIVE_FILE))

In [None]:
if not os.path.exists(IMAGES_DIR):
    ! tar -xzf $IMAGES_ARCHIVE_FILE
    print('Unpacked to {}'.format(IMAGES_DIR))
else:
    print('Images have already been unpacked ({}).'.format(IMAGES_DIR))

## Explore the dataset

Tasks:

1. Read the CSV file and explore the data


In [None]:
## Read the CSV file


### Prepare Data for Training and Testing

Tasks:

1. Create new dataframe with only 'Pneumothorax' and 'Mass' labels.
2. Split the two-class dataframe into train, validation and test dataframes with `train_test_split`. Each call will split the data into two, so we need to call it twice to get `train`, `val` and `test` sets.

In [None]:
## Create new dataframe with only 'Pneumothorax' and 'Mass' labels


In [None]:
TEST_SIZE = 0.15
VAL_SIZE = 0.15

In [None]:
## Split the two-class dataframe into train, validation and test dataframes with
## `train_test_split`. Each call will split the data into two, so we need to call
## it twice to get `train`, `val` and `test` sets.


In [None]:
# Row counts: the full dataframe first, then each split.
split_frames = (train_df, val_df, test_df)

print(len(df))
for frame in split_frames:
    print(len(frame))
print()
# Size of each split as a percentage of the full dataframe.
for frame in split_frames:
    print(len(frame)/len(df)*100)

In [None]:
# Frequency of each 'Finding Labels' value in the training split: printed
# first, then drawn as a bar chart.
label_counts = train_df['Finding Labels'].value_counts()
print(label_counts)
label_counts.plot(kind='bar')

## Create data pipeline

Tasks:

1. Create data pipeline for training
   
   Use `ImageDataGenerator` and `flow_from_dataframe` to build an image pre-processing pipeline that will feed training with structured data.

2. Create data pipeline for testing and validation

   Do the same for testing and validation. Compute the quantities required for featurewise normalization (mean and standard deviation) on the training data, and fit the test/validation generators on that data.

Useful resources:

* https://keras.io/api/preprocessing/image/
* https://fairyonice.github.io/Learn-about-ImageDataGenerator.html


In [None]:
## Create data pipeline for training

BATCH_SIZE = 32
IMAGE_SIZE = [128, 128]

def get_train_generator(df, labels, batch_size, image_size,
                        color_mode='grayscale', file_path_col='File Path',
                        seed=2021):
    """Build the augmenting, shuffling image generator used for training.

    Every image is normalized on its own (zero mean, unit std) and randomly
    augmented with small rotations, shifts, shear, zoom and horizontal flips.

    Args:
        df: dataframe holding the image paths and the target columns.
        labels: string or list naming the target column(s) in `df`.
        batch_size: number of images per batch.
        image_size: (height, width) every image is resized to.
        color_mode: one of "grayscale", "rgb", "rgba".
        file_path_col: column of `df` that contains the image paths.
        seed: random seed for shuffling and transformations.

    Returns:
        The iterator produced by `ImageDataGenerator.flow_from_dataframe`,
        yielding batches with the raw values of the `labels` column(s) as targets.
    """
    image_generator = ImageDataGenerator(
        samplewise_center=True,  # Set each sample mean to 0.
        samplewise_std_normalization=True,  # Divide each input by its std
        rotation_range=5,  # Degree range for random rotations
        width_shift_range=0.1,  # fraction of total width
        height_shift_range=0.05,  # fraction of total height
        shear_range=0.1,  # Shear Intensity (Shear angle in counter-clockwise direction in degrees)
        zoom_range=0.15,  # Range for random zoom
        fill_mode='reflect',  # fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}
        horizontal_flip=True,  # Randomly flip inputs horizontally.
        vertical_flip=False,  # Randomly flip inputs vertically.
    )

    generator = image_generator.flow_from_dataframe(
        dataframe=df,  # Pandas dataframe containing the filepaths relative to directory (or absolute paths if directory is None)
        x_col=file_path_col,  # column in dataframe that contains the filenames
        y_col=labels,  # string or list, column/s in dataframe that has the target data.
        class_mode='raw',  # "raw": numpy array of values in y_col column(s)
        batch_size=batch_size,  # size of the batches of data
        shuffle=True,  # whether to shuffle the data (default: True)
        seed=seed,  # optional random seed for shuffling and transformations.
        target_size=image_size,  # tuple of integers (height, width), default: (256, 256). The dimensions to which all images found will be resized.
        color_mode=color_mode,  # one of "grayscale", "rgb", "rgba"
    )
    return generator

train_generator = get_train_generator(
    df=train_df,
    labels=LABELS,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
)

In [None]:
def get_label(label_encoding, labels):
    """Return the label at the position of the first truthy entry of `label_encoding`.

    Falls back to the string 'No Label' when no entry is truthy.
    """
    first_hit = next(
        (pos for pos, flag in enumerate(label_encoding) if flag),
        None,
    )
    if first_hit is None:
        return 'No Label'
    return labels[first_hit]

# Preview a few pre-processed images from the first training batch together
# with their decoded labels.
x, y = train_generator[0]

samples = 2

fig = plt.figure(figsize=(20, 10))
for i in range(samples):
    label = get_label(y[i], LABELS)
    img_processed = x[i].squeeze()

    # add_subplot takes (nrows, ncols, index): lay the images out in one row
    # of `samples` columns and give each image its own 1-based slot, so the
    # axes no longer overlap in slot 1.
    fig.add_subplot(1, samples, i + 1)
    plt.imshow(img_processed, cmap='gray')
    plt.title('{}'.format(label))
    plt.axis(False)

In [None]:
## Create data pipeline for testing and validation
def get_test_and_valid_generator(val_df, test_df, train_df, labels, batch_size, image_size,
                                 color_mode='grayscale', file_path_col='File Path',
                                 sample_size=100):
    """Build validation and test generators normalized with training-set statistics.

    One batch of `sample_size` un-augmented training images is drawn and used
    to fit the featurewise mean and standard deviation. Both returned
    generators apply that normalization and iterate without shuffling.

    Args:
        val_df: dataframe describing the validation images.
        test_df: dataframe describing the test images.
        train_df: dataframe describing the training images (used only to fit
            the normalization statistics).
        labels: string or list naming the target column(s).
        batch_size: number of images per validation/test batch.
        image_size: (height, width) every image is resized to.
        color_mode: one of "grayscale", "rgb", "rgba".
        file_path_col: column that contains the image paths.
        sample_size: number of training images used to estimate mean and std.

    Returns:
        Tuple `(valid_generator, test_generator)`.
    """
    raw_train_generator = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_df,
        x_col=file_path_col,
        y_col=labels,
        class_mode="raw",
        batch_size=sample_size,  # sample size, only the first batch is loaded
        shuffle=True,
        seed=2021,
        target_size=image_size,
        color_mode=color_mode,
    )

    # get data sample; the built-in next() is used because the iterator's own
    # .next() method is not provided by every Keras release
    batch = next(raw_train_generator)
    data_sample = batch[0]

    # use sample to fit mean and std for the validation and test generators
    image_generator = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization=True
    )

    # fit generator to sample from training data
    image_generator.fit(data_sample)

    # get validation generator
    valid_generator = image_generator.flow_from_dataframe(
        dataframe=val_df,
        x_col=file_path_col,
        y_col=labels,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=False,
        target_size=image_size,
        color_mode=color_mode,
    )

    # get test generator
    test_generator = image_generator.flow_from_dataframe(
        dataframe=test_df,
        x_col=file_path_col,
        y_col=labels,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=False,
        target_size=image_size,
        color_mode=color_mode,
    )
    return valid_generator, test_generator

val_generator, test_generator= get_test_and_valid_generator(
    val_df=val_df,
    test_df=test_df,
    train_df=train_df,
    labels=LABELS,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
)


## Training

In [None]:
def visualize_training(history):
    """Draw training-vs-validation curves: one figure for accuracy, one for loss.

    Args:
        history: object whose `history` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values.
    """
    # (training key, validation key, figure title, y-axis label) per figure
    panels = (
        ('accuracy', 'val_accuracy', 'Training Accuracy vs Validation Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training Loss vs Validation Loss', 'Loss'),
    )

    for train_key, val_key, title, y_label in panels:
        plt.figure(figsize=(10,6))
        plt.plot(history.history[train_key], label='training', marker='*', linewidth=3)
        plt.plot(history.history[val_key], label='validation', marker='o', linewidth=3)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend(fontsize='x-large')
        plt.show()

In [None]:
EPOCHS = 10

### Shallow Model

Tasks:

1. Create a shallow model based on LeNet-5

In [None]:
%%time
##

model_1 = tf.keras.Sequential(
    layers=[
        ## START

        ## END
    ],
    name='Model_Shallow',
)

model_1.compile(
    ## START

    ## END
)

history_1 = model_1.fit(
    ## START

    ## END
)

visualize_training(history_1)


### Regularization

Tasks:
1. Try to fix the model's poor performance using common regularization techniques: dropout and L1/L2 regularization.

Useful resources:

* https://keras.io/api/layers/regularization_layers/dropout/
* https://keras.io/api/layers/regularizers/

In [None]:
%%time
##

model_dropout = tf.keras.Sequential(
    layers = [
        ## START

        ## END
    ],
    name='Model_Dropout',
)

model_dropout.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics = ['accuracy'],
)

history_dropout = model_dropout.fit(
    train_generator,
    validation_data=val_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    epochs=EPOCHS,
)

visualize_training(history_dropout)


In [None]:
%%time

from tensorflow.keras.regularizers import l1_l2 as L1L2
from tensorflow.keras.regularizers import l2 as L2

model_reg = tf.keras.Sequential(
    layers=[
        ## START

        ## END
    ],
    name='Model_Regularization',
)

model_reg.summary()

model_reg.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics = ['accuracy'],
)

history_reg = model_reg.fit(
    train_generator,
    validation_data=val_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    epochs=EPOCHS,
)

visualize_training(history_reg)


### Early Stopping

Tasks:

1. Try to reduce training time by using early stopping once the model reaches satisfactory performance.

Useful resources:

* https://keras.io/api/callbacks/early_stopping/

In [None]:
%%time
from tensorflow.keras.callbacks import EarlyStopping

# Fully specified CNN for the early-stopping exercise: two convolution +
# max-pooling stages, then two regularized dense layers each followed by
# dropout, and a softmax output with one unit per entry in LABELS.
# NOTE(review): the compile step below pairs this softmax output with
# BinaryCrossentropy - confirm that this matches how LABELS is encoded.
model_earlystop = tf.keras.Sequential(
    layers=[
        # Input is a single-channel image of size IMAGE_SIZE.
        L.Conv2D(filters=32, kernel_size=(5, 5), activation='relu', input_shape=(*IMAGE_SIZE, 1)),
        L.MaxPool2D(),
        L.Conv2D(filters=48, kernel_size=(5, 5), activation='relu'),
        L.MaxPool2D(),
        L.Flatten(),
        # Dense layers combine an L1+L2 penalty on the weights with an L2
        # penalty on the activations.
        L.Dense(units=256, activation='relu',
                kernel_regularizer=L1L2(l1=1e-5, l2=1e-4),
                activity_regularizer=L2(1e-5)
        ),
        L.Dropout(0.2),
        L.Dense(units=84, activation='relu',
                kernel_regularizer=L1L2(l1=1e-5, l2=1e-4),
                activity_regularizer=L2(1e-5)
        ),
        L.Dropout(0.2),
        # One output unit per label.
        L.Dense(len(LABELS), activation='softmax'),
    ],
    name='Model_EarlyStop'
)

model_earlystop.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

## START
early_stopping_custom =

## END

history_earlystop = model_earlystop.fit(
    train_generator,
    validation_data=val_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    epochs=EPOCHS,
    ## START
    
    ## END
)

visualize_training(history_earlystop)


### Transfer Learning

Tasks:

1. To further improve the model's performance, change the approach: instead of implementing your own model and training it from scratch, use an existing model with pre-trained weights.

In [None]:
!pip install -q efficientnet

In [None]:
import efficientnet.tfkeras as efn

In [None]:
IMAGE_SIZE = [240, 240]  # https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/

In [None]:
model_transfer = tf.keras.Sequential(
    layers=[
        ## START

        ## END
    ],
    name='Model_TransferLearning',
)

In [None]:
model_transfer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
train_generator = get_train_generator(
    df = train_df,
    file_path_col="File Path",
    labels=LABELS,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    color_mode='rgb',
)

val_generator, test_generator= get_test_and_valid_generator(
    val_df=val_df,
    test_df=test_df,
    train_df=train_df,
    file_path_col="File Path",
    labels=LABELS,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    color_mode='rgb',
)

In [None]:
#@title
YouTubeVideo('QIlR0rX3Zx8')

In [None]:
# Train the transfer-learning model with the shared EPOCHS budget used by the
# other experiments (instead of a separate literal), reusing the
# `early_stopping_custom` callback created in the Early Stopping section.
history = model_transfer.fit(
    train_generator,
    validation_data=val_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    epochs=EPOCHS,
    callbacks=[early_stopping_custom],
)

visualize_training(history)