# Transfer learning using Keras

## Imports and constants

In [1]:
%load_ext autoreload
%autoreload 2
%cd /home/jeremy_damien_guillon/owkin-challenge-data

import owkin.utils as utils
from pathlib import Path
import logging
import numpy as np
import pandas as pd

/home/jeremy_damien_guillon/owkin-challenge-data


In [2]:
# --- data properties ---------------------------------------------------------
# Height and width of our images (in pixels).
HEIGHT, WIDTH = 224, 224
# Number of channels of our images (i.e. RGB).
CHANNELS = 3
# Data directory where the .csv files are stored.
DATA_DIR = 'data'
# Directories where the training and the test images are stored.
IMAGES_DIR = 'data/train_input/images'
TEST_IMAGES_DIR = 'data/test_input/images'
# Output model filename.
BEST_MODEL_FILENAME = 'models/best_transfer_learning.hd5'

# --- deep neural network architecture ----------------------------------------
# Number of units in the last hidden dense layer.
HIDDEN_LAYER_SIZE = 32
# Number of classes to predict (here "non-tumoral" and "tumoral").
OUTPUT_LAYER_SIZE = 2
# Global pooling applied at the end of ResNet50: 'avg' or 'max'.
RESNET_POOLING = 'avg'

# --- training procedure hyperparameters --------------------------------------
# Proportion of images reserved for the validation dataset.
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 8
NUM_EPOCHS = 10
# Number of batches drawn per epoch for training and for validation.
STEPS_PER_EPOCH_TRAINING = 1000
STEPS_PER_EPOCH_VALIDATION = 10
LOSS_FUNCTION = 'categorical_crossentropy'
METRICS = ['accuracy']

# --- optimizer hyperparameters -----------------------------------------------
LEARNING_RATE = 0.01
DECAY = 1e-6
MOMENTUM = 0.9

# --- others ------------------------------------------------------------------
# Seed passed to the image generators for reproducibility.
SEED = 42

## Model definition

We want to use a version of ResNet50 pretrained on the ImageNet dataset and adapt it to classify tumoral versus non-tumoral tiles. To do so, we add fully-connected (FC, or Dense) layers on top of it and train only those layers on our annotated dataset; the ResNet50 weights themselves stay frozen.

In [3]:
from keras.applications import ResNet50
from keras.models import Sequential
from keras.layers import Dense

def build_model():
    """Assemble the transfer-learning classifier.

    The network is a ResNet50 base (ImageNet weights, no classification top,
    global pooling given by `RESNET_POOLING`) followed by a ReLU dense layer of
    `HIDDEN_LAYER_SIZE` units and a softmax dense layer of `OUTPUT_LAYER_SIZE`
    units. The ResNet50 base is frozen, so only the two dense layers are
    trainable.

    Returns:
        The uncompiled Keras `Sequential` model.
    """
    backbone = ResNet50(
        weights='imagenet',
        include_top=False,
        input_shape=(HEIGHT, WIDTH, CHANNELS),
        pooling=RESNET_POOLING,
    )
    hidden_layer = Dense(HIDDEN_LAYER_SIZE, activation='relu')
    output_layer = Dense(OUTPUT_LAYER_SIZE, activation='softmax')

    classifier = Sequential([backbone, hidden_layer, output_layer])

    # The ResNet50 base (first layer) is already pre-trained on ImageNet:
    # freeze it so that training only updates the dense layers.
    classifier.layers[0].trainable = False

    return classifier

# Instantiate the network and print its layers and parameter counts.
model = build_model()
model.summary()

Using TensorFlow backend.
Instructions for updating:
Colocations handled automatically by placer.[0m


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Model)             (None, 2048)              23587712  
_________________________________________________________________
dense_1 (Dense)              (None, 32)                65568     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 23,653,346
Trainable params: 65,634
Non-trainable params: 23,587,712
_________________________________________________________________


## Image generator

Since our dataset contains "only" around 10k images, we might need to *augment* it: this is **data augmentation**. In our case, it consists of randomly rotating or flipping the tiles so that they look like different images.

In [4]:
# Path of the tile annotations file, located under the training input directory.
annot_tiles_filename = f"{DATA_DIR}/train_input/train_tile_annotations.csv"
logging.debug(f'reading csv file: {annot_tiles_filename}')

# Load the annotations into a DataFrame.
annot_tiles_df = pd.read_csv(annot_tiles_filename)

Let's visualize the `csv` file content:

In [5]:
# Preview the first rows of the annotations table.
annot_tiles_df.head()

Unnamed: 0.1,Unnamed: 0,Target
0,ID_387_annotated_tile_0_15_69_30.jpg,0.0
1,ID_387_annotated_tile_1_15_23_53.jpg,0.0
2,ID_387_annotated_tile_2_15_58_20.jpg,0.0
3,ID_387_annotated_tile_3_15_67_12.jpg,0.0
4,ID_387_annotated_tile_4_15_57_20.jpg,0.0


We have to convert the `Target` column to `str` for the `ImageDataGenerator`.

In [6]:
# The image generator needs the labels as strings, so cast the numeric
# `Target` column to `str` (e.g. 0.0 -> '0.0'). Re-running this cell is
# harmless: values that are already strings are left unchanged.
annot_tiles_df['Target'] = annot_tiles_df['Target'].astype(str)

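We create an image generator that loads batches of images into memory on the fly, either raw or augmented (i.e. artificial). The dataset is split into training and validation sets according to the `VALIDATION_SPLIT` parameter. Note that Keras does not shuffle the rows before splitting: the validation set is a contiguous slice of the dataframe, and the `seed` argument only controls batch shuffling and the random augmentations.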

In [7]:
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator

# Training images: ResNet50 preprocessing plus random augmentation
# (rotations up to 90 degrees, horizontal and vertical flips).
data_generator = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=VALIDATION_SPLIT,
    rotation_range=90,
    horizontal_flip=True,
    vertical_flip=True
)

# Validation images: same preprocessing and same split fraction, but NO
# augmentation, so that validation metrics are measured on the real tiles.
# The split only depends on `validation_split` and on the row order of the
# dataframe, so both generators partition the data consistently (no overlap).
validation_data_generator = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=VALIDATION_SPLIT
)

# Options shared by the training and the validation flows.
flow_options = dict(
    dataframe=annot_tiles_df, x_col='Unnamed: 0', y_col='Target',
    directory=IMAGES_DIR,  # directory holding the files whose names are listed in `x_col`
    seed=SEED,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

training_generator = data_generator.flow_from_dataframe(
    subset='training',  # set it as the training dataset
    **flow_options)

validation_generator = validation_data_generator.flow_from_dataframe(
    subset='validation',  # set it as the validation dataset
    **flow_options)

Found 6607 images belonging to 2 classes.
Found 1651 images belonging to 2 classes.


## Model training

### Optimizer and callbacks

In [8]:
import tensorflow as tf
import random, string

# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    """Keras-compatible AUC metric built on TensorFlow's streaming AUC.

    Meant to be passed in `model.compile(metrics=[...])`; Keras then reports it
    under the function name (`auc_roc`, and `val_auc_roc` on validation data).

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of model predictions.

    Returns:
        A tensor holding the AUC value, wired so that evaluating it first runs
        the metric's update op.

    NOTE(review): the metric is *streaming*, i.e. it accumulates statistics
    across calls; presumably it is never reset between epochs — confirm.
    """
    # any tensorflow metric
    # (streaming_auc takes the predictions first, then the labels)
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric: keep the local variables whose
    # second '/'-separated name component contains 'auc_roc'
    # NOTE(review): raises IndexError if any local variable name has no '/' — confirm this cannot happen
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values: the returned tensor depends on `update_op`,
    # so the update runs whenever the value is evaluated
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value
    
def random_id(length=4):
    """Return a random identifier made of `length` lowercase ASCII letters.

    Letters are drawn one at a time from the global `random` generator, so the
    result is only reproducible if that generator has been seeded.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [9]:
from keras.optimizers import SGD, Adam

# SGD with Nesterov momentum; hyperparameters come from the constants cell.
sgd = SGD(lr=LEARNING_RATE, decay=DECAY, momentum=MOMENTUM, nesterov=True)
# Track the configured metrics plus the custom streaming AUC.
model.compile(optimizer=sgd, loss=LOSS_FUNCTION, metrics=METRICS + [auc_roc])

from keras.callbacks import ModelCheckpoint, TensorBoard

# Make sure the checkpoint directory exists, otherwise saving the model at the
# end of an epoch fails.
Path(BEST_MODEL_FILENAME).parent.mkdir(parents=True, exist_ok=True)

# Keep only the model with the best validation AUC. `mode='max'` is required:
# with the default `mode='auto'`, Keras only treats a monitored quantity as
# "higher is better" when its name contains 'acc' (or starts with 'fmeasure'),
# so 'val_auc_roc' would be *minimised* and the worst model would be kept.
checkpoint_cb = ModelCheckpoint(filepath=BEST_MODEL_FILENAME,
                                monitor='val_auc_roc',
                                mode='max',
                                save_best_only=True)
# Log to a randomly named sub-directory of `logs/` so that runs do not overwrite each other.
tensorboard_cb = TensorBoard(log_dir=f'logs/{random_id()}',
                             batch_size=BATCH_SIZE,
                             write_graph=True,
                             update_freq='batch')

Instructions for updating:
Please switch to tf.metrics.auc. Note that the order of the labels and predictions arguments has been switched.[0m
Instructions for updating:
Use tf.cast instead.[0m
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.[0m



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



### Training

In [None]:
# Training options: number of batches per epoch for training and validation,
# number of epochs, validation data and the callbacks defined above.
fit_options = dict(
    steps_per_epoch=STEPS_PER_EPOCH_TRAINING,
    epochs=NUM_EPOCHS,
    validation_data=validation_generator,
    validation_steps=STEPS_PER_EPOCH_VALIDATION,
    callbacks=[checkpoint_cb, tensorboard_cb],
)

# Train the model on batches produced on the fly by the training generator.
model.fit_generator(training_generator, **fit_options)

Instructions for updating:
Use tf.cast instead.[0m


Epoch 1/10

## Test and submission

In [4]:
# Restore the weights from the file written by the checkpoint callback.
# `model` must already have been built in this kernel session.
model.load_weights(BEST_MODEL_FILENAME)

In [6]:
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator

# Inference-time generator: ResNet50 preprocessing only, no augmentation.
test_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

# NOTE: this overrides the `TEST_IMAGES_DIR` constant from the configuration
# cell with its parent directory (presumably because `flow_from_directory`
# scans the sub-directories of the directory it is given — confirm).
TEST_IMAGES_DIR = 'data/test_input'

# No labels are yielded (`class_mode=None`) and files are read in a fixed
# order (`shuffle=False`).
test_flow_options = dict(
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    shuffle=False,
    class_mode=None,
)
test_generator = test_data_generator.flow_from_directory(TEST_IMAGES_DIR, **test_flow_options)


Found 51772 images belonging to 2 classes.


In [8]:
# Predict class probabilities for the test images.
# NOTE(review): `steps=10` limits prediction to 10 batches (10 * BATCH_SIZE images),
# i.e. a small sample of the test set — presumably a smoke test; predicting
# every image would need `steps=len(test_generator)`.
y_test = model.predict_generator(test_generator, steps=10, verbose=1)



In [12]:
# Show the predicted probabilities (one row per image, one column per output unit).
y_test

array([[9.93449628e-01, 6.55034836e-03],
       [9.98783410e-01, 1.21659075e-03],
       [9.86834586e-01, 1.31654376e-02],
       [9.96193886e-01, 3.80612840e-03],
       [9.89546120e-01, 1.04538472e-02],
       [9.82967377e-01, 1.70326997e-02],
       [9.75464821e-01, 2.45352443e-02],
       [9.79764879e-01, 2.02351399e-02],
       [9.64974940e-01, 3.50250416e-02],
       [9.86985743e-01, 1.30142840e-02],
       [9.67546642e-01, 3.24533433e-02],
       [9.73244905e-01, 2.67551243e-02],
       [9.33049142e-01, 6.69509098e-02],
       [9.99437511e-01, 5.62413363e-04],
       [9.94153798e-01, 5.84619027e-03],
       [9.30909395e-01, 6.90905973e-02],
       [9.73405123e-01, 2.65948549e-02],
       [9.68381107e-01, 3.16189565e-02],
       [9.73255038e-01, 2.67449953e-02],
       [9.80822027e-01, 1.91780049e-02],
       [9.95320141e-01, 4.67987871e-03],
       [9.26325381e-01, 7.36746490e-02],
       [9.55301762e-01, 4.46982682e-02],
       [9.88095284e-01, 1.19047537e-02],
       [9.971699

## What's left to do

- [ ] Optimize, optimize, optimize the hyperparameters!
- [ ] Predict tile labels on the test data using a DataFrame to keep track of the subject IDs
- [ ] Infer weak labels of test subjects
- [ ] Submit the result