In [None]:
# Verify that a GPU is available before doing any training.
# TensorFlow is imported here (in addition to the main import cell further down)
# because this cell runs first; without it `tf` is undefined on a fresh kernel.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  # Fail fast: the rest of the notebook assumes GPU training
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

# Default Colab mount point for Google Drive
ROOT = "/content/drive"

# Optional: show where the drive will be mounted
print(ROOT)

# Mount the drive at ROOT
drive.mount(ROOT)

In [None]:
# Load the project helper library (pcamlib.py in the working directory) as a module.
# The `imp` module is deprecated and was removed in Python 3.12, so the empty
# module object is created with `types.ModuleType` instead of `imp.new_module`.
import types

pcamlib = types.ModuleType('pcamlib')

# Read the source inside a `with` block so the file handle is always closed.
with open("./pcamlib.py") as pcamlib_source:
    # Passing the path to compile() makes tracebacks point into pcamlib.py
    exec(compile(pcamlib_source.read(), "./pcamlib.py", "exec"), pcamlib.__dict__)

# Every later cell calls the helpers through the short alias `pc`,
# so bind it here; otherwise those cells fail with a NameError.
pc = pcamlib

In [None]:
# import pcamlib as pc

In [None]:
# Import basic data science packages
import numpy as np
import pandas as pd

# Import plotting packages
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Import tensorflow packages
import tensorflow as tf
import tensorflow_datasets as tfds

# Import various keras tools
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Import tools for model evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,\
roc_curve, roc_auc_score, classification_report, precision_score, recall_score

To get started with this dataset, I adapted the code from this [article](https://geertlitjens.nl/post/getting-started-with-camelyon/) written by Geert Litjens, one of the authors of the dataset.

I used his code for the `train_pipeline`, `valid_pipeline`, and `test_pipeline`, which load the train, validation, and test sets and prepare them for modelling. I also make use of his function `convert_sample`. This function extracts each image and its corresponding label from the dataset, converts each image to a TensorFlow `tf.float32` datatype, then performs one-hot encoding on the labels and converts them to `tf.float32` as well.

In [None]:
# Load the PCam data through the pcamlib helper. The call returns two objects,
# unpacked here as the dataset (`pcam`) and its accompanying info (`pcam_info`).
# NOTE(review): the exact return types are defined in pcamlib.load_pcam — confirm there.
pcam, pcam_info = pc.load_pcam()

In [None]:
# Build the train, validation and test input pipelines from the loaded dataset.
# NOTE(review): batching/shuffling details live in pcamlib.build_pipelines — confirm there.
train_pipeline, valid_pipeline, test_pipeline = pc.build_pipelines(pcam)

I also used Geert Litjens' CNN layer architecture as a starting point. It resembles a VGG16 architecture because it has three sets of two Convolutional layers followed by a single Max Pooling layer, followed by a Flattening layer and two Dense layers before the final Dense layer which outputs the class predictions. I kept the layer parameters the same as his example.

I changed the optimizer from `SGD` to `Adam` simply because his `SGD` optimizer came with several hand-set hyperparameters, and I wanted to experiment with those choices on my own. I also added Dropout layers after each convolutional layer, because the first iteration of the model started overfitting quickly after the first epoch and the validation accuracy didn't improve beyond 80%.

In [None]:
# Images are 96x96 px, in RGB so there are 3 channels
image_shape = (96, 96, 3)

# Build the network in a single Sequential call: three convolutional blocks
# (two 3x3 convolutions + one 2x2 max pool each), then the dense classifier head.
# A Dropout layer follows every convolutional and hidden dense layer to reduce overfitting.
cnn = Sequential([
    # Block 1: 16 filters
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='valid', input_shape=image_shape),
    Dropout(0.2),
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='valid'),
    Dropout(0.2),
    # Max pooling reduces the dimensions of the feature maps
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Block 2: 32 filters
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='valid'),
    Dropout(0.2),
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='valid'),
    Dropout(0.2),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Block 3: 64 filters
    Conv2D(64, kernel_size=(3, 3), activation='relu', padding='valid'),
    Dropout(0.2),
    Conv2D(64, kernel_size=(3, 3), activation='relu', padding='valid'),
    Dropout(0.2),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Flatten the feature maps to prepare for the dense layers
    Flatten(),

    Dense(256, activation='relu'),
    Dropout(0.2),

    Dense(128, activation='relu'),
    Dropout(0.2),

    # Final Dense layer to make class predictions (two classes, softmax output)
    Dense(2, activation='softmax'),
])

cnn.summary()

In [None]:
# For comparison, this commented line is the original optimizer used in the article:
# sgd_opt = SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=True)
cnn.compile(optimizer='Adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])

# Early stopping: end training once validation accuracy has failed to improve by
# at least 0.01 for 3 epochs, to limit overfitting and wasted compute.
# restore_best_weights=True rolls the model back to the best epoch seen; without
# it the model keeps the weights of the last epoch run, i.e. the ones obtained
# after `patience` epochs with no improvement.
early_stop = EarlyStopping(monitor='val_accuracy', min_delta=0.01, patience=3,
                           verbose=1, restore_best_weights=True)

In [None]:
%%time
# Train for at most 15 epochs; the early-stopping callback may end training sooner.
# steps_per_epoch / validation_steps set how many batches are drawn per epoch
# from the training and validation pipelines respectively.
# NOTE(review): 4096 and 256 presumably equal split size / batch size as configured
# in pcamlib.build_pipelines — confirm there.
history = cnn.fit(train_pipeline,
                   validation_data=valid_pipeline,
                   verbose=1, epochs=15, steps_per_epoch=4096, validation_steps=256,
                   callbacks=[early_stop])

In [None]:
# Save the fitted model under the path 'cnn1' (the same path is read back by
# tf.keras.models.load_model further down).
# NOTE(review): the path has no file extension; newer Keras releases may require
# a '.keras' or '.h5' suffix — confirm against the installed version.
cnn.save('cnn1')

In [None]:
# Save the training history to a csv so it can be reloaded later
# (see pc.load_history on the same path) without retraining.
pc.save_history(history, 'data/models/history/cnn1_history.csv')

In [None]:
# Reload the saved model and its training history from file.
# NOTE: both lines are active (not commented out), so `cnn` is replaced by the
# model read from "cnn1". `hist_df` is needed by the history plot that follows.
cnn = tf.keras.models.load_model("cnn1")
hist_df = pc.load_history('data/models/history/cnn1_history.csv')

In [None]:
# Plot the reloaded training history, using 'CNN1' as the title argument
pc.plot_history(hist_df, title='CNN1')

In [None]:
%%time
# Report the model's accuracy on the held-out test pipeline (timed with %%time)
pc.print_test_accuracy(cnn, test_pipeline)

In [None]:
%%time
# Generate predicted probabilities for the test pipeline.
# NOTE(review): save=True with `filepath` presumably writes the probabilities to
# that csv (the same path pc.load_y_proba reads); the meaning of class_1=False is
# defined in pcamlib.generate_y_proba — confirm there.
y_proba = pc.generate_y_proba(cnn, test_pipeline, class_1=False, save=True, filepath='data/y_proba/cnn1_y_proba.csv')

In [None]:
# Uncomment to load y_proba from file if not running the model
# y_proba = pc.load_y_proba('data/y_proba/cnn1_y_proba.csv')
# y_proba

In [None]:
%%time
# Derive predicted labels from the predicted probabilities
# NOTE(review): the decision rule is defined in pcamlib.generate_y_pred — confirm there.
y_pred = pc.generate_y_pred(y_proba)

In [None]:
%%time
# Extract the ground-truth labels from the dataset for comparison with y_pred
# NOTE(review): presumably taken from the test split — confirm in pcamlib.generate_y_true.
y_true = pc.generate_y_true(pcam)

In [None]:
# Plot the confusion matrix of true vs. predicted labels (normalize=True requested)
pc.plot_cf_matrix(y_true, y_pred, normalize=True)

In [None]:
# Print the classification report for the true vs. predicted labels
pc.print_classification_report(y_true, y_pred)

In [None]:
# Plot the ROC curve from the true labels and the predicted probabilities
pc.plot_roc_curve(y_true, y_proba)

In [None]:
# Show images whose predicted label differs from the true label
# NOTE(review): how many images are shown, and from which split, is defined in pcamlib — confirm there.
pc.plot_misclassified_images(pcam, y_true, y_pred)