# Fine Tuning a pre-trained Deep CNN on a GPU machine

This session is inspired by [a blog post](https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html) by François Chollet, the creator of the Keras library.

**WARNING**: the execution of this notebook **requires a GPU**, e.g. one with **at least 6GB of GPU RAM**.

**macOS users**: If you have an Apple Silicon Mac (M1/M2/M3/M4), you can use MPS (Metal Performance Shaders) for GPU acceleration. MPS typically provides 2-5x speedup compared to CPU. The code below will automatically detect and use the best available device.

For this session we are going to use a cats vs dogs image classification dataset.

## Running on Kaggle (recommended for GPU access)

It is recommended to run this notebook from the [kaggle kernels](https://www.kaggle.com/kernels) hosted interface, which provides GPU hours for free:

- login at [kaggle kernels](https://www.kaggle.com/kernels);
- click the **new notebook** button;
- upload this notebook file from the "File" menu;
- in the "File" menu, click "Add or upload data" and choose to add the Dogs vs. Cats dataset;
- the data should be available in the `/kaggle/input/dogs-vs-cats` folder of your kaggle kernel session;
- enable "Internet" and "GPU" in the "Settings" panel of this kernel.

## Running locally

To download the data locally, install the Kaggle CLI and download the Microsoft Cats vs Dogs dataset:

```bash
pip install kaggle

# Create and configure your API key from https://www.kaggle.com/settings
# Save it to ~/.kaggle/kaggle.json

# Download the dataset
mkdir -p ~/data/dogs-vs-cats
cd ~/data/dogs-vs-cats
kaggle datasets download -d shaunthesheep/microsoft-catsvsdogs-dataset
unzip microsoft-catsvsdogs-dataset.zip

# Reorganize into expected structure
mkdir -p train
mv PetImages/Cat train/cat
mv PetImages/Dog train/dog
rmdir PetImages
```

Once this is done we can proceed with loading the data:

In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"

import torch

# Pick the compute device by preference order: CUDA first, then Apple MPS,
# and finally the CPU when no accelerator is available.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Build the human-readable label matching the selected device; the CUDA
# device name is only queried when CUDA was actually selected.
if device.type == "cuda":
    device_name = f"CUDA ({torch.cuda.get_device_name(0)})"
elif device.type == "mps":
    device_name = "MPS (Apple Metal)"
else:
    device_name = "CPU"

print(f"Using device: {device_name}")
print(f"PyTorch version: {torch.__version__}")

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import keras

import os.path as op
import shutil
from zipfile import ZipFile

In [None]:
# Local execution (default): the manually downloaded dataset is read from,
# and derived folders are written to, the same directory.
working_folder = data_folder = op.expanduser('~/data/dogs-vs-cats')

# On Kaggle the input data is mounted separately from the writable folder;
# uncomment these instead:
# data_folder = '/kaggle/input/dogs-vs-cats'
# working_folder = "/kaggle/working"

In [None]:
# The train folder should already exist with cat/ and dog/ subfolders
# If using the original Kaggle competition data, you may need to extract train.zip first:
# train_zip = op.join(data_folder, 'train.zip')
# if not op.exists(train_folder) and op.exists(train_zip):
#     print('Extracting %s...' % train_zip)
#     ZipFile(train_zip).extractall(working_folder)

In [None]:
train_folder = op.join(working_folder, 'train')

# Sanity check: report what the training folder contains, or warn when it
# is missing so that later cells do not fail with an obscure error.
if not op.exists(train_folder):
    print(f"WARNING: Train folder not found at {train_folder}")
else:
    print(f"Train folder found: {train_folder}")
    print(f"Contents: {os.listdir(train_folder)}")

The Keras image data helpers want images for different classes ('cat' and 'dog') to live in distinct subfolders. Let's rearrange the image files to follow that convention:

In [None]:
def rearrange_folders(folder):
    """Move ``<class>.<rest>.jpg`` files of ``folder`` into per-class subfolders.

    Every ``.jpg`` file sitting directly inside ``folder`` is moved to
    ``folder/<class>/``, where ``<class>`` is the part of the file name that
    precedes the first dot (e.g. ``cat.12.jpg`` goes to ``folder/cat/``).
    The function returns without doing anything when ``folder`` holds no
    ``.jpg`` file, so calling it again on an already arranged folder is a
    no-op.

    Parameters
    ----------
    folder : str
        Directory holding the flat list of images; may be relative or
        absolute and may contain dots in any of its components.
    """
    image_names = [fn for fn in os.listdir(folder) if fn.endswith('.jpg')]
    if not image_names:
        return
    print("Rearranging %d images in %s into one subfolder per class..."
          % (len(image_names), folder))
    for image_name in image_names:
        # Derive the class from the bare file name, never from the full
        # path: a dot in a parent directory (or a relative folder) would
        # otherwise yield a wrong destination.
        class_name, _ = image_name.split('.', 1)
        subfolder = op.join(folder, class_name)
        os.makedirs(subfolder, exist_ok=True)
        shutil.move(op.join(folder, image_name), subfolder)

# Sort the training images into per-class subfolders; the call returns
# without touching anything when no .jpg file sits directly in the folder.
rearrange_folders(train_folder)

Let's build a validation dataset by taking 500 images of cats and 500 images of dogs out of the training set:

In [None]:
n_validation = 500

validation_folder = op.join(working_folder, 'validation')

# The split is performed only once: an existing validation folder is taken
# as the sign that the images have already been moved out of the train set.
if not op.exists(validation_folder):
    os.mkdir(validation_folder)
    for class_name in ['dog', 'cat']:
        source_dir = op.join(train_folder, class_name)
        target_dir = op.join(validation_folder, class_name)
        print("Populating %s..." % target_dir)
        os.mkdir(target_dir)
        # Hold out the last `n_validation` entries in sorted name order so
        # that the split does not depend on the directory listing order.
        held_out = sorted(os.listdir(source_dir))[-n_validation:]
        for name in held_out:
            shutil.move(op.join(source_dir, name), target_dir)
        print("Moved %d images" % len(os.listdir(target_dir)))

## Data Loading and Data Augmentation


Let's use keras utilities to manually load the first image file of the cat folder. If keras complains about the missing "PIL" library, make sure to install it with one of the following commands:

```bash
conda install pillow

# or

pip install pillow
```

You might need to restart the kernel of this notebook to get Keras to work.

In [None]:
from keras.utils import array_to_img, img_to_array, load_img

# Load a sample cat image.
# os.listdir returns entries in arbitrary order and may include non-image
# files, so keep only image files and sort them: the "first" image is then
# the same on every run and on every machine.
cat_folder = op.join(train_folder, 'cat')
sample_images = sorted(
    fn for fn in os.listdir(cat_folder)
    if fn.lower().endswith(('.jpg', '.jpeg', '.png'))
)
sample_image = sample_images[0]
img = load_img(op.join(cat_folder, sample_image))
# Convert the loaded image into a numeric array for plotting and augmentation.
x = img_to_array(img)

print(f"Loaded: {sample_image}")
print(x.shape)

In [None]:
# Cast the pixel array to uint8 so that it is displayed as 0-255 intensities.
plt.imshow(x.astype(np.uint8))
# The trailing semicolon keeps the notebook from echoing the return value.
plt.axis('off');

Keras provides tools to generate many variations from a single image: this is useful to augment the dataset with variants that should not affect the image label: a rotated image of a cat is an image of a cat.

Doing data augmentation at train time makes neural networks ignore such label-preserving transformations and therefore helps reduce overfitting.

In [None]:
from keras import layers

# Data augmentation using Keras 3 preprocessing layers. Each layer applies a
# random, label-preserving transformation when called with training=True.
data_augmentation = keras.Sequential([
    layers.RandomRotation(0.1),  # fraction of a full turn: +/- 36 degrees
    layers.RandomTranslation(0.2, 0.2),
    layers.RandomZoom(0.2),
    layers.RandomFlip("horizontal"),
], name="data_augmentation")

# For visualization, let's augment a single image
x_batch = np.expand_dims(x / 255.0, axis=0)  # Add batch dimension and normalize
plt.figure(figsize=(11, 5))
for i in range(15):
    x_augmented = data_augmentation(x_batch, training=True)
    # The layers return a backend tensor (possibly living on the GPU with
    # the torch backend); matplotlib needs a plain numpy array on the host.
    x_augmented = keras.ops.convert_to_numpy(x_augmented)
    plt.subplot(3, 5, i + 1)
    # Clip so that any out-of-range value stays within the 0-1 float range.
    plt.imshow(np.clip(x_augmented[0], 0.0, 1.0))
    plt.axis('off')

In [None]:
# Cell removed - augmentation visualization moved to previous cell

In Keras 3, we use `image_dataset_from_directory` to load images and Keras preprocessing layers for augmentation. The preprocessing layers can be integrated directly into the model.

In [None]:
# Load and display augmented images from directory using Keras 3 API
train_ds = keras.utils.image_dataset_from_directory(
    train_folder,
    image_size=(224, 224),
    batch_size=1,
    label_mode='binary',
)

plt.figure(figsize=(11, 5))
for i, (images, labels) in enumerate(train_ds.take(15)):
    augmented = data_augmentation(images, training=True)
    plt.subplot(3, 5, i + 1)
    plt.imshow(augmented[0])
    plt.axis('off')

## Loading a pre-trained computer vision model

Let us load a state of the art model with a good tradeoff between prediction speed, model size and predictive accuracy, namely a Residual Network with 54 parameterized layers (53 convolutional + 1 fully connected for the softmax):

In [None]:
from keras.applications.resnet50 import ResNet50, preprocess_input

# Complete ResNet50 classifier initialised with its ImageNet weights; the
# ImageNet-specific classification layer is still attached at this point.
full_imagenet_model = ResNet50(weights='imagenet')

In [None]:
# summary() prints the layer table itself and returns None, so it must not
# be wrapped in print() (that would output a stray "None" after the table).
full_imagenet_model.summary()

If you have `graphviz` system package and the `pydot_ng` python package installed you can uncomment the following cell to display the structure of the network.

In [None]:
# from IPython.display import SVG
# from keras.utils import model_to_dot

# model_viz = model_to_dot(full_imagenet_model,
#                          show_layer_names=False,
#                          show_shapes=True)
# SVG(model_viz.create(prog='dot', format='svg'))

## Transfer learning

Let's remove the last dense classification layer, which is specific to the ImageNet classes, and use the previous layer (after flattening) as a feature extractor:

In [None]:
from keras.models import Model

# Feature extractor: same input as the full network, but the output is taken
# from the second-to-last layer, i.e. just before the final classifier.
feature_layer = full_imagenet_model.layers[-2]
output = feature_layer.output
base_model = Model(inputs=full_imagenet_model.input, outputs=output)

When using this model we need to be careful to apply the same image processing as was used during the training, otherwise the marginal distribution of the input pixels might not be on the right scale:

In [None]:
# In Keras 3, preprocess_input handles batched inputs directly
# We can use it in dataset.map() pipelines

In [None]:
batch_size = 50

# Shuffled stream of (images, binary labels) batches read from the class
# subfolders of the training directory.
train_ds = keras.utils.image_dataset_from_directory(
    train_folder,
    label_mode='binary',
    image_size=(224, 224),
    batch_size=batch_size,
    shuffle=True,
)


def _resnet_preprocess(images, labels):
    """Apply the ResNet input normalisation to the images, keep the labels."""
    return preprocess_input(images), labels


# Apply preprocessing (ResNet expects specific normalization)
# NOTE(review): this function is executed by the dataset's map(); confirm
# that preprocess_input accepts the tensors it receives there when
# KERAS_BACKEND is set to "torch".
train_ds_preprocessed = train_ds.map(_resnet_preprocess)

# Get a sample batch and show the shapes of its images and labels
for X, y in train_ds_preprocessed.take(1):
    print(X.shape, y.shape)

**Exercise**: write a function that iterates over 5000 images of the training set (batch after batch), extracts the activations of the last layer of `base_model` (by calling `predict`) and collects the results in a big numpy array with dimensions `(5000, 2048)` for the features and `(5000,)` for the matching image labels.

In [None]:
# %load solutions/dogs_vs_cats_extract_features.py

Let's load precomputed features if available:

In [None]:
# Read the cached labels and features from the current working directory;
# the labels file is read first, then the features file.
print("Loading precomputed features")
labels_train, features_train = (
    np.load(npy_file)
    for npy_file in ('labels_train.npy', 'features_train.npy')
)

Let's train a simple linear model on those features. First let's check that the resulting small dataset has balanced classes:

In [None]:
# Show the shape of the loaded label array.
print(labels_train.shape)

In [None]:
# Mean of the labels; presumably they are 0/1 values, in which case a mean
# close to 0.5 indicates balanced classes -- TODO confirm the label encoding.
np.mean(labels_train)

In [None]:
# Unpack the two dimensions of the feature matrix: the first is taken as the
# number of samples, the second as the number of features per sample.
n_samples, n_features = features_train.shape
print(n_features, "features extracted")

Let's define the classification model:

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam


# Logistic regression on the extracted features: a single sigmoid unit.
# The input shape is declared with an explicit Input object, which is the
# Keras 3 way; passing `input_dim` to a layer is deprecated and warns.
top_model = Sequential([
    Input(shape=(n_features,)),
    Dense(1, activation='sigmoid'),
])
top_model.compile(optimizer=Adam(learning_rate=1e-4),
                  loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 10% of the samples to monitor the validation accuracy.
top_model.fit(features_train, labels_train,
              validation_split=0.1, verbose=2, epochs=15)

Alright so the transfer learning is already at ~0.98 / 0.99 accuracy. This is not too surprising as the cats and dogs classes are already part of the imagenet label set.

Note that this is **already as good or slightly better than the winner of the original kaggle competition** [three years ago](https://www.kaggle.com/c/dogs-vs-cats/leaderboard). At that time they did not have pretrained resnet models at hand.

Our validation set has 1000 images, so an accuracy of 0.990 means only 10 classification errors.

Let's plug this on top of the base model to be able to use it to make some classifications on our held-out validation image folder:

In [None]:
# End-to-end model: feed the output of the feature extractor into the
# trained classifier, going from the base model's input to one sigmoid output.
model = Model(base_model.input, top_model(base_model.output))

In [None]:
# Unshuffled, one-image-per-batch view of the validation folder.
val_ds = keras.utils.image_dataset_from_directory(
    validation_folder,
    label_mode='binary',
    image_size=(224, 224),
    batch_size=1,
    shuffle=False,
)

plt.figure(figsize=(12, 8))
for i, (X, y) in enumerate(val_ds.take(15)):
    # Show the raw image, but feed the network its normalised version.
    plt.subplot(3, 5, i + 1)
    raw_pixels = X[0].numpy().astype('uint8')
    plt.imshow(raw_pixels)
    dog_probability = model.predict(
        preprocess_input(X.numpy()), verbose=0)[0][0]
    # Labels above 0.5 are reported as "dog", the others as "cat".
    if y[0] > 0.5:
        label = "dog"
    else:
        label = "cat"
    plt.title("dog prob=%0.4f\ntrue label: %s"
              % (dog_probability, label))
    plt.axis('off')

Let's compute the validation score on the full validation set:

In [None]:
# Same validation images as before, now served in full-size batches.
val_ds_batched = keras.utils.image_dataset_from_directory(
    validation_folder,
    label_mode='binary',
    image_size=(224, 224),
    batch_size=batch_size,
    shuffle=False,
)


def _preprocess_validation_batch(images, labels):
    """Apply the ResNet input normalisation to the images, keep the labels."""
    return preprocess_input(images), labels


# Apply preprocessing
val_ds_preprocessed = val_ds_batched.map(_preprocess_validation_batch)

# One boolean per validation image: True when the thresholded prediction
# matches the true label.
all_correct = []
for X, y in val_ds_preprocessed:
    predictions = model.predict(X, verbose=0).ravel()
    true_labels = y.numpy().ravel()
    all_correct += list((predictions > 0.5) == true_labels)
    print("Processed %d images" % len(all_correct))

print("Validation accuracy: %0.4f" % np.mean(all_correct))

**Exercise:** display the examples where the model makes the most confident mistakes.

To display images in jupyter notebook you can use:

```python
from IPython.display import Image, display
import os.path as op

display(Image(op.join(validation_folder, image_name)))
```

The file paths of the items read by a dataset created with `image_dataset_from_directory` (without random shuffling) can be accessed via its `file_paths` attribute, e.g. `val_ds_batched.file_paths`.

In [None]:
# %load solutions/dogs_vs_cats_worst_predictions.py

## Fine tuning

Let's identify the location of the residual blocks (merge by addition in a residual architecture):

In [None]:
from keras.layers import Add

# List the index and output shape of every additive merge layer, i.e. the
# end of each residual block. Keras 3 layers no longer expose an
# `output_shape` attribute: the shape is read from the output tensor instead.
[(i, layer.output.shape)
 for (i, layer) in enumerate(model.layers)
 if isinstance(layer, Add)]

Let's fix the weights of the low level layers and fine tune the top level layers:

In [None]:
# Freeze every layer located before index 151 and leave the remaining,
# higher-level layers trainable.
for layer in model.layers[:151]:
    layer.trainable = False
for layer in model.layers[151:]:
    layer.trainable = True

Let's fine-tune the top-level layers a bit to see if we can further improve the accuracy. Use the **nvidia-smi** command in a bash terminal on the server to monitor the GPU usage while the model is training.

In [None]:
from keras import optimizers

# Data augmentation layers for fine-tuning
fine_tune_augmentation = keras.Sequential([
    layers.RandomRotation(0.05),  # fraction of a full turn: +/- 18 degrees
    layers.RandomTranslation(0.2, 0.2),
    layers.RandomZoom(0.2),
    layers.RandomFlip("horizontal"),
], name="fine_tune_augmentation")

# Create training dataset with augmentation and preprocessing
train_ds_finetune = keras.utils.image_dataset_from_directory(
    train_folder,
    image_size=(224, 224),
    batch_size=batch_size,
    label_mode='binary',
    shuffle=True,
    seed=0,
)


# Apply augmentation then preprocessing
def augment_and_preprocess(x, y):
    """Randomly augment a batch of images, then apply ResNet normalisation.

    The labels `y` are passed through untouched.
    """
    # NOTE(review): confirm that preprocess_input accepts the tensors it
    # receives inside the dataset map() when KERAS_BACKEND is "torch".
    x = fine_tune_augmentation(x, training=True)
    x = preprocess_input(x)
    return x, y


# repeat() makes the stream endless: fit() below consumes
# `steps_per_epoch * epochs` batches in total, which a single pass over the
# training folder is not guaranteed to provide. Without it Keras interrupts
# training as soon as the data runs out.
train_ds_augmented = train_ds_finetune.map(augment_and_preprocess).repeat()

# Validation dataset (no augmentation)
val_ds_finetune = keras.utils.image_dataset_from_directory(
    validation_folder,
    image_size=(224, 224),
    batch_size=batch_size,
    label_mode='binary',
    shuffle=False,
)
val_ds_finetune = val_ds_finetune.map(lambda x, y: (preprocess_input(x), y))

# Small learning rate: the aim is to adjust the pre-trained weights gently.
opt = optimizers.SGD(learning_rate=1e-4, momentum=0.9)
model.compile(optimizer=opt, loss='binary_crossentropy',
              metrics=['accuracy'])

# Compute steps per epoch: an "epoch" is defined here as 5000 training
# images, not as a full pass over the training folder.
steps_per_epoch = 5000 // batch_size
validation_steps = 1000 // batch_size  # validation set has ~1000 images

history = model.fit(
    train_ds_augmented,
    steps_per_epoch=steps_per_epoch,
    epochs=30,
    validation_data=val_ds_finetune,
    validation_steps=validation_steps,
)

# Note: the pretrained model was already very good. Fine tuning
# does not really seem to help. It might be more interesting to
# introspect the quality of the labeling in the training set to
# check for images that are too ambiguous and should be removed
# from the training set.

**Bonus exercise**: train your own architecture from scratch using Adam and data augmentation. Start with a small architecture first (e.g. 4 convolutional layers interleaved with 2 max pooling layers, followed by a `Flatten` layer and two fully connected layers).