In this kernel I will use a pre-trained Inception V3 as the feature extractor of my CNN model and train it further (along with my custom fully-connected classifier), so that it can classify the breed of the dog in an input image with high accuracy.

In [None]:
from torchvision import transforms, datasets, models

from torch import optim, cuda, device, max, mean, FloatTensor, save, no_grad, backends, load, autograd
from torch.utils.data import DataLoader, sampler, random_split
import torch.nn as nn

# from PIL import Image
from PIL.Image import open
from numpy import random, zeros
from pandas import DataFrame
from os import listdir, mkdir
from matplotlib.pyplot import figure, plot, subplot, imread, imshow
%matplotlib inline

import xml.etree.ElementTree as ET

# Initialise CUDA and report the allocator state only when a GPU is present,
# so the notebook still starts on CPU-only machines (the device selection
# further down already falls back to "cpu").
if cuda.is_available():
    cuda.init()
    cuda.empty_cache()
    # memory_summary() returns a string; print it, otherwise the report is
    # discarded because this is not the last expression of the cell.
    print(cuda.memory_summary(device=None, abbreviated=False))

def set_debug_apis(state: bool = False) -> None:
    """Switch the autograd debugging helpers on or off.

    Args:
        state: Forwarded as ``enabled`` / ``mode`` to the three autograd
            calls below.
    """
    # NOTE(review): the objects returned by these two calls are discarded.
    # Presumably they are context managers that only take effect inside a
    # ``with`` block -- confirm whether calling them like this does anything.
    autograd.profiler.profile(enabled=state)
    autograd.profiler.emit_nvtx(enabled=state)
    autograd.set_detect_anomaly(mode=state)

# Make sure the debugging helpers are switched off before any training code runs.
set_debug_apis(state=False)

After importing the necessary modules, we will define a set of functions for taking (and displaying) a sample of data from the dataset and for transforming it to match the input expected by the pre-trained Inception V3; we will also define the online data augmentations that I will use during training. 

First of all, I'll print the samples along with their cropped versions.

In [None]:
def crop_image(breed, dog, data_dir):
    """Return one dog image restricted to its annotated bounding box.

    Args:
        breed: Name of the breed sub-folder under ``annotation/`` and
            ``images/``.
        dog: Annotation file name; the image is looked up as ``<dog>.jpg``.
        data_dir: Dataset root, concatenated as-is (so it must end with a
            path separator).

    Returns:
        The array produced by ``imread`` sliced to rows ``ymin:ymax`` and
        columns ``xmin:xmax`` of the first ``object`` entry in the annotation.
    """
    annotation_path = data_dir + "annotation/" + breed + "/" + dog
    first_object = ET.parse(annotation_path).getroot().findall("object")[0]
    box = first_object.find("bndbox")
    left, right, top, bottom = (
        int(box.find(tag).text) for tag in ("xmin", "xmax", "ymin", "ymax")
    )
    image_path = data_dir + "images/" + breed + "/" + dog + ".jpg"
    return imread(image_path)[top:bottom, left:right, :]


# Root of the dataset. Paths are built by plain string concatenation, so the
# trailing slash is required.
data_dir = "../data/"
# Every entry found under images/ (presumably one folder per breed -- confirm).
breed_list = listdir(data_dir + "images/")
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
running_device = device("cuda:0" if cuda.is_available() else "cpu")
print(running_device)


def plot_sample_image_vs_crop(data_dir):
    """Show four random dogs, each next to its bounding-box crop.

    Draws a 4x2 grid: the left cell of a row holds the full image with the
    annotated box outlined, the right cell the crop returned by
    ``crop_image``. Breeds are drawn from the module-level ``breed_list``.

    Args:
        data_dir: Dataset root containing ``annotation/`` and ``images/``.
    """
    figure(figsize=(20, 20))
    for row in range(4):
        left_cell = 421 + 2 * row
        subplot(left_cell)
        breed = random.choice(breed_list)
        dog = random.choice(listdir(data_dir + "annotation/" + breed))
        imshow(imread(data_dir + "images/" + breed + "/" + dog + ".jpg"))

        root = ET.parse(data_dir + "annotation/" + breed + "/" + dog).getroot()
        box = root.findall("object")[0].find("bndbox")
        xmin, xmax, ymin, ymax = (
            int(box.find(tag).text) for tag in ("xmin", "xmax", "ymin", "ymax")
        )
        # Closed outline: the first corner is repeated at the end.
        plot([xmin, xmax, xmax, xmin, xmin], [ymin, ymin, ymax, ymax, ymin])

        cropped = crop_image(breed, dog, data_dir)
        subplot(left_cell + 1)
        imshow(cropped)


# Render one sample grid right away as the output of this cell.
plot_sample_image_vs_crop(data_dir=data_dir)


Now I will create a new folder to store the cropped versions of the images from the dataset, so that they can be used for training and testing.

In [None]:
def save_image(breed_and_file):
    """Crop one image to its first annotated box and store it as an RGB JPEG.

    All paths are resolved against the module-level ``data_dir``.

    Args:
        breed_and_file: ``"<breed>/<file>"`` relative name shared by the
            annotation file and (with ``.jpg`` appended) the source and
            destination images.
    """
    # ``open`` is the image opener imported at the top of the file, not the
    # builtin.
    picture = open(f"{data_dir}images/{breed_and_file}.jpg")
    annotation = ET.parse(f"{data_dir}annotation/{breed_and_file}")
    box = annotation.getroot().findall("object")[0].find("bndbox")
    corner = {
        tag: int(box.find(tag).text) for tag in ("xmin", "xmax", "ymin", "ymax")
    }
    region = (corner["xmin"], corner["ymin"], corner["xmax"], corner["ymax"])
    cropped = picture.crop(region).convert("RGB")
    cropped.save(f"{data_dir}cropped_images/{breed_and_file}.jpg")


def parallel_save_cropped_images(data_dir, breed_list):
    """Crop every annotated image using a pool of worker processes.

    Args:
        data_dir: Dataset root; used here to list ``annotation/<breed>``.
            Note that ``save_image`` itself resolves its paths against the
            module-level ``data_dir``, not this argument.
        breed_list: Breed folder names to process.
    """
    from multiprocessing import Pool

    with Pool() as workers:
        # One "<breed>/<file>" job per annotation file, flattened across breeds.
        jobs = [
            "".join([breed, "/", file])
            for breed in breed_list
            for file in listdir("".join([data_dir, "annotation/", breed]))
        ]
        workers.map(func=save_image, iterable=jobs, chunksize=50)


# Build the cropped-image tree only once: the presence of the folder is used
# as the marker that the cropping pass has already been done.
if "cropped_images" not in listdir(data_dir):
    mkdir(data_dir + "cropped_images/")
    for breed in breed_list:
        mkdir(data_dir + "cropped_images/" + breed)
    print(
        "Created {} folders to store cropped images of the different breeds.".format(
            # Count the breed folders that were just created, not the
            # top-level entries of data_dir.
            len(listdir(data_dir + "cropped_images/"))
        )
    )
    parallel_save_cropped_images(data_dir, breed_list)
    print("Saved cropped images in said folders.")

Now that all cropped images are in place, I'll define the data augmentation needed to improve the generalisation capabilities of my model and the preprocessing required by the pre-trained feature extractor (images resized to 299 × 299 and normalised according to the ImageNet standards).



In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
# Shared preprocessing constants: the network input size and the per-channel
# statistics (ImageNet standards) used by both pipelines.
_INPUT_SIZE = 299
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Train uses data augmentation (albumentations pipeline).
# NOTE(review): albumentations pipelines are normally invoked with keyword
# arguments (``pipeline(image=array)``); presumably the dataset ``transform``
# hook passes the sample positionally -- confirm this pipeline is usable
# where it is assigned.
# NOTE(review): unlike the "test" pipeline, no resize precedes the 299x299
# centre crop here -- confirm that smaller images are handled as intended.
_train_steps = [
    A.RandomRotate90(),
    A.Flip(),
    A.Transpose(),
    A.GaussNoise(p=0.2),
    # OneOf group: blur variants.
    A.OneOf(
        [
            A.MotionBlur(p=.2),
            A.MedianBlur(blur_limit=3, p=0.1),
            A.Blur(blur_limit=3, p=0.1),
        ],
        p=0.2,
    ),
    A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2),
    # OneOf group: geometric distortions.
    A.OneOf(
        [
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=.1),
            A.PiecewiseAffine(p=0.3),
        ],
        p=0.2,
    ),
    # OneOf group: contrast / sharpness adjustments.
    A.OneOf(
        [
            A.CLAHE(clip_limit=2),
            A.Sharpen(),
            A.Emboss(),
            A.RandomBrightnessContrast(),
        ],
        p=0.3,
    ),
    A.HueSaturationValue(p=0.3),
    A.CenterCrop(height=_INPUT_SIZE, width=_INPUT_SIZE, always_apply=True),
    A.Normalize(list(_IMAGENET_MEAN), list(_IMAGENET_STD)),
    ToTensorV2(),
]

# Validation and test use deterministic torchvision preprocessing only.
_test_steps = [
    transforms.Resize(size=_INPUT_SIZE),
    transforms.CenterCrop(size=_INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(list(_IMAGENET_MEAN), list(_IMAGENET_STD)),
]

image_transforms = {
    "train": A.Compose(_train_steps),
    "test": transforms.Compose(_test_steps),
}

Now I will define the batch size (60 is the maximum tolerated by my GPU's memory before running into an out-of-memory (OOM) exception), and then I will prepare the dataloaders for training and testing, splitting the data as follows: 80% training, 10% validation and 10% testing.

In [None]:
batch_size = 60 # Previously tried values: 16, 32, 40, 50, 58 -- 60 worked best

# Load every cropped image through one ImageFolder and split it 80/10/10.
# The test length is whatever remains, so the three lengths always add up to
# len(all_data).
# NOTE(review): no seed is set in this cell, so presumably the split differs
# between runs -- confirm this is intended when comparing against a stored
# checkpoint.
all_data = datasets.ImageFolder(root=data_dir + "cropped_images/")
train_data_len = int(len(all_data) * 0.8)
valid_data_len = int((len(all_data) - train_data_len) / 2)
test_data_len = int(len(all_data) - train_data_len - valid_data_len)
train_data, val_data, test_data = random_split(
    all_data, [train_data_len, valid_data_len, test_data_len]
)
# NOTE(review): all three splits come from the single `all_data` dataset and
# the assignments below all go through `.dataset`. If, as expected for split
# subsets, that attribute is the same shared `all_data` object, only the last
# assignment survives and every split -- training included -- ends up with the
# "test" transform. Confirm, and if so give each split its own transform.
train_data.dataset.transform = image_transforms["train"]
val_data.dataset.transform = image_transforms["test"]
test_data.dataset.transform = image_transforms["test"]
print(len(train_data), len(val_data), len(test_data))

# The three loaders share the same settings, including shuffle=True for
# validation and test.
train_loader = DataLoader(
    train_data, num_workers=4, pin_memory=True, batch_size=batch_size, shuffle=True
)
val_loader = DataLoader(
    val_data, num_workers=4, pin_memory=True, batch_size=batch_size, shuffle=True
)
test_loader = DataLoader(
    test_data, num_workers=4, pin_memory=True, batch_size=batch_size, shuffle=True
)

# Sanity check: pull one training batch and print its shapes.
trainiter = iter(train_loader)
features, labels = next(trainiter)
print(features.shape, labels.shape)

I now instantiate the model with pre-trained weights, because it trains faster and achieves better results. Specifically for the Inception model, we have to set the aux_logits property to False; otherwise the auxiliary classifier included in the Inception v3 model will try to enforce its regularisation, which is unnecessary considering that I use only the feature-extraction part of the model. We can take a look at the model architecture:

In [None]:
# Start from the ImageNet-pretrained Inception V3 weights.
model = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1) # or None for no trained weights
# Switch the auxiliary-logits flag off on the already-built model.
model.aux_logits = False
# Bare expression: shows the architecture as the output of this cell.
model

I want to make sure that all the layers of the model are trainable, because I achieved better results this way than with the layers frozen.

In [None]:
# Mark every parameter of the network (feature extractor included) as
# trainable, so the whole model is fine-tuned rather than only the new head.
for param in model.parameters():
    param.requires_grad = True

Now I define the two-layer fully connected classifier to attach to the end of the feature extractor.

In [None]:
# Replace Inception's final layer with a two-layer head that maps the
# extracted features to log-probabilities over the breeds.
n_classes = 120
n_inputs = model.fc.in_features
classifier_layers = [
    nn.Linear(n_inputs, 1024),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(1024, n_classes),
    nn.LogSoftmax(dim=1),
]
model.fc = nn.Sequential(*classifier_layers).to(running_device)
# Bare expression: shows the new head as the output of this cell.
model.fc

Now I load the entire model onto the GPU to accelerate training, and I enable the cuDNN benchmark option, which, after some initial runs, looks for a more efficient way to perform the calculations.
Finally, I also choose the loss function that I will use during training to compute the error. In this case I chose NLLLoss, to be used together with a LogSoftmax output layer (an alternative would have been to output raw logits and use CrossEntropyLoss, which applies LogSoftmax internally).

In [None]:
# Move the whole model to the GPU when one is available and turn on the cuDNN
# benchmark flag.
if cuda.is_available():
    model.cuda()
    print("-- Cuda enabled --")
    backends.cudnn.benchmark = True # BENCHMARK FOR SPEED UP---------https://pytorch.org/docs/stable/backends.html#torch-backends-cudnn
# Negative log-likelihood loss, paired with the LogSoftmax layer that ends the
# classifier head.
criterion = nn.NLLLoss()

I create the index-to-class and class-to-index mappings for my model, so that the predicted class indices can be translated back into breed names.

In [None]:
# Attach both lookup directions to the model: folder name -> index as given by
# the dataset, and the inverse index -> folder name.
model.class_to_idx = all_data.class_to_idx
model.idx_to_class = dict(
    zip(model.class_to_idx.values(), model.class_to_idx.keys())
)


def print_class_indexes(raw_indexes_to_classes):
    """Print the class names of an index-to-class mapping as a one-column table.

    Args:
        raw_indexes_to_classes: Iterable of ``(index, class_name)`` pairs,
            e.g. ``idx_to_class.items()``. An empty iterable prints an empty
            table instead of raising.
    """
    # Only the names are shown, so take them straight from the pairs instead
    # of going through a 2-D array (which fails to index when there are no
    # rows).
    class_names = [str(class_name) for _, class_name in raw_indexes_to_classes]
    print(DataFrame({"class_name": class_names}))


# Show the class table once as the output of this cell.
print_class_indexes(model.idx_to_class.items())

# Remove notebook-level names that are no longer needed. This only unbinds the
# names; e.g. plot_sample_image_vs_crop reads the global breed_list and can no
# longer be called after this point.
del (print_class_indexes,)
del breed_list, n_classes, n_inputs, features, image_transforms, all_data, val_data

Finally I import the modules containing my training and utility functions, and I start the training with a function that looks for the best possible model by training as many models as I specify, using an array of precomputed learning rates to train each of them. 
The training is interrupted if the early-stop condition is met (I chose to stop after 3 epochs without improvement, because this usually speeds up training and avoids overfitting).
The precomputed learning rates were obtained with a similar training function, which searches for the best possible model in the same way but generates a very large number of learning rates by means of a gamma array. This lets me work out the 'direction' in which a very good model can be found; I can then take the learning rates that led in that direction and reuse them for further runs of the first training function, with a narrower and more granular range of learning rates centred on the values obtained from the second function.

I chose SGD as the optimizer for the model.

In [None]:
from training_utils import train, send_telegram_message
from numpy import arange
import traceback
from pruning_based_training_utils import (
    pruning_based_training,
    pruning_based_training_with_precomputed_lrs,
)
import importlib
import pruning_based_training_utils

importlib.reload(pruning_based_training_utils)
from pruning_based_training_utils import (
    pruning_based_training,
    pruning_based_training_with_precomputed_lrs,
)

# No learning-rate scheduler is used in this run; the variable is only read to
# report its type name in the start-up message below.
scheduler = None

# The whole run sits inside one try block so that any failure is printed and
# forwarded through send_telegram_message instead of stopping the cell.
try:
    send_telegram_message(
        "".join(
            [
                "///////////////////////////////////////////////\n",
                "///////////////////////////////////////////////\n",
                "///////////////////////////////////////////////\n",
                "CHOSEN SCHEDULER: ",
                # With scheduler = None this renders as "NoneType".
                type(scheduler).__name__,
                "\n\n-----------------STARTING TRAINING-----------------",
            ]
        )
    )
    optimizer = optim.SGD(
        model.parameters(), lr=0.0074, momentum=0.9, weight_decay=1e-6
    )
    # Passed to the training helper as base_gammas.
    gamma_range = arange(0.01, 1.1, 0.025)
    # Each entry is an arange(start, stop, step) sweep of learning rates; the
    # first one spans less than a single step, so it holds exactly one value.
    precomputed_lrs = [
        arange(0.00647, 0.006471, 0.001),
        arange(0.002145, 0.002155, 0.0000025),
        arange(0.00200, 0.00210, 0.000025),
        arange(0.0009224, 0.00092325, 0.00000025),
        arange(0.00012143, 0.000121447, 0.0000000015),  # 000124571
        arange(0.0000755, 0.0000770, 0.0000005),
        arange(0.00003125, 0.00003140, 0.00000005),
    ]
    # Rebinds `model` to whatever the helper returns, together with the
    # training history and a dict that is read below for "best_epoch".
    model, history, performance_dict = pruning_based_training_with_precomputed_lrs(
        original_model=model,
        criterion=criterion,
        original_optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        early_stop=3,
        n_epochs=400,
        precomputed_lrs=precomputed_lrs,
        base_gammas=gamma_range,
    )  # 87.3 % TEST | 87.07% VALID | Validation Loss: 0.4311
    send_telegram_message(
        "".join(
            [
                "---TRAINING COMPLETED---\n",
                "BEST EPOCH: ",
                str(performance_dict["best_epoch"]),
            ]
        )
    )
except Exception as ex:
    # The exception is reported but not re-raised, so after a failure
    # `history` and `performance_dict` may be undefined and `model` keeps its
    # previous value.
    error_message = "".join(
        ["--- CNN EXECUTION ERROR! ---:\n", str(ex), "\n", traceback.format_exc()]
    )
    print(error_message)
    send_telegram_message(error_message)

In [None]:
# Bare expression: displays the training history as the output of this cell.
# It is only defined if the training cell above completed without an error.
history

I can now test the obtained model and see whether it does better than the current best overall model, which is saved as a checkpoint, overwriting that checkpoint if it does.
The best model so far achieved a test accuracy of 87.3%.

In [None]:
def test(model, test_loader, criterion):
    """Return the top-1 accuracy of ``model`` over ``test_loader``.

    Args:
        model: Network to evaluate. It is switched to eval mode and left
            there.
        test_loader: Loader yielding ``(data, label)`` batches and exposing
            ``.dataset``, whose length is used as the denominator.
        criterion: Unused; kept so that existing callers keep working.

    Returns:
        Correct predictions divided by ``len(test_loader.dataset)`` as a
        float in ``[0, 1]``; ``0.0`` when the dataset is empty.
    """
    model.eval()
    n_samples = len(test_loader.dataset)
    if n_samples == 0:
        return 0.0

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else "cpu"

    n_correct = 0
    with no_grad():
        for data, label in test_loader:
            data, label = data.to(target_device), label.to(target_device)
            pred = model(data).argmax(dim=1)
            # Count hits exactly instead of accumulating per-batch float means.
            n_correct += pred.eq(label.view_as(pred)).sum().item()
    return n_correct / n_samples


# The helper's name matches pytest's default ``test*`` pattern; mark it so it
# is not collected as a test case if this code is imported by a test suite.
test.__test__ = False


def load_model_from_checkpoint(path, load_also_accuracy=False):
    """Rebuild the classifier architecture and restore it from a checkpoint.

    Args:
        path: File holding a dict with the keys ``model_state_dict`` and
            ``idx_to_class`` (and ``test_acc`` when the accuracy is requested).
        load_also_accuracy: When true, also return the stored ``test_acc``.

    Returns:
        ``(model, test_accuracy)``, where the accuracy is ``0`` unless
        ``load_also_accuracy`` is set. ``(None, None)`` when the checkpoint
        cannot be read or does not fit the architecture; the reason is
        printed.
    """
    # NOTE(review): the ImageNet weights requested here are replaced by
    # load_state_dict below; kept as in the original because building the
    # network without them may change other constructor defaults -- confirm
    # before switching to weights=None.
    model = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1)
    n_classes = 120
    n_inputs = model.fc.in_features
    model.aux_logits = False
    model.fc = nn.Sequential(
        nn.Linear(n_inputs, 1024),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(1024, n_classes),
        nn.LogSoftmax(dim=1),
    )
    test_accuracy = 0
    try:
        # The model is built on the CPU, so map the stored tensors there too;
        # callers move the model to the GPU themselves.
        torch_checkpoint = load(path, map_location="cpu")
        model.load_state_dict(torch_checkpoint["model_state_dict"])
        model.idx_to_class = torch_checkpoint["idx_to_class"]
        if load_also_accuracy:
            test_accuracy = torch_checkpoint["test_acc"]
    except Exception as err:
        # Callers rely on (None, None) to mean "no usable checkpoint", so keep
        # that contract, but say why, and let KeyboardInterrupt/SystemExit
        # propagate instead of being swallowed.
        print(f"Could not load checkpoint '{path}': {type(err).__name__}: {err}")
        return None, None
    return model, test_accuracy


def save_model(model_to_save, test_accuracy, path="best_dog_classifier_model.pt"):
    """Write a checkpoint with the model weights, class mapping and accuracy.

    Args:
        model_to_save: Model providing ``state_dict()`` and an
            ``idx_to_class`` attribute.
        test_accuracy: Stored under ``test_acc``.
        path: Destination file; defaults to the "best model" checkpoint that
            ``compare_models`` reads.
    """
    checkpoint = {
        "model_state_dict": model_to_save.state_dict(),
        "idx_to_class": model_to_save.idx_to_class,
        "test_acc": test_accuracy,
    }
    save(checkpoint, path)


def compare_models(model, test_loader, criterion, save=True):
    """Test ``model`` and keep it as the stored best model if it wins.

    The candidate is compared with the accuracy stored in
    ``best_dog_classifier_model.pt``; when no stored model can be loaded it
    is compared with a fixed threshold instead. The outcome is printed and
    sent through ``send_telegram_message``.

    Args:
        model: Candidate model.
        test_loader: Loader passed on to ``test``.
        criterion: Passed on to ``test``.
        save: When false, only report the comparison and never write the
            checkpoint.
    """
    save_threshold = 84.0
    loaded_model, loaded_model_test_acc = load_model_from_checkpoint(
        path="best_dog_classifier_model.pt", load_also_accuracy=True
    )
    # test() returns a fraction; the stored accuracies are percentages.
    test_acc = test(model, test_loader, criterion) * 100
    message = f"Current model test accuracy: {test_acc}.\n"

    if loaded_model is None:
        message += "No model stored found.\n"
        is_better = test_acc > save_threshold
        if is_better:
            message += f"The accuracy of the new model overcomes the {save_threshold:.2f} save threshold.\n"
        else:
            message += f"The accuracy of the new model DOES NOT overcome the {save_threshold:.2f} save threshold.\n"
    else:
        is_better = test_acc > loaded_model_test_acc
        if is_better:
            message += "The accuracy of the new model overcomes the accuracy of the saved model.\n"
        else:
            message += f"The accuracy of the new model DOES NOT overcome the accuracy of the saved model.\nAccuracy of the save model: {loaded_model_test_acc}\n"

    if is_better:
        # Honour the ``save`` flag (the parameter shadows torch's ``save``
        # inside this function, which is why writing goes through save_model).
        if save:
            save_model(model_to_save=model, test_accuracy=test_acc)
            message += "New model saved.\n"
        else:
            message += "Saving is disabled; the stored model was left untouched.\n"

    print(message)
    send_telegram_message(message)

In [None]:
# Reload the model written by the training run and compare it with the stored
# best model.
model_to_check, _ = load_model_from_checkpoint(
    path="pruning_best_global_model.pt", load_also_accuracy=False
)
# load_model_from_checkpoint signals failure by returning None; stop with a
# clear message instead of failing later on ``None.cuda()``.
if model_to_check is None:
    raise RuntimeError(
        "Could not load 'pruning_best_global_model.pt'; there is no model to compare."
    )

compare_models(
    model=model_to_check.cuda(),
    test_loader=test_loader,
    criterion=criterion,
)

The accuracy achieved is very good, considering that some dog breeds are extremely similar and that we ourselves may not be able to tell them apart by eye, since the distinguishing characteristics (such as fur colour) can be very subtle.

In [None]:
def evaluate(model, test_loader):
    """Return the top-1 accuracy of ``model`` for each class.

    Args:
        model: Network with an ``idx_to_class`` mapping from label index to
            class name. It is switched to eval mode and left there.
        test_loader: Loader yielding ``(data, labels)`` batches.

    Returns:
        A DataFrame indexed by class name (index named ``class``) with a
        single ``results`` column holding the fraction of correctly
        classified samples of that class.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else "cpu"

    class_names = []
    hits = []
    model.eval()
    with no_grad():
        for data, labels in test_loader:
            data, labels = data.to(target_device), labels.to(target_device)
            predictions = model(data).argmax(dim=1)
            hits.extend(predictions.eq(labels).float().cpu().tolist())
            # Drop the first 10 characters of the stored class name
            # (presumably a fixed-width folder prefix -- confirm).
            class_names.extend(
                model.idx_to_class[index][10:] for index in labels.cpu().tolist()
            )

    results = DataFrame({"class": class_names, "results": hits}).astype(
        {"results": "float64"}
    )
    # Group on the column itself so that only the numeric column is averaged;
    # averaging the string column as well is rejected by recent pandas.
    return results.groupby("class").mean()

# Bare expression: shows the per-class accuracy table of the reloaded model as
# the output of this cell.
evaluate(model_to_check, test_loader)