In [1]:
# Record the Python interpreter version that this notebook was run with.
! python --version

Python 3.10.14


In [2]:
%%capture
! pip install --upgrade pip
! pip install timm
! pip install torch
! pip install pillow
! pip install ipywidgets
! pip uninstall numpy -y
! pip install "numpy<2.0"
! pip install opencv-python
! pip install scikit-learn
! pip install scipy
! pip install matplotlib
! pip install torchinfo

## Fine-tuning the Resnet 50 model with our bird species dataset

Using PyTorch, we fine-tune the ResNet-50 model on our bird species dataset. The model is trained on the training dataset and validated on the test dataset. For this notebook, we're running the training against a smaller subset of 10 bird species. Through trial and error, I've found that 8 epochs, a batch size of 32, and a learning rate of 0.001 work well for this version of the dataset. When doing this for real, it would be best to run the training against the full dataset.


In [2]:
import torch
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import time
import os

# Wall-clock start of the whole notebook run.
start = time.time()

train_folder = "../kaggle_data/bird-all/train"
test_folder = "../kaggle_data/bird-all/test"

# One class per sub-directory of the training folder. Count directories only:
# a plain os.listdir() also counts stray files (e.g. a macOS ".DS_Store"),
# which would make the classifier head larger than the number of labels that
# ImageFolder actually produces.
with os.scandir(train_folder) as train_entries:
    num_of_classes = sum(1 for entry in train_entries if entry.is_dir())

# Pretrained ResNet-50d from timm with a classification layer sized for our classes.
model_no_fc_ready_to_fine_tune = timm.create_model("resnet50d.a1_in1k", pretrained=True, num_classes=num_of_classes)
resnet_model = model_no_fc_ready_to_fine_tune

# Build the preprocessing pipeline from the model's own data configuration so
# that the inputs are prepared the way the pretrained weights expect.
data_config = resolve_data_config({}, model=resnet_model)
transform = create_transform(**data_config)
train_dataset = datasets.ImageFolder(root=train_folder, transform=transform)
test_dataset = datasets.ImageFolder(root=test_folder, transform=transform)

# Guard against a mismatch between the head size and the dataset's label space.
assert num_of_classes == len(train_dataset.classes), (
    f"model has {num_of_classes} outputs but the training set has "
    f"{len(train_dataset.classes)} classes"
)

# Shuffle only the training data; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# Pick the compute device: Apple's MPS backend when present, otherwise CUDA
# when available, otherwise the CPU.
if not torch.backends.mps.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = "mps"

print(f"Using device: {device}")

# Loss for multi-class classification.
criterion = nn.CrossEntropyLoss()

# Move the model to the chosen device, then create the optimizer over its
# parameters (all parameters are trained, at a learning rate of 0.001).
resnet_model.to(device)
optimizer = optim.AdamW(resnet_model.parameters(), lr=0.001)

# Fine-tune for a fixed number of epochs. Each epoch does one full pass over
# the training data followed by an evaluation pass over the test data, and
# prints the mean per-batch loss and the accuracy for both.
num_epochs = 8
training_start = time.time()
for epoch in range(num_epochs):
    epoch_start = time.time()

    # ---- Training pass ----
    resnet_model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = resnet_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Predicted class = index of the largest logit. The logits were
        # computed before the optimizer step, so this is the accuracy of the
        # weights that produced this batch's loss.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Mean of the per-batch losses (not weighted by batch size).
    training_loss = running_loss / len(train_loader)
    print(f"\nEpoch [{epoch+1}/{num_epochs}], Training Loss: {round(training_loss, 5)}")

    train_accuracy = 100 * (correct / total)
    print(f"Training Accuracy: {round(train_accuracy, 5)}%")

    # ---- Evaluation pass on the test set ----
    # eval() switches the model to inference behaviour and no_grad() skips
    # gradient bookkeeping; no weights are updated here.
    resnet_model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = resnet_model(inputs)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            loss = criterion(outputs, labels)
            running_loss += loss.item()

    test_loss = running_loss / len(test_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Test Loss: {round(test_loss, 5)}")

    test_accuracy = 100 * correct / total
    print(f"Test Accuracy: {round(test_accuracy, 5)}%")

    # Per-epoch duration is reported in raw seconds. (A formatted copy used to
    # be computed after this print but was never used, so it has been dropped.)
    epoch_end = time.time()
    elapsed_time = epoch_end - epoch_start

    print(f"Epoch ({epoch+1}/{num_epochs}) time: {elapsed_time}")

# Total duration of the whole training loop, reported as HH:MM:SS.
training_end = time.time()
elapsed_time = training_end - training_start

elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print(f"Total training time: {elapsed_time}")

Using device: mps

Epoch [1/8], Training Loss: 0.84027
Training Accuracy: 81.63289%
Epoch [1/8], Test Loss: 0.10456
Test Accuracy: 96.8381%
Epoch (1/8) time: 770.3930938243866

Epoch [2/8], Training Loss: 0.20402
Training Accuracy: 94.47155%
Epoch [2/8], Test Loss: 0.1025
Test Accuracy: 96.91429%
Epoch (2/8) time: 765.0102081298828

Epoch [3/8], Training Loss: 0.13106
Training Accuracy: 96.3124%
Epoch [3/8], Test Loss: 0.0926
Test Accuracy: 96.99048%
Epoch (3/8) time: 740.4205451011658

Epoch [4/8], Training Loss: 0.09765
Training Accuracy: 97.13239%
Epoch [4/8], Test Loss: 0.06258
Test Accuracy: 98.24762%
Epoch (4/8) time: 742.6552686691284

Epoch [5/8], Training Loss: 0.07771
Training Accuracy: 97.77752%
Epoch [5/8], Test Loss: 0.09
Test Accuracy: 97.14286%
Epoch (5/8) time: 734.9225969314575

Epoch [6/8], Training Loss: 0.06556
Training Accuracy: 98.09062%
Epoch [6/8], Test Loss: 0.08126
Test Accuracy: 97.6%
Epoch (6/8) time: 721.0921971797943

Epoch [7/8], Training Loss: 0.05814
Tr

In [6]:
from torchinfo import summary

# Print a layer-by-layer summary of the fine-tuned model for one 3x224x224 input.
# At this point the classification head is still attached, so the final output
# width is the number of classes the model was created with.
summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 526]                  --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [7]:
# Remove the classification head: from here on we only need the embedding
# (feature) output of the model, not class predictions.
# NOTE(review): this changes resnet_model in place, so every later cell -
# including the torch.save call further down - works with the head-less model.
# The earlier inline note here read "save model with the classification head,
# remove this later after loading", which is the opposite order; confirm which
# behaviour is intended.
resnet_model.reset_classifier(0)

In [8]:
# Summarise the model again after reset_classifier(0) to check its new output shape.
summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 2048]                 --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [9]:
from datetime import datetime

# Build a checkpoint filename that records the class count and a timestamp,
# so repeated runs do not overwrite each other's weights.
model_path = f"resnet50d.a1_in1k_fine_tune_{num_of_classes}_classes_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pth"

# Persist only the weights (state dict) of the model in its current state.
model_weights = resnet_model.state_dict()
torch.save(model_weights, model_path)

# Wall-clock end marker for the training-and-saving part of the notebook.
end = time.time()

In [15]:
import torch
from PIL import Image
from torchvision import transforms
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_(embedding1, embedding2):
    """
    Return the cosine similarity between two embedding vectors.

    Each argument is converted to a NumPy array and flattened into a single
    row, because the scikit-learn helper compares 2-D arrays row by row. The
    helper's result is a 1x1 matrix; its only entry is returned.
    """
    row_a, row_b = (
        np.array(vector).reshape(1, -1) for vector in (embedding1, embedding2)
    )
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

def generate_embedding(image_path):
    """
    Generate an embedding vector for the image stored at ``image_path``.

    The image is converted to RGB, resized to 224x224, normalised with the
    mean/std from ``resnet_model.default_cfg`` and passed through the
    notebook-level ``resnet_model`` in eval mode without gradient tracking.

    Parameters:
        image_path: path of the image file to embed.

    Returns:
        The model output for that image as a flattened NumPy array. Its shape
        is also printed.

    Side effects:
        Moves ``resnet_model`` to the selected device and leaves it in eval
        mode.
    """
    # Same device preference as the training cell: MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # NOTE(review): training used timm's create_transform(**data_config); this
    # hand-built pipeline may not reproduce it exactly (resize/crop handling,
    # interpolation) - confirm the two are meant to differ.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=resnet_model.default_cfg["mean"],
            std=resnet_model.default_cfg["std"]),
    ])

    # Open the file in a context manager so the handle is always released;
    # unsqueeze(0) adds the batch dimension the model expects.
    with Image.open(image_path) as image:
        input_tensor = transform(image.convert("RGB")).unsqueeze(0).to(device)

    resnet_model.to(device)
    resnet_model.eval()

    with torch.no_grad():
        embedding = resnet_model(input_tensor)

    # Bring the result back to the CPU and drop the batch dimension.
    embedding = embedding.cpu().numpy().flatten()
    print(f"Embedding shape: {embedding.shape}")

    return embedding

# Try the helper on a single test image; the returned array is shown as the cell output.
generate_embedding("../kaggle_data/bird-all/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.0201393, 0.       , 0.       , ..., 0.       , 0.       ,
       0.       ], dtype=float32)

In [23]:
# Same species: compare two different test images of an Abbott's babbler.
# We expect a comparatively high cosine similarity here.
image_paths = (
    "../kaggle_data/bird-all/test/ABBOTTS BABBLER/1.jpg",
    "../kaggle_data/bird-all/test/ABBOTTS BABBLER/2.jpg",
)
embedding1, embedding2 = (generate_embedding(path) for path in image_paths)

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.6749578714370728



In [18]:
# Different species: an Abbott's babbler image against an Abbott's booby image.
# We expect a comparatively low cosine similarity here.
image_paths = (
    "../kaggle_data/bird-all/test/ABBOTTS BABBLER/4.jpg",
    "../kaggle_data/bird-all/test/ABBOTTS BOOBY/1.jpg",
)
embedding1, embedding2 = (generate_embedding(path) for path in image_paths)

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.21218827366828918



In [19]:
# Different species: an Abbott's babbler image against a yellow-headed blackbird image.
# We expect a comparatively low cosine similarity here.
image_paths = (
    "../kaggle_data/bird-all/test/ABBOTTS BABBLER/4.jpg",
    "../kaggle_data/bird-all/test/YELLOW HEADED BLACKBIRD/1.jpg",
)
embedding1, embedding2 = (generate_embedding(path) for path in image_paths)

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.1758386492729187



### Comments about using the fine-tuned model to generate embeddings
Now that we have fine-tuned our embedding model against our bird species dataset, we can then use it to generate embeddings for our bird species images. This is done by passing the images through the model and extracting the embeddings from the last layer of the model. Because we have removed the classification head, our model now just spits out the embeddings of the image we pass in. 

So now (compared to just using the pre-trained model) the same bird species will have embeddings that are closer together and embeddings of different bird species are now quite far apart. This is because the model has been fine-tuned to recognise the specific features of the bird species in our dataset.

For example, in the similar case above we get a score of `0.67496`, while the two dissimilar cases get scores of `0.21219` and `0.17584`. This is a significant difference compared to using the pre-trained model, and shows that this new fine-tuned model has learned to differentiate between the two images better.