In [1]:
! python --version

Python 3.10.14


In [2]:
%%capture
! pip install --upgrade pip
! pip install timm
! pip install torch
! pip install pillow
! pip install ipywidgets
! pip uninstall numpy -y    # uninstall existing numpy (some components need pre 2.0)
! pip install "numpy<2.0"   # install the latest numpy 1.x version
! pip install opencv-python
! pip install scikit-learn
! pip install scipy
! pip install matplotlib
! pip install torchinfo

In [3]:
import torch
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import time

start = time.time()
model_no_fc_ready_to_fine_tune = timm.create_model("resnet50d.a1_in1k", pretrained=True, num_classes=10)
resnet_model = model_no_fc_ready_to_fine_tune

data_config = resolve_data_config({}, model=resnet_model)
transform = create_transform(**data_config)
train_dataset = datasets.ImageFolder(root='../kaggle_data/bird-species/train', transform=transform)
test_dataset = datasets.ImageFolder(root='../kaggle_data/bird-species/test', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

if torch.backends.mps.is_available():
    device = "mps"
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

resnet_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(resnet_model.parameters(), lr=0.001)

num_epochs = 8
training_start = time.time()
for epoch in range(num_epochs):
    epoch_start = time.time()
    resnet_model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = resnet_model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    training_loss = running_loss / len(train_loader)
    print(f'\nEpoch [{epoch+1}/{num_epochs}], Training Loss: {round(training_loss, 5)}')
    
    train_accuracy = 100 * (correct / total)
    print(f'Training Accuracy: {round(train_accuracy, 5)}%')

    # Evaluate on test set
    resnet_model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = resnet_model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            
    test_loss = running_loss / len(test_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Loss: {round(test_loss, 5)}')
    
    test_accuracy = 100 * correct / total
    print(f'Test Accuracy: {round(test_accuracy, 5)}%')
    
    epoch_end = time.time()
    elapsed_time = epoch_end - epoch_start
    
    print(f"Epoch ({epoch+1}/{num_epochs}) time: {elapsed_time}")
    elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    
training_end = time.time()
elapsed_time = training_end - training_start

elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print(f"Total training time: {elapsed_time}")

Using device: mps

Epoch [1/8], Training Loss: 0.74693
Training Accuracy: 81.07433%
Epoch [1/8], Test Loss: 0.08056
Test Accuracy: 96.0%
Epoch (1/8) time: 18.754597187042236

Epoch [2/8], Training Loss: 0.13108
Training Accuracy: 96.56465%
Epoch [2/8], Test Loss: 0.07348
Test Accuracy: 98.0%
Epoch (2/8) time: 17.523292064666748

Epoch [3/8], Training Loss: 0.10195
Training Accuracy: 97.4391%
Epoch [3/8], Test Loss: 0.12698
Test Accuracy: 94.0%
Epoch (3/8) time: 18.089369773864746

Epoch [4/8], Training Loss: 0.05772
Training Accuracy: 98.43848%
Epoch [4/8], Test Loss: 0.13533
Test Accuracy: 98.0%
Epoch (4/8) time: 19.02901291847229

Epoch [5/8], Training Loss: 0.06753
Training Accuracy: 98.06371%
Epoch [5/8], Test Loss: 0.02986
Test Accuracy: 98.0%
Epoch (5/8) time: 19.039597988128662

Epoch [6/8], Training Loss: 0.03611
Training Accuracy: 99.25047%
Epoch [6/8], Test Loss: 0.01149
Test Accuracy: 100.0%
Epoch (6/8) time: 18.50385808944702

Epoch [7/8], Training Loss: 0.06082
Training Ac

In [4]:
from torchinfo import summary

summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 10]                   --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [5]:
# Remove the classification head, because we just need the output of the embedding model, and not the classification head
resnet_model.reset_classifier(0)  # save model with the classification head, remove this later after loading

In [6]:
summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 2048]                 --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [7]:
from PIL import Image
from torchvision import transforms
import numpy as np

def generate_embedding(image_path):
    # Define the transformation to be applied to the input image
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # or the size used during training
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Load and preprocess the input image
    image = Image.open(image_path)
    input_tensor = transform(image.convert('RGB')).unsqueeze(0)

    # Generate embeddings
    with torch.no_grad():
        embedding = resnet_model(input_tensor)
    
    embedding = embedding.numpy().flatten()
    print(f'Embedding shape: {embedding.shape}')

    return embedding

generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.       , 0.       , 0.       , ..., 1.6206954, 0.       ,
       0.       ], dtype=float32)

In [8]:
from PIL import Image
from torchvision import transforms
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_(embedding1, embedding2):
  embedding1 = np.array(embedding1).reshape(1, -1)
  embedding2 = np.array(embedding2).reshape(1, -1)
  return cosine_similarity(embedding1, embedding2)[0][0]

def generate_embedding(image_path):
    # Define the transformation to be applied to the input image
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # or the size used during training
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Load and preprocess the input image
    image = Image.open(image_path)
    input_tensor = transform(image.convert('RGB')).unsqueeze(0)

    # Generate embeddings
    with torch.no_grad():
        embedding = resnet_model(input_tensor)
    
    embedding = embedding.numpy().flatten()
    print(f'Embedding shape: {embedding.shape}')

    return embedding

generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.       , 0.       , 0.       , ..., 1.6206954, 0.       ,
       0.       ], dtype=float32)

In [9]:
# similar/same birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/5.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.8410090208053589



In [10]:
# dissimilar/different birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BOOBY/1.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.2604087293148041



In [11]:
# dissimilar/different birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/misc/YELLOW HEADED BLACKBIRD/1.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.2983385920524597



### Comments about using pre-trained model to generate embeddings
Now that we have fine-tuned our embedding model against our bird species dataset, we can then use it to generate embeddings for our bird species images. This is done by passing the images through the model and extracting the embeddings from the last layer of the model. Because we have removed the classification head, our model now just spits out the embeddings of the image we pass in. 

So now (compared to just using the pre-trained model) the same bird species will have embeddings that are closer together than then embeddings of different bird species. This is because the model has been fine-tuned to recognise the specific features of the bird species in our dataset.

For example, in the similar case we get a score of `0.90834`, while the dissimilar image case gets a score of `0.31791`. This is a significant difference compared to using the pre-trained model, and shows that this new fine-tuned model has learned to differentiate between the two images better.