In [None]:
import platform

# Report the interpreter the kernel is actually running; a shell `python` is
# resolved through PATH and may belong to a different environment.
print(f"Python {platform.python_version()}")

In [None]:
%%capture
! pip install --upgrade pip
! pip install timm
! pip install torch
! pip install pillow
! pip install ipywidgets
! pip uninstall numpy -y    # uninstall existing numpy (some components need pre 2.0)
! pip install "numpy<2.0"   # install the latest numpy 1.x version
! pip install opencv-python
! pip install scikit-learn
! pip install scipy
! pip install matplotlib

In [10]:
import torch
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import time

# Wall-clock timer covering model creation, training and evaluation.
start = time.time()

# Pretrained ResNet-50-D whose classifier is replaced by a new 10-output layer.
# NOTE: despite the first variable's name, the model still has a classification
# head at this point; `model` is just a shorter alias for the same object.
model_no_fc_ready_to_fine_tune = timm.create_model("resnet50d.a1_in1k", pretrained=True, num_classes=10)
model = model_no_fc_ready_to_fine_tune

# Prepare dataset
# The preprocessing pipeline is derived from the model's own data configuration.
data_config = resolve_data_config({}, model=model)
transform = create_transform(**data_config)
train_dataset = datasets.ImageFolder(root='../kaggle_data/bird-species/train', transform=transform)
val_dataset = datasets.ImageFolder(root='../kaggle_data/bird-species/test', transform=transform)
# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only machine has no benefit and makes DataLoader emit a warning.
use_pinned_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=8, pin_memory=use_pinned_memory)

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
criterion = nn.CrossEntropyLoss()
# All parameters are passed to the optimizer, so the whole network is updated.
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

num_epochs = 10
for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}')

    # Evaluate on validation set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Predicted class = index of the largest logit (replaces the legacy
            # `torch.max(outputs.data, 1)` idiom; `.data` is unnecessary under no_grad).
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Validation Accuracy: {100 * correct / total}%')

end = time.time()
# Format as HH:MM:SS without going through gmtime/strftime, which would wrap
# the hour field back to 00 for runs longer than 24 hours.
elapsed_seconds = int(end - start)
elapsed_hours, leftover_seconds = divmod(elapsed_seconds, 3600)
elapsed_minutes, elapsed_secs = divmod(leftover_seconds, 60)
elapsed_time = f"{elapsed_hours:02d}:{elapsed_minutes:02d}:{elapsed_secs:02d}"
print(f"Elapsed time: {elapsed_time}")

Using device: cpu
Epoch [1/10], Loss: 2.1997193962919948
Validation Accuracy: 78.0%
Epoch [2/10], Loss: 1.8710663038141586
Validation Accuracy: 94.0%
Epoch [3/10], Loss: 1.2643017979229199
Validation Accuracy: 96.0%
Epoch [4/10], Loss: 0.6563213280602997
Validation Accuracy: 100.0%
Epoch [5/10], Loss: 0.3186425096264072
Validation Accuracy: 100.0%
Epoch [6/10], Loss: 0.19346192303825827
Validation Accuracy: 100.0%
Epoch [7/10], Loss: 0.12614998177570455
Validation Accuracy: 100.0%
Epoch [8/10], Loss: 0.11078326758362499
Validation Accuracy: 100.0%
Epoch [9/10], Loss: 0.09379157833024568
Validation Accuracy: 100.0%
Epoch [10/10], Loss: 0.07495722083338335
Validation Accuracy: 100.0%
Elapsed time: 00:54:05


In [11]:
# Display the fine-tuned model's module structure (its repr) as the cell output.
model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [3]:
# Remove the classification head, because we just need the output of the embedding model, and not the classification head
# model.reset_classifier(0)  # save model with the classification head, remove this later after loading

In [4]:
# Display the model's module structure again as the cell output.
model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [12]:
# import numpy as np

# def extract_embeddings(dataloader):
#     model.eval()
#     embeddings = []
#     labels_list = []
#     with torch.no_grad():
#         for inputs, labels in dataloader:
#             inputs = inputs.to(device)
#             emb = model(inputs)
#             embeddings.append(emb.cpu().numpy())
#             labels_list.append(labels.numpy())
#     return np.concatenate(embeddings), np.concatenate(labels_list)

# # Extract embeddings for the training and validation datasets
# # train_embeddings, train_labels = extract_embeddings(train_loader)
# # val_embeddings, val_labels = extract_embeddings(val_loader)


In [7]:
# print(f"Train embedding Shape: {train_embeddings.shape}")
# print(f"Train embedding item shape: {train_embeddings[0].shape}")

# print(f"Train embedding slice: {train_embeddings[0][:10]}")

# print(f"Test embedding Shape: {val_embeddings.shape}")



Train embedding Shape: (1601, 2048)
Train embedding item shape: (2048,)
Train embedding slice: [0.1307108  0.03854156 0.         0.04950881 0.         0.
 0.         0.02138993 0.         0.0384184 ]
Test embedding Shape: (50, 2048)


In [13]:
import torch

# Assuming `model` is your fine-tuned model
# Output file for the weights, written relative to the notebook's working directory.
model_path = 'resnet50d_fine_tune_10_bird_species_with_fc.pth'

# Save the model's state dictionary
# (only the state dict is written, not the model object, so loading it later
# requires first constructing a model with the same architecture)
torch.save(model.state_dict(), model_path)


In [15]:
import torch
import torchvision.models as models
import torch.nn as nn
import timm

# Define the model architecture (same as the one used for fine-tuning):
# a ResNet-50-D without pretrained weights and with a 10-output classifier
# (10 is the number of classes in the dataset), so the saved state dict's
# classifier weights have a matching layer to load into.
resnet_model = timm.create_model('resnet50d', pretrained=False, num_classes=10)

# Load the saved state dictionary.
# map_location="cpu" lets a checkpoint written during a GPU run load on a
# CPU-only machine; weights_only=True restricts unpickling to tensors and
# plain containers instead of arbitrary Python objects.
model_path = './resnet50d_fine_tune_10_bird_species_with_fc.pth'
resnet_model.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=True))

# If you are planning to continue training, call resnet_model.train() instead.

# Set the model to evaluation mode for inference. This is the cell's last
# expression, so the returned model is also displayed as the cell output.
resnet_model.eval()


ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [16]:
# Display the reloaded model's module structure as the cell output.
resnet_model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [17]:
# Only the embedding is needed from here on, not class predictions,
# so the classification head is removed from the reloaded model.
resnet_model.reset_classifier(num_classes=0)

In [18]:
# Display the model's module structure again, after the classifier reset in the previous cell.
resnet_model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
from torchvision import transforms
import numpy as np

def cosine_similarity_(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a single value.

    Each input is converted to a NumPy array and reshaped to one row, because
    the pairwise ``cosine_similarity`` helper works on 2-D inputs and returns
    a matrix; the only entry of that 1x1 matrix is returned.
    """
    row_a, row_b = (
        np.array(vector).reshape(1, -1) for vector in (embedding1, embedding2)
    )
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

def generate_embedding(image_path):
    """Return the embedding of a single image as a flat NumPy array.

    The image is opened, converted to RGB, resized to 224x224, converted to a
    tensor, normalised with the mean/std constants below, and passed through
    the module-level ``resnet_model`` with gradient tracking disabled. The
    shape of the resulting embedding is printed.

    Args:
        image_path: Anything accepted by ``PIL.Image.open`` (e.g. a file path).

    Returns:
        The model output for the image, moved to the CPU and flattened to a
        1-D ``numpy.ndarray``.
    """
    # Define the transformation to be applied to the input image.
    # NOTE(review): fine-tuning used timm's `create_transform(**data_config)`;
    # this hand-written pipeline may not match it exactly - confirm.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # or the size used during training
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # The context manager closes the underlying file once the pixels have been
    # converted, instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as image:
        input_tensor = transform(image.convert('RGB')).unsqueeze(0)  # add batch dimension

    # Run on whichever device the model currently lives on, so the function
    # keeps working if `resnet_model` is later moved to a GPU.
    model_device = next(resnet_model.parameters()).device
    input_tensor = input_tensor.to(model_device)

    # Generate embeddings
    with torch.no_grad():
        embedding = resnet_model(input_tensor)

    # `.cpu()` is a no-op for CPU tensors and is required before `.numpy()` otherwise.
    embedding = embedding.cpu().numpy()
    print(f'Embedding shape: {embedding[0].shape}')

    return embedding.flatten()

# Quick check on one test image; the returned array is shown as the cell output.
generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.        , 0.10148961, 0.        , ..., 0.0483814 , 0.        ,
       0.01243757], dtype=float32)

In [20]:
# similar/same birds
# Both images come from the same species folder (ABBOTTS BABBLER) of the test split.
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/5.jpg")

# Compare the two embeddings and print the resulting score.
similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.9348911046981812



In [21]:
# dissimilar/different birds
# The images come from two different species folders (ABBOTTS BABBLER vs ABBOTTS BOOBY) of the test split.
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BOOBY/1.jpg")

# Compare the two embeddings and print the resulting score.
similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.233840212225914



In [22]:
# dissimilar/different birds
# The second image is read from the `misc` folder rather than `test`;
# presumably a species that was not part of the fine-tuning data - TODO confirm.
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/misc/YELLOW HEADED BLACKBIRD/1.jpg")

# Compare the two embeddings and print the resulting score.
similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.4032496213912964



### Comments about using the fine-tuned model to generate embeddings
Now that we have fine-tuned our embedding model on our bird species dataset, we can use it to generate embeddings for our bird species images. This is done by passing an image through the model and taking the model's output as the embedding. Because we have removed the classification head, the model no longer produces class scores; it simply returns a 2048-dimensional embedding for each image we pass in.

So now (compared to just using the pre-trained model) a pair of similar images will have a substantially higher similarity score than a pair of dissimilar images. This is because the model has been fine-tuned to recognise the specific features of the bird species in our dataset.

For example, in the similar case above we get a score of `0.93489`, while the dissimilar case (two different species from the test set) gets a score of `0.23384`; the comparison against the image from the `misc` folder scores `0.40325`. This is a significant difference compared to using the pre-trained model, and shows that the model has learned to differentiate between the two images better.