In [1]:
! python --version

Python 3.10.14


In [2]:
%%capture
! pip install --upgrade pip
! pip install timm
! pip install torch
! pip install pillow
! pip install ipywidgets
! pip uninstall numpy -y    # uninstall existing numpy (some components need pre 2.0)
! pip install "numpy<2.0"   # install the latest numpy 1.x version
! pip install opencv-python
! pip install scikit-learn
! pip install scipy
! pip install matplotlib
! pip install torchinfo

In [3]:
import torch
import timm
import torch.nn as nn

pretrained_models = timm.list_models("", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained models.")

timm currently contains 1457 pre-trained models.


In [4]:
pretrained_models = timm.list_models("resnet*", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained resnet models.")

timm currently contains 117 pre-trained resnet models.


In [5]:
pretrained_models = timm.list_models("vit*", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained vision transformer models.")

timm currently contains 212 pre-trained vision transformer models.


In [6]:
resnet_model = timm.create_model("resnet50d.a1_in1k", pretrained=True)
resnet_model.default_cfg

{'url': 'https://github.com/huggingface/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50d_a1_0-e20cff14.pth',
 'hf_hub_id': 'timm/resnet50d.a1_in1k',
 'architecture': 'resnet50d',
 'tag': 'a1_in1k',
 'custom_load': False,
 'input_size': (3, 224, 224),
 'test_input_size': (3, 288, 288),
 'fixed_input_size': False,
 'interpolation': 'bicubic',
 'crop_pct': 0.95,
 'test_crop_pct': 1.0,
 'crop_mode': 'center',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'num_classes': 1000,
 'pool_size': (7, 7),
 'first_conv': 'conv1.0',
 'classifier': 'fc',
 'origin_url': 'https://github.com/huggingface/pytorch-image-models',
 'paper_ids': 'arXiv:2110.00476'}

In [7]:
from torchinfo import summary

summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 1000]                 --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [8]:
resnet_model.reset_classifier(0)  # Remove the classification head

In [9]:
summary(resnet_model, input_size=(1, 3, 224, 224))  # Batch size of 1

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 2048]                 --
├─Sequential: 1-1                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─BatchNorm2d: 2-2                  [1, 32, 112, 112]         64
│    └─ReLU: 2-3                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         9,216
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         18,432
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         128
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 256, 56, 56]          --
│    └

In [10]:
from PIL import Image
from torchvision import transforms
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_(embedding1, embedding2):
  embedding1 = np.array(embedding1).reshape(1, -1)
  embedding2 = np.array(embedding2).reshape(1, -1)
  return cosine_similarity(embedding1, embedding2)[0][0]

def generate_embedding(image_path):
    # Define the transformation to be applied to the input image
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # or the size used during training
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Load and preprocess the input image
    image = Image.open(image_path)
    input_tensor = transform(image.convert('RGB')).unsqueeze(0)

    # Generate embeddings
    with torch.no_grad():
        embedding = resnet_model(input_tensor)
    
    embedding = embedding.numpy().flatten()
    print(f'Embedding shape: {embedding.shape}')

    return embedding

generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.03984641, 0.04050694, 0.00203133, ..., 0.48336792, 0.07411849,
       0.0153979 ], dtype=float32)

In [11]:
# similar/same birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/5.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.8865871429443359



In [12]:
# dissimilar/different birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BOOBY/1.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.8602464199066162



### Comments about using pre-trained model to generate embeddings

We can use a pre-trained model to generate embeddings for images. The embeddings can be used to compare images, and we can use the similarity between the embeddings to determine if the images are similar or not.

However if we are building a bird similarity checker, we need to be careful about the embeddings generated by the pre-trained model. The embeddings generated by the pre-trained model may not be good enough to differentiate between similar images. This is because the pre-trained model was trained on a different dataset, and the embeddings generated by the model may not be good enough to differentiate between similar images.

In this notebook, we are using ResNet50d (resnet50d.a1_in1k) to generate embeddings for two sets of images. If we take embeddings of the **2 images of the same species** and take the cosine similarity between them, we can see that the similarity is very high, we achieve `0.88659` score.

However, instead if we take embeddings of the **2 images of different bird species**, the similarity is still high, but a bit lower, here we can see `0.86025` similarity rating. It still works, because eventually the similarity results are sorted and the most similar images are shown first. However the similarity scores are very close together.

To get better embeddings that is representative of the task at hand (bird similarity checker), we can fine-tune the pre-trained model against our birds dataset, so that the vector embeddings of similar images will be close together while embeddings of different images will be further apart. This is because after fine-tuning against our dataset, the model will learn to differentiate between similar images and generate embeddings that are more representative of the dataset and task at hand.