In [23]:
! python --version

Python 3.10.12


In [None]:
! pip install --upgrade pip
! pip install timm
! pip install torch
! pip install pillow
! pip install ipywidgets
! pip uninstall numpy -y    # uninstall existing numpy (some components need pre 2.0)
! pip install "numpy<2.0"   # install the latest numpy 1.x version
! pip install opencv-python
! pip install scikit-learn
! pip install scipy
! pip install matplotlib

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy<2.0
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4


In [24]:
import torch
import timm
import torch.nn as nn

pretrained_models = timm.list_models("", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained models.")

timm currently contains 1434 pre-trained models.


In [25]:
pretrained_models = timm.list_models("resnet*", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained resnet models.")

timm currently contains 116 pre-trained resnet models.


In [26]:
pretrained_models = timm.list_models("vit*", pretrained=True)

print(f"timm currently contains {len(pretrained_models)} pre-trained vision transformer models.")

timm currently contains 206 pre-trained vision transformer models.


In [27]:
resnet_model = timm.create_model("resnet50d.a1_in1k", pretrained=True)
resnet_model.default_cfg

{'url': 'https://github.com/huggingface/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50d_a1_0-e20cff14.pth',
 'hf_hub_id': 'timm/resnet50d.a1_in1k',
 'architecture': 'resnet50d',
 'tag': 'a1_in1k',
 'custom_load': False,
 'input_size': (3, 224, 224),
 'test_input_size': (3, 288, 288),
 'fixed_input_size': False,
 'interpolation': 'bicubic',
 'crop_pct': 0.95,
 'test_crop_pct': 1.0,
 'crop_mode': 'center',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'num_classes': 1000,
 'pool_size': (7, 7),
 'first_conv': 'conv1.0',
 'classifier': 'fc',
 'origin_url': 'https://github.com/huggingface/pytorch-image-models',
 'paper_ids': 'arXiv:2110.00476'}

In [28]:
resnet_model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [29]:
resnet_model.reset_classifier(0)  # Remove the classification head

In [30]:
resnet_model

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Co

In [32]:
%cd /content/drive/MyDrive/Dev/bird-species-embedding-model/notebooks/
%pwd

/content/drive/MyDrive/Dev/bird-species-embedding-model/notebooks


'/content/drive/MyDrive/Dev/bird-species-embedding-model/notebooks'

In [33]:
from PIL import Image
from torchvision import transforms
import numpy as np

def generate_embedding(image_path):
    # Define the transformation to be applied to the input image
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # or the size used during training
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Load and preprocess the input image
    # image_path = "./data/base/e5/e5.png"
    image = Image.open(image_path)
    input_tensor = transform(image.convert('RGB')).unsqueeze(0)

    # Generate embeddings
    with torch.no_grad():
        embedding = resnet_model(input_tensor)

    embedding = embedding.numpy()
    # print(f'Embedding all shape: {embedding.shape}')
    # print(f'Embedding all type: {type(embedding)}')
    # print(f'Embeddings: {embedding[0][:10]}')
    print(f'Embedding shape: {embedding[0].shape}')

    # embeddings = embeddings[0].tolist()
    # print(f'Embeddings: {embeddings[:10]}')
    # print(f'Embedding shape: {len(embeddings)}')

    return embedding.flatten()

generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/1.jpg")

Embedding shape: (2048,)


array([0.03984645, 0.04050619, 0.00203123, ..., 0.4833673 , 0.07411793,
       0.01539826], dtype=float32)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_(embedding1, embedding2):
  embedding1 = np.array(embedding1).reshape(1, -1)
  embedding2 = np.array(embedding2).reshape(1, -1)
  return cosine_similarity(embedding1, embedding2)[0][0]

In [35]:
# similar/same birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/5.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.8865870833396912



In [36]:
# dissimilar/different birds
embedding1 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BABBLER/4.jpg")
embedding2 = generate_embedding("../kaggle_data/bird-species/test/ABBOTTS BOOBY/1.jpg")

similarity = cosine_similarity_(embedding1, embedding2)
print(f"Cosine Similarity of is {similarity}\n")

Embedding shape: (2048,)
Embedding shape: (2048,)
Cosine Similarity of is 0.8602468967437744



### Comments about using pre-trained model to generate embeddings
We're using ResNet50d (resnet50d.a1_in1k) to generate embeddings for two sets of images. If we take embeddings of the 2 images of the same species and take the cosine similarity between them, we can see that the similarity is very high, we achieve `0.88659` score. However, if we take embeddings of the 2 images of different species, the similarity is still high, but a bit lower, here we can see `0.86025` similarity rating. This may still be ok depending on the use case.

To get better embeddings, we can then fine-tune the model against our dataset, so that similar images embedding will be quite different to the embeddings of different images. This way, we can get better embeddings, which can be the basis of a good classifier.