<a href="https://colab.research.google.com/github/jay05Hawk/Proj_kagg/blob/main/Universal_Image_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Table of Contents :

1. Imports
2. Hyperparameters 
3. List of pretrained models in Pytorch Image Library
  
  3.1 Examples
4. Model Building

  4.1 Model Class
  
  4.2 Convnext Xlarge

  4.3 Vision Transformer
5. Model Ensembling

  5.1. Ensemble Class

  5.2. Ensembling
  
  5.3. Saving the model

Notebook Overview :

This notebook builds an ensemble of a Vision Transformer (ViT) model and a ConvNeXt-XLarge model.

The pretrained models are loaded with the PyTorch Image Models library ([timm](https://timm.fast.ai/)).

The model wrappers are implemented with the `PyTorch` library.

In [3]:
pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.12-py3-none-any.whl (549 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, timm
Successfully installed huggingface-hub-0.12.0 timm-0.6.12


# 1. Imports

In [4]:
import os
import cv2
import timm
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models , transforms
from zipfile import ZipFile

# 2. Hyperparameters

In [5]:
class config:
    """Notebook-wide settings, read elsewhere as ``config.NAME``."""

    # Size (in pixels) that images are resized to before entering a backbone.
    IMG_WIDTH = 224
    IMG_HEIGHT = 224

    # Not referenced by the visible cells; presumably data-loader settings.
    SHUFFLE = True
    NUM_WORKERS = 2

    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    DEVICE = (
        torch.device("cuda")
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    NORMALIZE = True
    BATCH_BLENDING = True

    # timm model names of the two backbones that get ensembled.
    BACKBONE = [
        "vit_small_r26_s32_224_in21k",
        "convnext_xlarge_in22k",
    ]

# 3. List of pretrained models in the PyTorch Image Models library

In [7]:
# Ask timm for the name of every architecture that ships pretrained weights,
# then report how many there are.
avail_pretrained_models = timm.list_models(pretrained=True)
print(
    f"The total number of models available in pytorch image library is {len(avail_pretrained_models)}"
)


The total number of models available in pytorch image library is 770


In [8]:
# Preview the first ten entries of the model-name list.
avail_pretrained_models[0:10]

['adv_inception_v3',
 'bat_resnext26ts',
 'beit_base_patch16_224',
 'beit_base_patch16_224_in22k',
 'beit_base_patch16_384',
 'beit_large_patch16_224',
 'beit_large_patch16_224_in22k',
 'beit_large_patch16_384',
 'beit_large_patch16_512',
 'beitv2_base_patch16_224']

# 4. Model Building

## 4.1 Model Class

In [9]:
class Model(nn.Module):
    """Image encoder: a pretrained timm backbone pooled to a fixed-size embedding.

    The forward pass resizes the input, divides it by 255, applies per-channel
    mean/std normalisation (the commonly used ImageNet statistics), runs the
    backbone, average-pools the resulting feature vector down to
    ``embedding_dim`` values and optionally L2-normalises each embedding.

    Args:
        model_name: Architecture name handed to ``timm.create_model``.
        target_size: ``[height, width]`` the input is resized to. ``None``
            (the default) means ``[config.IMG_HEIGHT, config.IMG_WIDTH]``.
        normalize: When true, L2-normalise the embedding before returning it.
        embedding_dim: Length of the returned embedding. Defaults to 64, the
            value that used to be hard-coded.
    """

    def __init__(self, model_name, target_size=None, normalize=True, embedding_dim=64):
        super().__init__()
        if target_size is None:
            # Resolved per instance instead of a shared mutable default list.
            target_size = [config.IMG_HEIGHT, config.IMG_WIDTH]
        elif isinstance(target_size, (list, tuple)):
            # Private list copy: later mutation by the caller cannot leak in,
            # and a list of ints is what the resize call is given below.
            target_size = list(target_size)
        self.target_size = target_size
        # num_classes=0: per timm's documentation this drops the classifier
        # head, so the backbone returns pooled features rather than logits.
        self.classifier = timm.create_model(model_name, pretrained=True, num_classes=0)
        # Average-pools along the last (feature) dimension to embedding_dim values.
        self.layer = nn.AdaptiveAvgPool1d(embedding_dim)
        self.normalize = normalize

    def forward(self, x):
        """Return one embedding row per input image.

        NOTE(review): the division by 255 suggests pixel values in the 0..255
        range are expected — confirm against the caller.
        """
        x = transforms.functional.resize(x, size=self.target_size)
        x = x / 255.0
        x = transforms.functional.normalize(
            x, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )
        x = self.classifier(x)
        x = self.layer(x)
        if self.normalize:
            # F.normalize defaults to L2 normalisation along dim=1.
            x = F.normalize(x)
        return x

## 4.2 ConvNeXt XLarge

In [10]:
# Build the ConvNeXt-XLarge encoder (pretrained weights are fetched by timm)
# and put it in evaluation mode; the bare last expression displays the module.
model = Model(model_name="convnext_xlarge_in22k", normalize=True)
model.eval()

Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth" to /root/.cache/torch/hub/checkpoints/convnext_xlarge_22k_224.pth


Model(
  (classifier): ConvNeXt(
    (stem): Sequential(
      (0): Conv2d(3, 256, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((256,), eps=1e-06, elementwise_affine=True)
    )
    (stages): Sequential(
      (0): ConvNeXtStage(
        (downsample): Identity()
        (blocks): Sequential(
          (0): ConvNeXtBlock(
            (conv_dw): Conv2d(256, 256, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=256)
            (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=256, out_features=1024, bias=True)
              (act): GELU()
              (drop1): Dropout(p=0.0, inplace=False)
              (fc2): Linear(in_features=1024, out_features=256, bias=True)
              (drop2): Dropout(p=0.0, inplace=False)
            )
            (drop_path): Identity()
          )
          (1): ConvNeXtBlock(
            (conv_dw): Conv2d(256, 256, kernel_size=(7, 7), stride=(1, 1), padding=(3, 

## 4.3 Vision Transformer

In [11]:
# Build the hybrid ResNet/ViT encoder the same way and put it in evaluation
# mode; the bare last expression displays the module.
model_2 = Model(model_name="vit_small_r26_s32_224_in21k", normalize=True)
model_2.eval()

Model(
  (classifier): VisionTransformer(
    (patch_embed): HybridEmbed(
      (backbone): ResNetV2(
        (stem): Sequential(
          (conv): StdConv2dSame(3, 64, kernel_size=(7, 7), stride=(2, 2), bias=False)
          (norm): GroupNormAct(
            32, 64, eps=1e-05, affine=True
            (drop): Identity()
            (act): ReLU(inplace=True)
          )
          (pool): MaxPool2dSame(kernel_size=(3, 3), stride=(2, 2), padding=(0, 0), dilation=(1, 1), ceil_mode=False)
        )
        (stages): Sequential(
          (0): ResNetStage(
            (blocks): Sequential(
              (0): Bottleneck(
                (downsample): DownsampleConv(
                  (conv): StdConv2dSame(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (norm): GroupNormAct(
                    32, 256, eps=1e-05, affine=True
                    (drop): Identity()
                    (act): Identity()
                  )
                )
                (conv1): StdC

# 5. Model Ensembling
## 5.1 Ensemble Class

In [12]:
class Ensemble(nn.Module):
    """Fuse several encoders by averaging their L2-normalised embeddings.

    Every encoder receives the same input and must return a tensor of the same
    shape, ``(batch, dim)``. Each embedding is L2-normalised, the embeddings are
    averaged across encoders (never across the batch), and the mean is
    optionally L2-normalised again.

    Args:
        encoders: Iterable of ``nn.Module`` encoders; must not be empty.
        normalize: When true, L2-normalise the averaged embedding.

    Raises:
        ValueError: If ``encoders`` is empty.
    """

    def __init__(self, encoders, normalize=True):
        super().__init__()
        encoders = list(encoders)
        if not encoders:
            raise ValueError("Ensemble requires at least one encoder")
        for index, encoder in enumerate(encoders):
            # Assigning a module as an attribute registers it as a child, so
            # the encoders appear as encoder0, encoder1, ... in named_children().
            setattr(self, f"encoder{index}", encoder)
        self.num_encoders = len(encoders)
        self.normalize = normalize

    def forward(self, x):
        """Return the fused embedding, one row per input sample."""
        outputs = []
        for _name, encoder in self.named_children():
            outputs.append(encoder(x))
        # Stack on a NEW leading axis -> (num_encoders, batch, dim). The earlier
        # cat(dim=0) + mean(dim=0) merged the encoder and batch axes, so every
        # batch larger than one collapsed into a single embedding.
        stacked = torch.stack(outputs, dim=0)
        stacked = F.normalize(stacked, dim=-1)
        # Average over the encoder axis only; the batch axis is preserved.
        fused = stacked.mean(dim=0)
        if self.normalize:
            fused = F.normalize(fused, dim=-1)
        return fused

## 5.2 Ensembling

In [13]:
# Combine the two encoders (ConvNeXt first, ViT second) into one ensemble and
# put it in evaluation mode; the bare last expression displays the module.
encoders = [model, model_2]
ensemble_model = Ensemble(encoders=encoders, normalize=True)
ensemble_model.eval()

Ensemble(
  (encoder0): Model(
    (classifier): ConvNeXt(
      (stem): Sequential(
        (0): Conv2d(3, 256, kernel_size=(4, 4), stride=(4, 4))
        (1): LayerNorm2d((256,), eps=1e-06, elementwise_affine=True)
      )
      (stages): Sequential(
        (0): ConvNeXtStage(
          (downsample): Identity()
          (blocks): Sequential(
            (0): ConvNeXtBlock(
              (conv_dw): Conv2d(256, 256, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=256)
              (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=256, out_features=1024, bias=True)
                (act): GELU()
                (drop1): Dropout(p=0.0, inplace=False)
                (fc2): Linear(in_features=1024, out_features=256, bias=True)
                (drop2): Dropout(p=0.0, inplace=False)
              )
              (drop_path): Identity()
            )
            (1): ConvNeXtBlock(
              (conv

In [14]:
# Sanity-check the output shape with one random dummy image. Gradients are
# disabled so this inference-only pass does not build an autograd graph through
# both backbones.
with torch.no_grad():
    sanity_embedding = ensemble_model(
        torch.randn((1, 3, config.IMG_HEIGHT, config.IMG_WIDTH))
    )
sanity_embedding.shape

torch.Size([1, 64])

## 5.3 Saving the model

In [15]:
# Compile the ensemble to TorchScript, then serialise the scripted module to
# disk.
saved_model = torch.jit.script(obj=ensemble_model)
saved_model.save(f='saved_model.pt')