In [2]:
!pip install kaggle



In [3]:
!pip install timm

Collecting timm
  Downloading timm-0.9.8-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from timm)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors (from timm)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, huggingface-hub, timm
Successfully installed huggingface-hub-0.18.0 safetensors-0.4.0 timm-0.9.8


In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.18.0
    Uninstalling huggingface-hub-0.18.0:
      Successfully uninstalled huggingface-hub-0.18.0
Successfully i

In [16]:
import torch
from torch import nn
import timm
from transformers import DistilBertModel , DistilBertConfig
import torch.nn.functional as F

In [8]:
class Configure:
  """Notebook-wide settings, read as class attributes (the class is never instantiated here)."""

  # --- data ---
  # Directory that holds the image files; CLIPdata joins it with each image filename.
  # Placeholder default (current directory) -- TODO: point this at the real image folder.
  image_path = "."

  # --- caption tokenizer ---
  # Passed as `max_length` to the tokenizer call in CLIPdata.
  max_length = 200

  # --- text and image encoders ---
  image_model = 'resnet50'                  # model name handed to timm.create_model
  image_embedding = 2048                    # input size of the image projection head
  text_model = 'distilbert-base-uncased'    # checkpoint name for DistilBertModel.from_pretrained
  text_embedding = 768                      # input size of the text projection head
  text_tokenizer = 'distilbert-base-uncased'  # presumably the tokenizer checkpoint; not read in the visible cells

  pretrained = True   # default `pretrained` argument of both encoders
  trainable = True    # default `trainable` argument of ImageEncoder

  # --- projection head ---
  projection_dim = 256        # output size of ProjectHead
  dropout = 0.1               # dropout probability inside ProjectHead
  num_projection_layers = 1   # not read in the visible cells

  # --- CLIP model ---
  # Divides the logits and multiplies the soft targets in CLIPModel.forward.
  temperature = 1.0



In [None]:
# image_filenames and captions must have the same length: when one image has
# several captions, repeat its filename once per caption.
# NOTE: __getitem__ uses OpenCV, so `cv2` must be imported in the notebook's import cell.


class CLIPdata(torch.utils.data.Dataset):
  """Dataset of (image, tokenized caption) pairs for CLIP-style training.

  Args:
    image_filenames: indexable collection of image file names, one per caption.
    captions: iterable of caption strings; materialised to a list once.
    tokenizer: callable invoked as
      ``tokenizer(list_of_str, padding=True, truncation=True, max_length=...)``
      whose result exposes ``.items()`` mapping field name -> per-sample values.
    transform: callable invoked as ``transform(image=array)`` that returns a
      mapping with an ``'image'`` entry (albumentations-style -- assumption).
    image_dir: directory containing the image files. Defaults to
      ``Configure.image_path`` when omitted.
  """

  def __init__(self, image_filenames, captions, tokenizer, transform, image_dir=None):
    if image_dir is None:
      # Fall back to the notebook-wide setting; fails early if it is not configured.
      image_dir = Configure.image_path
    self.image_dir = image_dir
    self.image_filenames = image_filenames
    # Materialise once so a one-shot iterable is not exhausted before tokenizing.
    self.captions = list(captions)
    self.encoded_options = tokenizer(
        self.captions,
        padding=True,
        truncation=True,
        max_length=Configure.max_length,
    )
    self.transform = transform

  def __getitem__(self, idx):
    """Return a dict with the tokenizer fields, ``'image'`` and ``'caption'`` for sample ``idx``.

    Raises:
      FileNotFoundError: if the image file cannot be read.
    """
    item = {
        key: torch.tensor(values[idx])
        for key, values in self.encoded_options.items()
    }

    image_file = f"{self.image_dir}/{self.image_filenames[idx]}"
    image = cv2.imread(image_file)
    if image is None:
      # cv2.imread signals a missing/unreadable file by returning None instead of raising.
      raise FileNotFoundError(f"Could not read image: {image_file}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
    image = self.transform(image=image)['image']

    # Moves the last axis first (assumes the transform yields height x width x channels).
    item['image'] = torch.tensor(image).permute(2, 0, 1).float()
    item['caption'] = self.captions[idx]

    return item

  def __len__(self):
    """Number of samples, i.e. the number of captions."""
    return len(self.captions)




In [10]:
#Build Image Encoder
class ImageEncoder(nn.Module):
  """Turn a batch of images into one pooled feature vector per image using a timm backbone.

  Args:
    image_model: model name passed to ``timm.create_model``.
    pretrained: whether to load pretrained weights.
    trainable: whether the backbone parameters receive gradients.
  """

  def __init__(self,image_model=Configure.image_model,pretrained = Configure.pretrained,trainable = Configure.trainable):
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    # num_classes=0 removes the classifier head so forward() returns pooled
    # features rather than class logits.
    self.model = timm.create_model(
        image_model,
        pretrained=pretrained,
        num_classes=0,
        global_pool='avg',
    )
    for param in self.model.parameters():
      param.requires_grad = trainable

  def forward(self,x):
    """Run the backbone on ``x`` and return its pooled features."""
    return self.model(x)


In [9]:
#Build Text Encoder

class TextEncoder(nn.Module):
  """Encode tokenized captions with DistilBERT and return one vector per caption.

  Args:
    text_model: checkpoint name for ``DistilBertModel.from_pretrained``.
    pretrained: load the checkpoint when True; otherwise build a randomly
      initialised model from a default ``DistilBertConfig``.
    trainable: whether the encoder parameters receive gradients.
  """

  def __init__(self,text_model = Configure.text_model,pretrained = Configure.pretrained,trainable = Configure.trainable):
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    if pretrained:
      self.model = DistilBertModel.from_pretrained(text_model)
    else:
      self.model = DistilBertModel(config=DistilBertConfig())

    for param in self.model.parameters():
      param.requires_grad = trainable

    # Position of the token whose hidden state represents the whole caption
    # (index 0, i.e. the first token of each sequence).
    self.target_token_idx = 0

  def forward(self,input_ids,attention_mask):
    """Return the last-layer hidden state at ``target_token_idx`` for every sequence."""
    output = self.model(input_ids = input_ids,attention_mask=attention_mask)
    last_hidden_state = output.last_hidden_state
    return last_hidden_state[:,self.target_token_idx,:]


In [13]:
#Bring both embeddings into same dimensions(Image and text)

class ProjectHead(nn.Module):
  """Project encoder features into the shared embedding space.

  Args:
    embedding_dim: size of the input vector (2048 for images and 768 for text
      with the notebook's configuration).
    projection_dim: size of the output vector (256 by default).
    dropout: dropout probability applied before the residual connection.
  """

  def __init__(self,embedding_dim,projection_dim=Configure.projection_dim,dropout = Configure.dropout):
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    self.projection = nn.Linear(embedding_dim,projection_dim)
    self.gelu = nn.GELU()
    self.fc = nn.Linear(projection_dim,projection_dim)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(projection_dim)

  def forward(self,x):
    """Linear projection, then a GELU -> Linear -> Dropout branch added back residually, then LayerNorm."""
    projected = self.projection(x)
    x = self.gelu(projected)
    x = self.fc(x)
    x = self.dropout(x)
    x = x + projected  # residual connection around the non-linear branch
    x = self.layer_norm(x)
    return x

In [17]:
def cross_entropy(preds,target,reduction='None'):
  """Cross-entropy between logits ``preds`` and soft (probability) targets.

  Args:
    preds: 2-D tensor of unnormalised scores; softmax is taken over the last dim.
    target: tensor of the same shape holding the target distribution per row.
    reduction: ``'None'``/``'none'`` returns one loss per row, ``'mean'``
      returns their average.

  Raises:
    ValueError: for any other ``reduction`` value.
  """
  log_probs = F.log_softmax(preds, dim=-1)
  # Multiply first, then sum over the class dimension.
  loss = (-target * log_probs).sum(1)
  mode = str(reduction).lower()
  if mode == 'none':
    return loss
  if mode == 'mean':
    return loss.mean()
  raise ValueError(f"Unsupported reduction: {reduction!r}")


class CLIPModel(nn.Module):
  """Image and text encoders with projection heads, trained with a symmetric soft-target loss.

  Args:
    temperature: divides the text-image logits and multiplies the soft targets.
    image_embedding: feature size produced by the image encoder.
    text_embedding: feature size produced by the text encoder.
  """

  def __init__(self,temperature=Configure.temperature,image_embedding = Configure.image_embedding,text_embedding = Configure.text_embedding):
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    self.image_encoder = ImageEncoder()
    self.text_encoder = TextEncoder()
    self.image_projection = ProjectHead(embedding_dim=image_embedding)
    self.text_projection = ProjectHead(embedding_dim = text_embedding)
    self.temperature = temperature

  def forward(self, batch):
    """Compute the scalar training loss for one batch.

    Args:
      batch: mapping with ``'image'``, ``'input_ids'`` and ``'attention_mask'`` entries.

    Returns:
      Mean over the batch of (image loss + text loss).
    """
    #Get image and text features
    image_features = self.image_encoder(batch['image'])
    text_features = self.text_encoder(
        input_ids = batch['input_ids'],
        attention_mask = batch['attention_mask'],
    )

    #Get image and text embeddings (with same dim)
    image_embeddings = self.image_projection(image_features)
    text_embeddings = self.text_projection(text_features)

    #Calculate loss
    # Row i of logits scores caption i against every image in the batch.
    logits = (text_embeddings @ image_embeddings.T) / self.temperature
    image_similarity = image_embeddings @ image_embeddings.T
    text_similarity = text_embeddings @ text_embeddings.T

    # Soft targets built from the averaged within-modality similarities.
    target = F.softmax((image_similarity + text_similarity) / 2 * self.temperature, dim=-1)

    text_loss = cross_entropy(logits, target, reduction='none')
    image_loss = cross_entropy(logits.T, target.T, reduction='none')
    loss = image_loss + text_loss

    return loss.mean()




