<a href="https://colab.research.google.com/github/himanshud2611/Machine-Learning/blob/main/FineTuning_CLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.2.3-py3-none-any.whl.metadata (7.8 kB)
Downloading ftfy-6.2.3-py3-none-any.whl (43 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.0/43.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.2.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-uzppbh8b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-uzppbh8b
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting n

In [3]:
import torch
import torchvision
import clip


##Preparing the Dataset

In [4]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define transformations for the dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Load the dataset
dataset = datasets.Flowers102(root="./data", split="train", download=True, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


Downloading https://thor.robots.ox.ac.uk/flowers/102/102flowers.tgz to data/flowers-102/102flowers.tgz


100%|██████████| 344862509/344862509 [00:13<00:00, 24782233.45it/s]


Extracting data/flowers-102/102flowers.tgz to data/flowers-102
Downloading https://thor.robots.ox.ac.uk/flowers/102/imagelabels.mat to data/flowers-102/imagelabels.mat


100%|██████████| 502/502 [00:00<00:00, 358450.90it/s]


Downloading https://thor.robots.ox.ac.uk/flowers/102/setid.mat to data/flowers-102/setid.mat


100%|██████████| 14989/14989 [00:00<00:00, 14435917.95it/s]


##Setting-Up CLIP Model
Load the pre-trained CLIP model and prepare it for fine-tuning:

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Set the model to training mode
model.train()


100%|███████████████████████████████████████| 338M/338M [00:22<00:00, 15.6MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

##Prepare Candidate Captions and Tokenize Them for training

In [6]:
candidate_captions = [
    "a close-up of a beautiful flower.",
    "a vibrant flower in bloom.",
    "a photo of a colorful flower.",
    "a delicate flower with petals.",
    "a single flower standing out.",
    "a flower with intricate patterns on its petals.",
    "a flower with bright yellow petals.",
    "a white flower with soft petals.",
    "a red flower in full bloom.",
    "a purple flower with a dark center.",
    "a bunch of flowers in a garden.",
    "a close-up of a flower with dew drops.",
    "a pink flower with lush green leaves.",
    "a flower in a field of wildflowers.",
    "a tropical flower with vibrant colors.",
    "a beautiful flower with a yellow center.",
    "a flower with large, bold petals.",
    "a soft and delicate white flower.",
    "a close-up of a blooming flower.",
    "a bright orange flower with green leaves.",
    "a small flower among tall grass.",
    "a flower surrounded by greenery.",
    "a flower with purple petals and yellow stamens.",
    "a close-up of a flower in the sunlight.",
    "a vibrant pink flower with a yellow center.",
    "a flower with multi-colored petals.",
    "a flower growing in the wild.",
    "a close-up of a flower against a blue sky.",
    "a cluster of flowers in various shades.",
    "a delicate flower with soft pink petals.",
    "a flower with pointed petals.",
    "a blue flower blooming in the garden.",
    "a large, bold flower with dark petals.",
    "a tiny flower hidden among leaves.",
    "a flower with orange petals and a yellow center.",
    "a blooming flower in a natural setting.",
    "a white flower with yellow stamens.",
    "a flower with vibrant red petals.",
    "a close-up of a flower covered in raindrops.",
    "a flower standing tall in the garden."
]

# Tokenize the captions
text_tokens = clip.tokenize(candidate_captions).to(device)

##Define the Training Loop
We need a loss function and optimizer to fine-tune the model. For simplicity, we'll use a cosine similarity loss, which aligns with how CLIP measures similarity between images and text:

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-7)
loss_function = torch.nn.CrossEntropyLoss()





# Training loop
for epoch in range(5):  # Adjust the number of epochs as needed
    total_loss = 0
    for batch in dataloader:
        images, labels = batch
        images = images.to(device)

        optimizer.zero_grad()

        # Normalize the image features without in-place operations
        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Normalize the text features without in-place operations
        text_features = model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)


        # Calculate similarities between images and text
        logits_per_image = torch.matmul(image_features, text_features.T)

        # Calculate the loss
        targets = torch.arange(len(images)).to(device)
        loss = loss_function(logits_per_image, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)

        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")


Epoch 1, Loss: nan
Epoch 2, Loss: nan
Epoch 3, Loss: nan
Epoch 4, Loss: nan
Epoch 5, Loss: nan


In [18]:
import torch
import clip
from PIL import Image
from torchvision import transforms

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Prepare a function for generating captions
def generate_captions(image_path, text_features):
    # Load and preprocess the image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)  # Add batch dimension
    image_features = model.encode_image(image)
    image_features /= image_features.norm(dim=-1, keepdim=True)

    # Compute similarities
    similarities = torch.matmul(image_features, text_features.T)
    best_caption_index = similarities.argmax(dim=-1).item()

    return candidate_captions[best_caption_index]

# Example usage
text_features = model.encode_text(clip.tokenize(candidate_captions).to(device))

image_path = "/content/data/flowers-102/jpg/image_00001.jpg"  # Path to the image you want to evaluate
caption = generate_captions(image_path, text_features)
print(f"Generated Caption: {caption}")


Generated Caption: a flower with large, bold petals.


In [19]:
torch.save(model.state_dict(), "fine_tuned_clip.pth")


In [20]:
import os
print(os.getcwd())


/content
