In [None]:
!pip install datasets accelerate matplotlib -U
!pip install torch torchvision pillow
!pip install wandb onnx -Uq

In [None]:
#!pip install openai-clip
!pip install git+https://github.com/openai/CLIP.git
!pip install ftfy regex tqdm
!python -m pip install setuptools==69.5.1

In [None]:
import json
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import clip
from transformers import CLIPProcessor, CLIPModel


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# jit=False is requested explicitly; `preprocess` is later applied to PIL images
# before they are fed to `model`.
model, preprocess = clip.load("ViT-B/32", jit=False, device=device)

In [None]:
#image = processor(Image.open('/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/val/2018-11-22 11.02.10.jpg'))

In [None]:
# List the model names known to the installed `clip` package (nothing is loaded here)
clip.available_models()



In [None]:
# Sanity check: tokenize a sample sentence and show the first row of the result
clip.tokenize("My name is antony")[0]

In [None]:
###############################
##### CLIP Training Dataset 
###############################
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor
from torch.optim import Adam
from torch.nn import functional as F
import torch
import random

# Mapping from class ID to class name.
# The names are listed in class-ID order, so enumerate() assigns each name its ID.
class_names = dict(enumerate((
    'DUMPSTER', 'VEHICLE', 'SKID_STEER', 'EXCAVATOR',                              # 0-3
    'VAN', 'LUMBER_BUNDLE', 'CONE', 'TRUCK',                                       # 4-7
    'GARBAGE_CONTAINER', 'LADDER', 'POWER_GENERATOR', 'TELESCOPIC_HANDLER',        # 8-11
    'CONCRETE_BUCKET', 'BOOMLIFT', 'PLYWOOD', 'TOILET_CABIN',                      # 12-15
    'FORMWORK_PROP_BUNDLE', 'CONDUIT_ROLL', 'FORMWORK_PANEL', 'CONCRETE_COLUMN',   # 16-19
    'PLATE_COMPACTOR', 'TROWEL_POWER', 'SLAB_SLEEVES', 'MINI_EXCAVATOR',           # 20-23
    'CONTAINER', 'SCISSORLIFT', 'PICKUP_TRUCK', 'MOBILE_CRANE',                    # 24-27
    'EQUIPMENT', 'TIEBACK_RIG', 'TOWER_CRANE', 'CONCRETE_PUMP',                    # 28-31
    'DRILLRIG', 'LOADER', 'OFFICE_TRAILER', 'DOZER',                               # 32-35
    'BUS', 'ROLLER', 'CONCRETE_RIDE', 'BACKHOE_LOADER',                            # 36-39
    'FORKLIFT', 'GRADER', 'HAND_ROLLER', 'HOIST_CABIN',                            # 40-43
)))

class CustomDataset(Dataset):
    """Image/caption pairs built from an image folder and per-image annotation files.

    For every ``.jpg``/``.jpeg`` file in ``image_dir`` the annotation file with the
    same stem and a ``.txt`` extension is read from ``text_dir``. Each non-blank
    annotation line is parsed as whitespace-separated floats: the first value is
    looked up in ``class_names`` and the product of the fourth and fifth values
    is used to rank the annotations.

    Note (to confirm): this looks like the YOLO label layout
    ``class x_center y_center width height``, which would make the ranking key
    the bounding-box area.
    """

    def __init__(self, image_dir, text_dir):
        self.image_dir = image_dir
        self.text_dir = text_dir
        # os.listdir() returns entries in arbitrary order; sort them so that an
        # index always refers to the same image across runs and machines.
        self.image_files = sorted(
            f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.jpeg'))
        )

    def __len__(self):
        """Number of images found in ``image_dir``."""
        return len(self.image_files)

    @staticmethod
    def _parse_annotations(lines):
        """Parse annotation lines into lists of floats, skipping blank lines."""
        return [list(map(float, line.split())) for line in lines if line.strip()]

    def __getitem__(self, idx):
        """Return ``(image, caption_tokens, image_path, caption)`` for sample ``idx``.

        ``image`` is the output of ``preprocess`` and ``caption_tokens`` the
        squeezed output of ``clip.tokenize``. The caption joins the class names
        of the (up to) two top-ranked annotations with a space, or is
        ``"unknown"`` when the annotation file has no entries.
        """
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.image_dir, image_filename)
        text_filename = os.path.splitext(image_filename)[0] + '.txt'
        text_path = os.path.join(self.text_dir, text_filename)

        # Load and parse annotations (blank/trailing lines are ignored)
        with open(text_path, 'r') as file:
            parsed_annotations = self._parse_annotations(file)

        # Build the caption from the two annotations with the largest ranking key
        correct_caption = "unknown"
        if parsed_annotations:
            parsed_annotations.sort(key=lambda ann: ann[3] * ann[4], reverse=True)
            correct_caption = ' '.join(
                class_names[int(ann[0])] for ann in parsed_annotations[:2]
            )

        # Close the image file as soon as it has been turned into a tensor
        with Image.open(image_path) as raw_image:
            image = preprocess(raw_image)
        correct_caption_tensor = clip.tokenize(correct_caption).squeeze()
        return image, correct_caption_tensor, image_path, correct_caption

image_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train'
text_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/labels/train'

dataset = CustomDataset(image_dir=image_dir, text_dir=text_dir)

# To use with a DataLoader
from torch.utils.data import DataLoader

# shuffle=True: this loader feeds the training loop, so every epoch should see
# a fresh random batch composition instead of replaying the directory order.
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)

# Smoke test: pull a single batch to check that images and captions collate
# (None when the dataset is empty).
batch = next(iter(dataloader), None)


In [None]:
# Scratch check: an index tensor of the same kind the training loop builds as its ground truth
torch.arange(10,dtype=torch.long,device=device)

In [None]:
# Inspect one sample: (preprocessed image, tokenized caption, image path, caption string)
dataset[600]

In [None]:
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
import wandb
# Authenticate with Weights & Biases; the training cell logs its run there
wandb.login()


In [None]:
###############################
##### Contrastive Training  
###############################

def convert_models_to_fp32(model):
    """Cast every parameter of ``model``, and its gradient if present, to float32 in place.

    Used around ``optimizer.step()`` so the update runs on float32 tensors.

    Args:
        model: a ``torch.nn.Module`` whose parameters are converted.
    """
    for p in model.parameters():
        p.data = p.data.float()
        # A trainable parameter still has no gradient before the first backward
        # pass or when it did not take part in the loss; only cast gradients
        # that actually exist (checking requires_grad alone raises AttributeError).
        if p.grad is not None:
            p.grad.data = p.grad.data.float()
        
        
import torch
from torch import optim
from tqdm import tqdm

# Define a simple training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
model.to(device)

# fp16 weights are only used on the GPU; on the CPU the model stays in float32.
use_fp16 = device.type == "cuda"
if use_fp16:
    clip.model.convert_weights(model)
else:
    model.float()
model.train()  # fine-tuning, so put the model in training mode

learning_rate=1e-5
weight_decay=0.001
betas=(0.9,0.98)
optimizer = optim.Adam(model.parameters(), lr=learning_rate,betas=betas,eps=1e-6,weight_decay=weight_decay) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset

num_epochs = 5
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()


########Log to Wandb
config = dict(epochs=num_epochs,learning_rate=learning_rate, weight_decay= weight_decay, betas= betas)
wandb.init(project="open-ai-clip-finetuning", config=config)
wandb.watch(model, optimizer, log="all", log_freq=10)

nb_batches = len(dataloader)
# Loss of the most recent batch; the name is kept because it is stored in the checkpoint.
total_loss = 0

for epoch in range(num_epochs):
    epoch_loss = 0.0  # sum of the batch losses of this epoch

    tqdm_object = tqdm(dataloader, total=nb_batches)
    for batch in tqdm_object:
        optimizer.zero_grad()
        images, texts, _, _ = batch

        # Move images and texts to the specified device (CPU or GPU)
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # Symmetric contrastive loss: the i-th image belongs to the i-th text
        ground_truth = torch.arange(len(images),dtype=torch.long,device=device)
        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2

        # Backward pass
        total_loss.backward()

        if use_fp16:
            # Step on float32 copies of the weights, then go back to fp16
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)
        else:
            optimizer.step()

        # Work with a plain float from here on so that neither the running sum
        # nor the logger keeps a reference to the autograd graph.
        batch_loss = total_loss.item()
        epoch_loss += batch_loss

        tqdm_object.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {batch_loss:.4f}")
        wandb.log({"epoch": epoch, "loss": batch_loss})

    # Mean over all batches of the epoch (previously: last batch loss / number of batches)
    average_loss = epoch_loss / max(nb_batches, 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")

print("Training complete.")

save_path = "./clip-finetune"
torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
        }, f"{save_path}") #just change to your preferred folder/filename
print("Model Saved")

In [None]:
# Write the fine-tuned weights, the optimizer state and the bookkeeping values
# of the last training step to a single checkpoint file.
save_path = "./clip-finetune"
torch.save(
    dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=total_loss,
    ),
    save_path,  # just change to your preferred folder/filename
)

In [None]:
import clip
import torch 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

save_path = "./clip-finetune"
# jit=False is set explicitly (as in the first load of this notebook) so that the
# fine-tuned state dict is loaded into the regular, non-JIT model.
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
# map_location remaps the saved tensors onto the device used in this session, so
# a checkpoint written on a GPU machine can also be loaded on a CPU-only one.
checkpoint = torch.load(save_path, map_location=device)

# Uncomment these 3 lines if you use the default model settings (not the training settings) of CLIP.
# For example, if context_length was set to 100 during training because the captions were very long,
# assign 100 to checkpoint['model_state_dict']["context_length"].
# checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
# checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
# checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])
model

In [None]:
#################################
### Single Image
#################################
import requests
from PIL import Image 

# Image used by the single-image inference cells; the commented paths are
# alternative samples (the first one used to be assigned and immediately overwritten).
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_001--Optima--09-02-2018-0631.jpg"
image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_001--Optima--14-11-2017-5846.jpg"
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_002AFA69-A930-41C6-8982-20000E50EF97.jpeg"
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_003--Optima--12-08-2017.jpg"

# Open inside a context manager so the file handle is released right away;
# convert() returns an independent in-memory image.
with Image.open(image_path) as raw_image:
    image = raw_image.convert('RGB')
smaller_image = image.resize((image.width // 2, image.height // 2))

# Display the smaller image
display(smaller_image)

In [None]:
###############################
#### Inference with Contrastive 
###############################
from transformers import VisionTextDualEncoderModel, BertTokenizer
from PIL import Image
import torch 
from torchvision import transforms

# Load the saved model and processor
# model = CLIPModel.from_pretrained(save_path)
# processor = CLIPProcessor.from_pretrained(save_path)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#device = "cpu"
# Inference only: move the model to the chosen device and switch it to eval mode.
model.to(device)
model.eval()

# Plain resize + tensor conversion pipeline (`predict` below uses `preprocess` instead).
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# Define a function for inference
def predict(image_path, captions):
    """Score one image against candidate captions with the notebook-level CLIP ``model``.

    Args:
        image_path: path of the image file to score.
        captions: caption string or list of caption strings passed to ``clip.tokenize``.

    Returns:
        ``(logits_per_image, logits_per_text)`` exactly as returned by
        ``model(image, text)``, computed without gradient tracking.
    """
    # Preprocess the image; the file is closed as soon as the tensor exists
    with Image.open(image_path) as raw_image:
        image = preprocess(raw_image).unsqueeze(0).to(device)
    text = clip.tokenize(captions).to(device)

    # One forward pass is enough: the separate encode_image/encode_text calls
    # that used to run here produced values that were never used.
    with torch.no_grad():
        logits_per_image, logits_per_text = model(image, text)

    return logits_per_image, logits_per_text

# Example usage
#image_path = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/val/20210325175112_production_2746262081.jpeg'
#image_path = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_002AFA69-A930-41C6-8982-20000E50EF97.jpeg'
# Candidate captions are the class names, in class-ID order.
captions = [*class_names.values()]

logits_per_image, logits_per_text = predict(image_path, captions)
print(logits_per_image)

# Softmax over the captions, then keep the five most likely class IDs.
correct_probs = torch.softmax(logits_per_image, dim=-1)
correct_label_indices = correct_probs.topk(5).indices.squeeze().tolist()
correct_classes = list(map(class_names.__getitem__, correct_label_indices))

print(correct_classes)


In [None]:
# Scratch cell: indices of the five highest entries of `correct_probs` from the prediction above
torch.topk(correct_probs, 5).indices.squeeze().tolist()

In [None]:
#################################
### Calculate Accuracy, MRR and Top K 
#################################
from tqdm.notebook import tqdm
from PIL import Image
from torch.utils.data import DataLoader
import time 
# Relies on CustomDataset, predict, class_names and model defined in earlier cells

# Validation split: image folder and the folder holding the matching annotation files.
image_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/val'
text_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/labels/val'

val_dataset = CustomDataset(image_dir, text_dir)

# Function to calculate metrics
def calculate_metrics(val_dataset, model, k=3, total=200, predict_fn=None, labels=None):
    """Evaluate caption-based classification on the first ``total`` samples.

    Every class name is offered as a candidate caption and the ``k`` highest
    scoring captions are the predictions for an image. A prediction counts as
    a hit when it equals one of the whitespace-separated names in the sample's
    caption.

    Args:
        val_dataset: sized, indexable dataset whose items are
            ``(image, caption_tokens, image_path, caption)`` tuples.
        model: not used by this function (the default ``predict`` works on the
            notebook-level model); kept so existing calls keep working.
        k: number of top predictions used for MRR and top-k accuracy.
        total: maximum number of samples to evaluate; clamped to the dataset size.
        predict_fn: callable ``(image_path, captions) -> (logits_per_image,
            logits_per_text)``. Defaults to the notebook's ``predict``.
        labels: mapping from class ID to class name. Defaults to ``class_names``.

    Returns:
        ``(accuracy, mrr, top_k_accuracy, average_inference_time)``: top-1
        accuracy, mean reciprocal rank within the top ``k``, top-``k`` accuracy
        and mean seconds spent in ``predict_fn`` per sample. All values are 0.0
        when no sample was evaluated.
    """
    if predict_fn is None:
        predict_fn = predict
    if labels is None:
        labels = class_names

    captions = list(labels.values())
    top_n = min(k, len(captions))  # topk() cannot return more entries than there are captions
    n_samples = max(0, min(total, len(val_dataset)))

    top1_matched = 0
    top_k_matched = 0
    mrr_total = 0.0
    total_inference_time = 0.0

    for sample_idx in range(n_samples):
        # Fetch the sample once; indexing the dataset may be expensive.
        _, _, image_path, original_labels = val_dataset[sample_idx]
        # Compare whole names: a substring test on the caption string would
        # count 'TRUCK' as a hit for 'PICKUP_TRUCK'.
        true_labels = set(original_labels.split())

        start_time = time.time()
        logits_per_image, _ = predict_fn(image_path, captions)
        total_inference_time += time.time() - start_time

        # Flatten to 1-D so that tolist() is a list even for k == 1
        correct_probs = F.softmax(logits_per_image, dim=-1).reshape(-1)
        top_indices = torch.topk(correct_probs, top_n).indices.tolist()
        predicted_labels = [captions[j] for j in top_indices]

        # 1-based rank of the first correct prediction, or None when there is none
        first_hit = next(
            (rank for rank, label in enumerate(predicted_labels, start=1) if label in true_labels),
            None,
        )
        if first_hit is not None:
            top_k_matched += 1
            mrr_total += 1.0 / first_hit
            if first_hit == 1:
                top1_matched += 1

        print(f"original: {original_labels} vs predicted {predicted_labels}")

    if n_samples == 0:
        return 0.0, 0.0, 0.0, 0.0

    accuracy = top1_matched / n_samples
    mrr = mrr_total / n_samples
    top_k_accuracy = top_k_matched / n_samples
    average_inference_time = total_inference_time / n_samples
    return accuracy, mrr, top_k_accuracy, average_inference_time

# Example usage
accuracy, mrr, top_k_accuracy, average_inference_time = calculate_metrics(
    val_dataset, model, k=5, total=200
)

# Report the four metrics, one per line.
print(
    f"Accuracy: {accuracy}",
    f"Mean Reciprocal Rank (MRR): {mrr}",
    f"Top-5 Accuracy: {top_k_accuracy}",
    f"Average Inference Time: {average_inference_time} seconds",
    sep="\n",
)


In [None]:
def model_outputs(image_path, captions):
    """Debug helper that prints the CLIP features for one image and a set of captions.

    Prints the image features, the text features and their element-wise
    product (in that order), then returns the result of ``model(image, text)``.
    Everything runs without gradient tracking.
    """
    # Preprocess the image and tokenize the captions on the active device
    pixels = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    tokens = clip.tokenize(captions).to(device)

    with torch.no_grad():
        img_emb = model.encode_image(pixels)
        txt_emb = model.encode_text(tokens)
        for tensor in (img_emb, txt_emb, img_emb * txt_emb):
            print(tensor)
        result = model(pixels, tokens)
    return result
    
# Run the debug helper on the sample image with every class name as a caption.
captions = [*class_names.values()]
outputs = model_outputs(image_path=image_path, captions=captions)

