In [1]:
import json
import time
from tqdm import tqdm
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import logging
import matplotlib.pyplot as plt
from PIL import Image
import requests
from transformers import (
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    ViTFeatureExtractor,
    BertTokenizer,
    AutoTokenizer,
    AutoImageProcessor
)
logging.set_verbosity_error()



In [2]:
class CFG:

    max_text_tokens_length = 32
    text_backbone = 'bert-base-uncased'
    image_backbone = 'google/vit-base-patch16-224'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch_size = 32
    max_epochs = 75
    max_bad_epochs = 2
    patience = 3
    factor = 0.1

In [4]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor
from torch.optim import Adam
from torch.nn import functional as F
import torch

# Mapping from class ID to class name
class_names = {
    0: 'DUMPSTER', 1: 'VEHICLE', 2: 'SKID_STEER', 3: 'EXCAVATOR', 4: 'VAN',
    5: 'LUMBER_BUNDLE', 6: 'CONE', 7: 'TRUCK', 8: 'GARBAGE_CONTAINER',
    9: 'LADDER', 10: 'POWER_GENERATOR', 11: 'TELESCOPIC_HANDLER',
    12: 'CONCRETE_BUCKET', 13: 'BOOMLIFT', 14: 'PLYWOOD', 15: 'TOILET_CABIN',
    16: 'FORMWORK_PROP_BUNDLE', 17: 'CONDUIT_ROLL', 18: 'FORMWORK_PANEL',
    19: 'CONCRETE_COLUMN', 20: 'PLATE_COMPACTOR', 21: 'TROWEL_POWER',
    22: 'SLAB_SLEEVES', 23: 'MINI_EXCAVATOR', 24: 'CONTAINER', 25: 'SCISSORLIFT',
    26: 'PICKUP_TRUCK', 27: 'MOBILE_CRANE', 28: 'EQUIPMENT', 29: 'TIEBACK_RIG',
    30: 'TOWER_CRANE', 31: 'CONCRETE_PUMP', 32: 'DRILLRIG', 33: 'LOADER',
    34: 'OFFICE_TRAILER', 35: 'DOZER', 36: 'BUS', 37: 'ROLLER', 38: 'CONCRETE_RIDE',
    39: 'BACKHOE_LOADER', 40: 'FORKLIFT', 41: 'GRADER', 42: 'HAND_ROLLER',
    43: 'HOIST_CABIN', 44: "UNKNOWN"
}

def convert_to_words(text):
    words = text.split('_')
    lowercase_words = [word.lower() for word in words]
    return ' '.join(lowercase_words)

class CustomDataset(Dataset):
    def __init__(self, image_dir, text_dir, processor, transform=None):
        self.image_dir = image_dir
        self.text_dir = text_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg') or f.endswith('.jpeg')]
        self.processor = processor

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.image_dir, image_filename)
        text_filename = os.path.splitext(image_filename)[0] + '.txt'
        text_path = os.path.join(self.text_dir, text_filename)

        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load and parse annotations
        with open(text_path, 'r') as file:
            annotations = file.readlines()
        
        parsed_annotations = [list(map(float, line.strip().split())) for line in annotations]

        caption_indices = [44]
        if len(parsed_annotations) > 0:
            parsed_annotations.sort(key=lambda x: x[3] * x[4], reverse=True)
            top_annotations = parsed_annotations[:5]
            caption_indices = [int(ann[0]) for ann in top_annotations]
        
        captions = [convert_to_words(class_names[idx]) for idx in caption_indices]    
        caption = ' '.join(captions)
        encoded_pair = self.processor(text=[convert_to_words(caption)], images=[image], return_tensors="pt", max_length=CFG.max_text_tokens_length, padding='max_length', truncation=True)


        return encoded_pair
    
def collate_fn(batch):
  batch = list(filter(lambda x: x is not None, batch))
  return torch.utils.data.dataloader.default_collate(batch)



In [26]:
from torch.utils.data import DataLoader

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
feature_extractor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

image_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train'
text_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/labels/train'

dataset = CustomDataset(image_dir=image_dir, text_dir=text_dir, processor=processor, transform=transform)
train_dataloader = DataLoader(dataset,batch_size=CFG.batch_size, collate_fn=collate_fn)


In [18]:
#dataset[150]

In [27]:
def train_epoch(model, train_loader, optimizer, epoch, max_epochs):
    model.train()
    nb_batches = len(train_loader)
    tqdm_object = tqdm(train_loader, total=len(train_loader))   
    epoch_loss = 0.0
    for i, batch in enumerate(tqdm_object):
      outputs = model(
          input_ids=batch['input_ids'].squeeze().to(CFG.device),
          attention_mask=batch['attention_mask'].squeeze().to(CFG.device),
          pixel_values=batch['pixel_values'].squeeze().to(CFG.device),
          return_loss=True)
      loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score
      loss.backward()
      optimizer.step()
      tqdm_object.set_postfix(
          batch="{}/{}".format(i+1,nb_batches),
          train_loss=loss.item(),
          lr=get_lr(optimizer)
          )
    epoch_loss = epoch_loss / nb_batches
    return epoch_loss

def learning_loop(model):
    model.to(CFG.device)
    optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=CFG.patience, factor=CFG.factor)

    best_dev_score = float('inf')
    train_history = []
    dev_history = []
    nb_bad_epochs = 0

    print("Learning phase")
    print('Used device:', CFG.device)
    print("--------------")
    for epoch in range(1, CFG.max_epochs+1):

        print("Epoch {:03d}/{:03d}".format(epoch, CFG.max_epochs))

        if nb_bad_epochs >= CFG.max_bad_epochs:
            print("Epoch {:03d}/{:03d}: exiting training after too many bad epochs.".format(epoch, CFG.max_epochs))
            torch.save(model.state_dict(), "final.pt")
            break

        else:

            epoch_start_time = time.time()

            epoch_train_loss = train_epoch(model=model, train_loader=train_dataloader, optimizer=optimizer, epoch=epoch, max_epochs=CFG.max_epochs)
            #epoch_dev_score = valid_epoch(model=model, dev_loader=val_dataloader, epoch=epoch, max_epochs=CFG.max_epochs)
            duration = time.time() - epoch_start_time

            lr_scheduler.step(epoch_train_loss)

            train_history.append(epoch_train_loss)
            #dev_history.append(epoch_dev_score)

#             if epoch_dev_score < best_dev_score:
#                 nb_bad_epochs = 0
#                 best_dev_score = epoch_dev_score
#                 torch.save(model.state_dict(), "best.pt")
#                 print("Finished epoch {:03d}/{:03d} - Train loss: {:.7f} - Valid loss: {:.7f} - SAVED (NEW) BEST MODEL. Duration: {:.3f} s".format(
#                 epoch, CFG.max_epochs, epoch_train_loss, epoch_dev_score, duration))
#             else:
#                 nb_bad_epochs += 1
#                 print("Finished epoch {:03d}/{:03d} - Train loss: {:.7f} - Valid loss: {:.7f} - NUMBER OF BAD EPOCH.S: {}. Duration: {:.3f} s".format(
#                 epoch, CFG.max_epochs, epoch_train_loss, epoch_dev_score, nb_bad_epochs, duration))
    
#     history = {'train':train_history,'dev':dev_history}

def get_lr(optimizer):
    return 5e-5
    for param_group in optimizer.param_groups:
        return param_group["lr"]

In [28]:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
feature_extractor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)

In [29]:
clip = VisionTextDualEncoderModel.from_vision_text_pretrained(CFG.image_backbone, CFG.text_backbone)
learning_loop(clip)

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [9]:
clip.save_pretrained("./vit-bert")
processor.save_pretrained("./vit-bert")

[]

In [9]:
from transformers import VisionTextDualEncoderModel, BertTokenizer
from PIL import Image
import torch 
from torchvision import transforms

# Load the saved model and processor
model = VisionTextDualEncoderModel.from_pretrained("./vit-bert")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
model.to(device)
#model.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define a function for inference
def predict(image_path, caption):
    # Preprocess the image
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)

    # Tokenize the caption
    text_inputs = tokenizer(caption, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
    input_ids = text_inputs.input_ids.to(device)
    attention_mask = text_inputs.attention_mask.to(device)

    with torch.no_grad():
        outputs = model(pixel_values=image, input_ids=input_ids, attention_mask=attention_mask)
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text

    return logits_per_image, logits_per_text

# Example usage
image_path = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/val/20210325175112_production_2746262081.jpeg'
#image_path = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_002AFA69-A930-41C6-8982-20000E50EF97.jpeg'
captions = class_names.values()

logits_per_image, logits_per_text = predict(image_path, captions)
print("Logits per image:", logits_per_image)
print("Logits per text:", logits_per_text)

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [8]:
class_names.values()

dict_values(['DUMPSTER', 'VEHICLE', 'SKID_STEER', 'EXCAVATOR', 'VAN', 'LUMBER_BUNDLE', 'CONE', 'TRUCK', 'GARBAGE_CONTAINER', 'LADDER', 'POWER_GENERATOR', 'TELESCOPIC_HANDLER', 'CONCRETE_BUCKET', 'BOOMLIFT', 'PLYWOOD', 'TOILET_CABIN', 'FORMWORK_PROP_BUNDLE', 'CONDUIT_ROLL', 'FORMWORK_PANEL', 'CONCRETE_COLUMN', 'PLATE_COMPACTOR', 'TROWEL_POWER', 'SLAB_SLEEVES', 'MINI_EXCAVATOR', 'CONTAINER', 'SCISSORLIFT', 'PICKUP_TRUCK', 'MOBILE_CRANE', 'EQUIPMENT', 'TIEBACK_RIG', 'TOWER_CRANE', 'CONCRETE_PUMP', 'DRILLRIG', 'LOADER', 'OFFICE_TRAILER', 'DOZER', 'BUS', 'ROLLER', 'CONCRETE_RIDE', 'BACKHOE_LOADER', 'FORKLIFT', 'GRADER', 'HAND_ROLLER', 'HOIST_CABIN', 'UNKNOWN'])