In [None]:
!pip install transformers torch torchvision
!pip install -U matplotlib
!pip install timm

In [None]:
#################################
### Initialize Dataset
#################################

import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor
from torch.optim import Adam
from torch.nn import functional as F
import torch

# Mapping from the integer class id used in the annotation files to the class
# name. The position of a name in the list below is its class id; the trailing
# comment on each row gives the id range that row covers.
class_names = dict(enumerate([
    'DUMPSTER', 'VEHICLE', 'SKID_STEER', 'EXCAVATOR', 'VAN',                        # 0-4
    'LUMBER_BUNDLE', 'CONE', 'TRUCK', 'GARBAGE_CONTAINER', 'LADDER',                # 5-9
    'POWER_GENERATOR', 'TELESCOPIC_HANDLER', 'CONCRETE_BUCKET', 'BOOMLIFT',
    'PLYWOOD',                                                                      # 10-14
    'TOILET_CABIN', 'FORMWORK_PROP_BUNDLE', 'CONDUIT_ROLL', 'FORMWORK_PANEL',
    'CONCRETE_COLUMN',                                                              # 15-19
    'PLATE_COMPACTOR', 'TROWEL_POWER', 'SLAB_SLEEVES', 'MINI_EXCAVATOR',
    'CONTAINER',                                                                    # 20-24
    'SCISSORLIFT', 'PICKUP_TRUCK', 'MOBILE_CRANE', 'EQUIPMENT', 'TIEBACK_RIG',      # 25-29
    'TOWER_CRANE', 'CONCRETE_PUMP', 'DRILLRIG', 'LOADER', 'OFFICE_TRAILER',         # 30-34
    'DOZER', 'BUS', 'ROLLER', 'CONCRETE_RIDE', 'BACKHOE_LOADER',                    # 35-39
    'FORKLIFT', 'GRADER', 'HAND_ROLLER', 'HOIST_CABIN', 'UNKNOWN',                  # 40-44
]))

def convert_to_words(text):
    """Turn an underscore-separated identifier into lower-case, space-separated words.

    Example: ``'SKID_STEER'`` becomes ``'skid steer'``.
    """
    return ' '.join(part.lower() for part in text.split('_'))

# Training split: folder with the images and folder with the per-image
# annotation text files (one `<image stem>.txt` per image).
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
image_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train'
text_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/labels/train'

class CustomDataset(Dataset):
    """Image/caption dataset built from an image folder and a label folder.

    Every ``.jpg``/``.jpeg`` file in ``image_dir`` is one sample. Its
    annotations are read from ``<image stem>.txt`` in ``text_dir``; each
    non-blank line holds whitespace-separated numbers whose first value is a
    class id (presumably YOLO-style ``class cx cy w h`` rows - TODO confirm).
    The caption of a sample is made of the names of its largest annotated
    objects, or ``'unknown'`` when the label file has no annotations.
    """

    # Class id used for the caption when a label file contains no annotations.
    UNKNOWN_CLASS_ID = 44
    # Maximum number of annotated objects that contribute to a caption.
    MAX_CAPTION_OBJECTS = 5

    def __init__(self, image_dir, text_dir):
        """Index the images found in ``image_dir``.

        Args:
            image_dir: Folder containing the ``.jpg``/``.jpeg`` images.
            text_dir: Folder containing one ``.txt`` label file per image.
        """
        self.image_dir = image_dir
        self.text_dir = text_dir
        # os.listdir returns entries in arbitrary order; sorting makes a given
        # index refer to the same image on every run and on every machine.
        self.image_files = sorted(
            name for name in os.listdir(image_dir) if name.endswith(('.jpg', '.jpeg'))
        )

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_files)

    @staticmethod
    def _largest_class_ids(lines, limit):
        """Return the class ids of the ``limit`` largest annotations, largest first.

        Blank lines are ignored. The size of an annotation is the product of
        its fourth and fifth values (presumably the box width and height -
        TODO confirm). Annotations of equal size keep their file order.
        """
        rows = [list(map(float, line.split())) for line in lines if line.strip()]
        rows.sort(key=lambda row: row[3] * row[4], reverse=True)
        return [int(row[0]) for row in rows[:limit]]

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A dict with the RGB ``"image"``, its ``"captions"`` string and the
            ``"image_path"`` it was loaded from.

        Raises:
            FileNotFoundError: If the label file of the image does not exist.
        """
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.image_dir, image_filename)
        text_filename = os.path.splitext(image_filename)[0] + '.txt'
        text_path = os.path.join(self.text_dir, text_filename)

        # Read the annotations first so a missing label file fails before the
        # (more expensive) image decode.
        with open(text_path, 'r') as file:
            annotation_lines = file.readlines()
        image = Image.open(image_path).convert("RGB")

        class_ids = self._largest_class_ids(annotation_lines, self.MAX_CAPTION_OBJECTS)
        if not class_ids:
            class_ids = [self.UNKNOWN_CLASS_ID]

        caption = ' '.join(convert_to_words(class_names[class_id]) for class_id in class_ids)
        return {
            "image": image,
            "captions": caption,
            "image_path": image_path
        }

# Build the dataset from the image and label folders configured above.
dataset = CustomDataset(image_dir=image_dir, text_dir=text_dir)


In [None]:
# Sanity check: display the first sample (image, caption and image path).
dataset[0]

In [None]:
#################################
### Calculate Accuracy, MRR, Top-K
#################################

import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image
import requests
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import time 
# Load the pre-trained detector and its matching image processor from the
# "facebook/detr-resnet-50" checkpoint.
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Point the path variables at the validation split for the evaluation below.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
image_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/val'
text_dir = '/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/labels/val'

# NOTE: this rebinds `dataset`; from here on it refers to the validation split.
dataset = CustomDataset(image_dir=image_dir, text_dir=text_dir)

def calculate_metrics(dataset, model, processor, device, k=3, total=200, verbose=True):
    """Compare a detector's predicted labels with the dataset captions.

    For each evaluated sample the detector is run once, the ``k``
    highest-scoring detections are translated to label strings through
    ``model.config.id2label``, and each label is looked up in the sample's
    caption. A label counts as a match when it occurs in the caption string
    (substring test, so ``"truck"`` matches ``"pickup truck"``).

    Args:
        dataset: Sized, indexable collection of mappings with a ``"captions"``
            string and either an ``"image"`` (preferred) or an
            ``"image_path"`` entry.
        model: Detection model, called as ``model(**inputs)``; must expose
            ``config.id2label``. It is expected to already be on ``device``.
        processor: Callable building the model inputs from an image; must also
            provide ``post_process_object_detection``.
        device: Device the model inputs are moved to.
        k: Number of highest-scoring detections kept per image.
        total: Maximum number of samples to evaluate; clamped to
            ``len(dataset)``.
        verbose: When true, print the caption and predictions of every sample.

    Returns:
        Tuple ``(accuracy, mrr, top_k_accuracy, average_inference_time)``.
        ``accuracy`` is the fraction of samples with at least one matching
        prediction, ``mrr`` the mean reciprocal rank of the first match,
        ``top_k_accuracy`` the fraction of samples with a match among the
        first ``k`` predictions and ``average_inference_time`` the mean
        forward-pass time in seconds. All four are ``0.0`` when no sample is
        evaluated.

        NOTE(review): at most ``k`` predictions are kept per image, so
        ``accuracy`` and ``top_k_accuracy`` are always equal. Confirm whether
        ``accuracy`` was meant to be top-1.
    """
    # Never index past the end of the dataset, and avoid dividing by zero.
    num_samples = min(total, len(dataset))
    if num_samples <= 0:
        return 0.0, 0.0, 0.0, 0.0

    run_device = torch.device(device)
    on_cuda = run_device.type == "cuda"
    id2label = model.config.id2label

    matched = 0
    mrr_total = 0.0
    top_k_matched = 0
    total_inference_time = 0.0

    for i in range(num_samples):
        # Fetch the sample once: every dataset access reloads it from disk.
        sample = dataset[i]
        original_captions = sample["captions"]
        if "image" in sample:
            image = sample["image"]
        else:
            image = Image.open(sample["image_path"]).convert("RGB")

        inputs = processor(images=image, return_tensors="pt").to(run_device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            # CUDA kernels run asynchronously; synchronise so the clock covers
            # the actual computation and not just the kernel launches.
            if on_cuda:
                torch.cuda.synchronize(run_device)
            start_time = time.perf_counter()
            outputs = model(**inputs)
            if on_cuda:
                torch.cuda.synchronize(run_device)
            total_inference_time += time.perf_counter() - start_time

        # Convert the bounding boxes to the original image scale
        target_sizes = torch.tensor([image.size[::-1]])
        post_processed = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
        labels = post_processed["labels"]
        scores = post_processed["scores"]

        # Keep the labels of the k most confident detections, best first.
        predicted_captions = []
        num_detections = len(scores)
        if num_detections > 0:
            top_k_indices = torch.topk(scores, min(k, num_detections)).indices.tolist()
            predicted_captions = [id2label[labels[j].item()] for j in top_k_indices]

        # Rank (1-based) of the first prediction found in the caption, if any.
        first_hit_rank = next(
            (
                rank
                for rank, predicted_label in enumerate(predicted_captions, start=1)
                if predicted_label in original_captions
            ),
            None,
        )
        if first_hit_rank is not None:
            matched += 1
            mrr_total += 1.0 / first_hit_rank
            # predicted_captions never holds more than k entries, so any hit
            # is also a top-k hit.
            top_k_matched += 1

        if verbose:
            print(f"original: {original_captions} vs predicted {predicted_captions}")

    accuracy = matched / num_samples
    mrr = mrr_total / num_samples
    top_k_accuracy = top_k_matched / num_samples
    average_inference_time = total_inference_time / num_samples
    return accuracy, mrr, top_k_accuracy, average_inference_time

# Evaluate on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Score 200 samples, keeping the three highest-scoring detections per image.
accuracy, mrr, top_k_accuracy, average_inference_time = calculate_metrics(
    dataset,
    model,
    processor,
    device,
    k=3,
    total=200,
)

# Report the aggregated metrics.
print(f"Accuracy: {accuracy}")
print(f"Mean Reciprocal Rank (MRR): {mrr}")
print(f"Top-3 Accuracy: {top_k_accuracy}")
print(f"Average Inference Time: {average_inference_time} seconds")


In [None]:
#################################
### Perform Single inference 
#################################

import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image
import requests
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Load the pre-trained model and image processor
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Image to run the detector on. Alternative sample images are kept commented
# out below; uncomment exactly one assignment to switch.
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_001--Optima--09-02-2018-0631.jpg"
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_001--Optima--14-11-2017-5846.jpg"
#image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_002AFA69-A930-41C6-8982-20000E50EF97.jpeg"
image_path = "/mnt/data/ypatel/ObjectDetection/Dataset/Dataset_D8/images/train/01_003--Optima--12-08-2017.jpg"
# Force three colour channels so greyscale or RGBA files are handled the same
# way as ordinary RGB images.
image = Image.open(image_path).convert("RGB")

# Prepare the image for the model
inputs = processor(images=image, return_tensors="pt")

# Perform inference. No gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Extract the raw outputs
logits = outputs.logits
bboxes = outputs.pred_boxes

# Per-query class probabilities of the first (only) image, with the last class
# column dropped (presumably DETR's "no object" class - TODO confirm).
probs = logits.softmax(-1)[0, :, :-1]

# Convert the bounding boxes to the original image scale.
# PIL reports size as (width, height); reversing it gives (height, width).
target_sizes = torch.tensor([image.size[::-1]])
post_processed = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
post_processed


In [None]:

# Overlay the confident detections on the source image.
fig, ax = plt.subplots(1, figsize=(16, 10))
ax.imshow(image)

# Detections scoring below this confidence are not drawn.
threshold = 0.9

for score, label, box in zip(
    post_processed['scores'], post_processed['labels'], post_processed['boxes']
):
    if score >= threshold:
        # The box is unpacked as corner coordinates; Rectangle takes an
        # anchor point plus a width and a height.
        xmin, ymin, xmax, ymax = box.tolist()
        width = xmax - xmin
        height = ymax - ymin

        rect = patches.Rectangle(
            (xmin, ymin),
            width,
            height,
            linewidth=2,
            edgecolor='r',
            facecolor='none',
        )
        ax.add_patch(rect)

        # Caption the box with the class name and its confidence.
        ax.text(
            xmin,
            ymin,
            f"{model.config.id2label[label.item()]}: {score:.2f}",
            fontsize=12,
            bbox=dict(facecolor='yellow', alpha=0.5),
        )

ax.axis("off")
plt.show()