# Load Dependencies

In [1]:
from dotenv import load_dotenv

load_dotenv()
from transformers import AutoModelForObjectDetection, AutoImageProcessor, pipeline

import os
from tqdm import tqdm
import torch.nn as nn
import torch
from torchvision.transforms import v2
import torchvision.models as models
from pathlib import Path

from PIL import Image, ImageDraw, ImageFont
import json
import numpy as np
from transformers import pipeline

# import custom file

from source_code.processor import vid_img_converter as v2img

In [2]:
def crop_objects_from_image(image: Image.Image, labels: list, padding: int = 10):
    """
    Cut one padded patch out of ``image`` for every detection in ``labels``.

    Each bounding box is grown by ``padding`` pixels on all four sides and then
    clamped so it never extends past the image borders.

    Args:
        image (PIL.Image): Image to crop from.
        labels (list): Detections; each is a dict whose 'box' entry holds
            'xmin', 'ymin', 'xmax' and 'ymax'.
        padding (int): Extra pixels to keep around each box.

    Returns:
        List[PIL.Image]: One crop per entry of ``labels``, in the same order.
    """
    img_w, img_h = image.size

    def _padded_region(bbox):
        # Grow the box by the padding, clamped to the image bounds.
        left = max(bbox['xmin'] - padding, 0)
        top = max(bbox['ymin'] - padding, 0)
        right = min(bbox['xmax'] + padding, img_w)
        bottom = min(bbox['ymax'] + padding, img_h)
        return (left, top, right, bottom)

    return [image.crop(_padded_region(det['box'])) for det in labels]

def load_model(checkpoint_path, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Build the MobileNetV3-Large classifier and restore its trained weights.

    Args:
        checkpoint_path: Checkpoint file readable by ``torch.load``; it must
            contain a 'model_state_dict' entry.
        num_classes (int): Number of outputs of the replaced final layer.
        device: Device the weights are mapped to and the model is moved to.
            The default (CUDA when available) is evaluated once, when the
            function is defined.

    Returns:
        The model on ``device``, switched to evaluation mode.
    """
    classifier = models.mobilenet_v3_large()
    # Swap the last classifier layer so the head emits `num_classes` logits.
    classifier.classifier[3] = nn.Linear(in_features=1280, out_features=num_classes)

    state = torch.load(checkpoint_path, map_location=device)['model_state_dict']
    classifier.load_state_dict(state)

    classifier.to(device)
    classifier.eval()
    return classifier

# Font used for the class-name captions; loaded from ARIAL.TTF in the working directory.
font = ImageFont.truetype("./ARIAL.TTF",size=20)
def plot_results(image, results, threshold=0.6):
    """
    Draw the detections scoring above ``threshold`` onto a new image.

    Args:
        image: Source frame (PIL image or array-like); it is converted to a
            uint8 array and back, so the drawing happens on a new image.
        results (list): Detections, each a dict with 'score', 'label' and a
            'box' dict holding 'xmin', 'ymin', 'xmax', 'ymax'.
        threshold (float): Detections with ``score <= threshold`` are skipped.

    Returns:
        PIL.Image: The annotated image.
    """
    image = Image.fromarray(np.uint8(image))
    draw = ImageDraw.Draw(image)

    for result in results:
        score = result["score"]
        if score <= threshold:
            continue

        label = result["label"]
        # Read the corners by key rather than relying on the dict's value order.
        box = result["box"]
        x1, y1, x2, y2 = box["xmin"], box["ymin"], box["xmax"], box["ymax"]

        # Motorcycles and autorickshaws are outlined in red, everything else in yellow.
        outline_color = 'red' if label in ['motorcycle','autorickshaw'] else 'yellow'

        draw.rectangle((x1, y1, x2, y2), outline=outline_color, width=3)
        draw.text((x1 + 5, y1 - 20), label, fill="white",font=font)
        # Score caption: green above 0.7, red otherwise.
        draw.text((x1 + 5, y1 + 10), f"{score:.2f}", fill="green" if score > 0.7 else "red")

    return image

def infer_image(model, image: Image.Image, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Classify one cropped image and return the predicted class name.

    Args:
        model: Classifier producing one logit per entry of ``class_names``
            below (10 classes).
        image (PIL.Image): Crop to classify. It is converted to RGB first so
            that grayscale/RGBA/CMYK inputs still match the 3-channel
            normalization.
        device: Device the input tensor is moved to; it has to be the device
            the model lives on. The default (CUDA when available) is evaluated
            once, when the function is defined.

    Returns:
        str: Name of the class with the highest logit.
    """
    transform_x = v2.Compose([
        v2.Resize((224, 224)),
        # ToImage + ToDtype(scale=True) is the documented replacement for the
        # deprecated v2.ToTensor(): float32 tensor scaled to [0, 1].
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    class_names={
        0: 'Coupe',
        1: 'Sedan',
        2: 'Cab',
        3: 'Convertible',
        4: 'SUV',
        5: 'Minivan',
        6: 'Hatchback',
        7: 'Other',
        8: 'Van',
        9: 'Wagon'
    }
    # Normalize expects exactly three channels.
    rgb_image = image.convert("RGB")
    input_tensor = transform_x(rgb_image).unsqueeze(0).to(device)  # Add batch dimension
    with torch.no_grad():
        output = model(input_tensor)
        predicted_class = output.argmax(dim=1).item()

    return class_names[predicted_class]

# 1. Video -> Image frames
- Set `VIDEO_GDRIVE_ID` in the `.env` file.
- Download the video using `gdown` and save it to `data/test_video.mp4`.
- Convert the video to image frames in the folder `"data/frame_dir"`.

In [None]:
# Google Drive file id of the input video, read from the environment (.env).
vid_id = os.environ.get("VIDEO_GDRIVE_ID")
if not vid_id:
    # Fail early with a clear message instead of passing None to gdown.
    raise RuntimeError("VIDEO_GDRIVE_ID is not set; add it to the .env file before running this cell.")

vid_path = 'data/test_video.mp4'
# Make sure the target directory exists before downloading into it.
os.makedirs(os.path.dirname(vid_path), exist_ok=True)
v2img.gdown.download(id=vid_id, output=vid_path, quiet=False)

In [None]:
# Split the downloaded video into frames; the converter module is imported as `v2img`.
v2img.video_to_images(vid_path,"data/frame_dir/")

# 2. Object Detection on Image Frames

In [7]:
# Number of frames handled per detector call; adjust as needed.
batch_size = 8
# Folder the extracted video frames are read from.
image_folder = "./data/frame_dir"
# Folder that receives one JSON file of detections per frame.
json_output_folder = "./output/frame_json"

In [8]:
# Make sure the folder for the detection JSON files exists.
os.makedirs(json_output_folder, exist_ok=True)

# Checkpoint shared by the image processor and the detection model.
detector_checkpoint = "izzako/detr-resnet-50-finetuned-IDD_Detection"

# Load the processor and the model explicitly, then hand both to the pipeline.
processor = AutoImageProcessor.from_pretrained(detector_checkpoint)
model = AutoModelForObjectDetection.from_pretrained(
    detector_checkpoint,
    device_map="auto",   # avoid meta tensor issues
    torch_dtype="auto",  # let HF decide the best dtype
)

# No explicit `device` is passed to the pipeline.
obj_detector = pipeline(
    "object-detection",
    image_processor=processor,
    model=model,
)


Device set to use cuda:0


In [9]:
# Collect the frame paths in a stable (sorted) order.
image_paths = sorted(Path(image_folder).glob("*.jpg"))


# Run the detector batch by batch and write one JSON file per frame.
for i in tqdm(range(0, len(image_paths), batch_size)):

    batch_paths = image_paths[i:i+batch_size]

    # Open each frame in a context manager so its file handle is released
    # as soon as the RGB copy has been made.
    batch = []
    for p in batch_paths:
        with Image.open(p) as frame:
            batch.append(frame.convert("RGB"))

    # Pass batch_size explicitly so the pipeline is asked to batch the forward pass.
    results = obj_detector(batch, batch_size=batch_size)

    for path, prediction in zip(batch_paths, results):
        filename = Path(path).stem + ".json"
        output_path = Path(json_output_folder) / filename

        # Save the detections of this frame next to its stem name.
        with open(output_path, "w") as f:
            json.dump(prediction, f, indent=2)

  0%|          | 0/740 [00:00<?, ?it/s]

  1%|▏         | 10/740 [00:04<06:06,  1.99it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 740/740 [05:46<00:00,  2.13it/s]


# 3. Car Classifier on Detected Bounding Box

In [3]:
# Inputs: extracted frames and the per-frame detection JSON files.
image_folder = "./data/frame_dir"
detection_folder = "./output/frame_json"
batch_size = 8  # adjust as needed

# Outputs: relabelled detections and the frames with boxes drawn on them.
labeled_folder = "./output/frame_label"
rec_plotted_img = "./output/pred_image"
# Second name for the same directory as `labeled_folder`.
label_folder = labeled_folder

os.makedirs(labeled_folder, exist_ok=True)

# Classifier with 10 output classes (note: this rebinds the name `model`).
model = load_model("./output/mobilenet_v3_large_checkpoint_10.pth", 10)

In [12]:
# Replace the generic 'car' label of each detection with the class predicted
# by the classifier, and write the updated detections to `labeled_folder`.

for label in tqdm(sorted(os.listdir(detection_folder))):
    with open(os.path.join(detection_folder, label), 'r') as f:
        results = json.load(f)

    # Detection files are named '<frame stem>.json' for '<frame stem>.jpg',
    # so derive the frame name instead of pairing the two folders by index.
    image_name = Path(label).stem + ".jpg"
    with Image.open(os.path.join(image_folder, image_name)) as frame:
        image = frame.convert("RGB")

    # Only cars are re-classified, so only those boxes are cropped.
    car_results = [result for result in results if result['label'] == 'car']
    cropped_imgs = crop_objects_from_image(image=image, labels=car_results, padding=20)
    for result, cropped in zip(car_results, cropped_imgs):
        # `result` is the same dict object held in `results`, so this updates it in place.
        result['label'] = infer_image(model, cropped)

    with open(os.path.join(labeled_folder, label), "w") as f:
        json.dump(results, f, indent=2)



100%|██████████| 5920/5920 [02:48<00:00, 35.16it/s]


In [4]:
# Draw the relabelled detections onto each frame and save the annotated frames.
os.makedirs(rec_plotted_img,exist_ok=True)
for label_name in tqdm(sorted(os.listdir(label_folder))):
    # Label files are named '<frame stem>.json' for '<frame stem>.jpg', so the
    # frame is looked up by name instead of by position in a directory listing.
    image_name = Path(label_name).stem + ".jpg"

    with open(os.path.join(label_folder, label_name), 'r') as f:
        results = json.load(f)

    with Image.open(os.path.join(image_folder, image_name)) as image:
        plot_results(image, results, threshold=0.5).save(os.path.join(rec_plotted_img, image_name))

100%|██████████| 5920/5920 [03:13<00:00, 30.66it/s]


# 4. Predicted Frames to Video

In [6]:
# Build the final video with the project's vid_img_converter helper.
# Presumably the arguments are (folder of annotated frames, output video path),
# which matches the "Video saved to output/output_video.mp4" message below.
v2img.images_to_video('output/pred_image', 'output/output_video.mp4')

Creating video of 197.00 seconds at 30.05 FPS.


100%|██████████| 5920/5920 [00:55<00:00, 107.26it/s]

Video saved to output/output_video.mp4



