In [1]:
import torch

cuda_available = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_available else "CPU"

print(f"CUDA Available: {cuda_available}")
print(f"Device: {device_name}")


CUDA Available: True
Device: NVIDIA GeForce RTX 4060 Laptop GPU


In [9]:
import torch
from ultralytics import YOLO
import cv2
from torchvision import models, transforms
from PIL import Image
import logging

# Setup logging for debugging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.info(f"Using device: {device}")

# Load custom-trained classification model (VGG16)
classifier_model = models.vgg16(pretrained=False)
classifier_model.classifier[6] = torch.nn.Linear(classifier_model.classifier[6].in_features, 10)  # 11 classes
classifier_model.load_state_dict(torch.load('car_classification_model.pth', map_location=device))
classifier_model.to(device)
classifier_model.eval()
logging.info("Loaded classification model and set to eval mode.")

# Define transformation pipeline for classification input
classification_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Map class indices to category names
category_mapping = {
    1: 'SUV mini',
    2: 'LCGC',
    3: 'MPV',
    4: 'SUV',
    5: 'Hatchback Subkompak',
    6: 'SUV',
    7: 'Hatchback',
    8: 'Compact SUV',
    9: 'Small MPV',
    10: 'Hatchback',
}

# Load YOLOv9 custom trained model for car detection
yolo_model = YOLO('yolov9c8_coco_car.pt')
logging.info("Loaded YOLOv9 detection model.")

# Define video capture source
cap = cv2.VideoCapture('traffic_test.mp4')
if not cap.isOpened():
    logging.error("Failed to open video source.")
    exit(1)

# Set up video writer to save the output video (MP4 format)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4 format
out = cv2.VideoWriter('output_video.mp4', fourcc, 30.0, (int(cap.get(3)), int(cap.get(4))))

# Helper function to safely crop image with bounds checking
def crop_image_safely(frame, x1, y1, x2, y2):
    h, w = frame.shape[:2]
    x1_c = max(0, min(x1, w - 1))
    x2_c = max(0, min(x2, w))
    y1_c = max(0, min(y1, h - 1))
    y2_c = max(0, min(y2, h))
    if x2_c <= x1_c or y2_c <= y1_c:
        return None
    return frame[y1_c:y2_c, x1_c:x2_c]

# Main processing loop
while True:
    ret, frame = cap.read()
    if not ret:
        logging.info("End of video stream or failed reading frame.")
        break

    # Perform object detection with YOLO, gets results for current frame
    results = yolo_model(frame)

    # Extract bounding boxes and confidence scores for detected cars
    detected_cars = results[0].boxes

    for car in detected_cars:
        # xywh format: center_x, center_y, width, height
        cx, cy, w, h = car.xywh[0].tolist()
        conf = car.conf[0].item()

        print(f"Confidence: {conf:.2f} | Bounding Box: ({x1}, {y1}), ({x2}, {y2}) type {car_label}")
        # Skip low confidence detections
        if conf < 0.7:
            continue

        # Convert center xywh to bounding box coordinates
        x1 = int(cx - w / 2)
        y1 = int(cy - h / 2)
        x2 = int(cx + w / 2)
        y2 = int(cy + h / 2)

        # Crop detected region safely
        cropped_car = crop_image_safely(frame, x1, y1, x2, y2)
        if cropped_car is None or cropped_car.size == 0:
            continue

        # Filter out very small detections
        if cropped_car.shape[0] < 5 or cropped_car.shape[1] < 5:
            continue

        # Convert to PIL Image and apply classification transforms
        pil_img = Image.fromarray(cv2.cvtColor(cropped_car, cv2.COLOR_BGR2RGB))
        input_tensor = classification_transform(pil_img).unsqueeze(0).to(device)

        # Run classification inference
        with torch.no_grad():
            outputs = classifier_model(input_tensor)
            _, predicted = torch.max(outputs, 1)
            predicted_class_idx = predicted.item() + 1  # label offset correction
            car_label = category_mapping.get(predicted_class_idx, 'Unknown')

        # Draw bounding box and label on frame
        label_text = f"{car_label} ({conf:.2f})"
        cv2.rectangle(frame, (x1, y1), (x2, y2), (76, 175, 80), 2)  # green box
        cv2.putText(frame, label_text, (x1, max(y1 - 10, 15)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (76, 175, 80), 2)

    # Write the processed frame to the output video file
    out.write(frame)

    # Display result
    cv2.imshow("Detection + Classification", frame)

    # Break loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()


2025-06-09 17:49:04,072 [INFO] Using device: cuda
2025-06-09 17:49:05,455 [INFO] Loaded classification model and set to eval mode.
2025-06-09 17:49:05,530 [INFO] Loaded YOLOv9 detection model.



0: 384x640 7 cars, 12.3ms
Speed: 1.6ms preprocess, 12.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
Confidence: 0.86 | Bounding Box: (158, 62), (274, 123) type Small MPV
Confidence: 0.81 | Bounding Box: (204, 45), (327, 113) type SUV mini
Confidence: 0.76 | Bounding Box: (1194, 120), (1274, 195) type SUV mini
Confidence: 0.58 | Bounding Box: (1085, 125), (1197, 214) type SUV mini
Confidence: 0.57 | Bounding Box: (1085, 125), (1197, 214) type SUV mini
Confidence: 0.57 | Bounding Box: (1085, 125), (1197, 214) type SUV mini
Confidence: 0.26 | Bounding Box: (1085, 125), (1197, 214) type SUV mini

0: 384x640 7 cars, 13.9ms
Speed: 0.9ms preprocess, 13.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)
Confidence: 0.85 | Bounding Box: (1085, 125), (1197, 214) type SUV mini
Confidence: 0.79 | Bounding Box: (205, 45), (328, 113) type SUV mini
Confidence: 0.74 | Bounding Box: (1194, 120), (1274, 196) type SUV mini
Confidence: 0.64 | Bounding Box: (1083, 