For YOLOv3, you can refer to: (1) https://pjreddie.com/media/files/papers/YOLOv3.pdf and (2) https://pjreddie.com/darknet/yolo/

In [None]:
# Que 1
# Clone the reference Darknet implementation and build it from source.

!git clone https://github.com/pjreddie/darknet
# %cd changes the notebook's working directory, so later relative paths resolve inside darknet/.
%cd darknet
!make

In [None]:
# Download the pre-trained YOLOv3 weights into the current working directory.
!wget https://pjreddie.com/media/files/yolov3.weights

In [None]:
# Run the Darknet command-line detector on one test image with the YOLOv3 config and weights.
!./darknet detect cfg/yolov3.cfg yolov3.weights /kaggle/input/cat-and-dogs/dataset/test_set/cats/cat.4004.jpg

Implementing the above with OpenCV

In [None]:
!pip install opencv-python numpy

In [None]:
!wget https://pjreddie.com/media/files/yolov3.weights 
!wget https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg 
!wget https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names 

In [None]:
import cv2
import numpy as np

# Load the YOLOv3 network from the Darknet weights and config files.
net = cv2.dnn.readNet("/kaggle/working/yolov3.weights", "/kaggle/working/yolov3.cfg")

# Load the COCO class labels: one label per line, the line index is the class id.
with open("/kaggle/working/coco.names", "r") as f:
    classes = [line.strip() for line in f]

# Names of all layers and of the unconnected (output) layers.
# getUnconnectedOutLayersNames() is used instead of indexing layer_names with
# getUnconnectedOutLayers(): the latter returns an Nx1 array on some OpenCV
# versions and a flat array on others, which makes manual indexing fragile.
layer_names = net.getLayerNames()
output_layers = list(net.getUnconnectedOutLayersNames())

In [None]:
import matplotlib.pyplot as plt

# Load image
image_path = "/kaggle/input/cat-and-dogs/dataset/test_set/cats/cat.4003.jpg"
image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")
height, width = image.shape[:2]

# Convert image to a 416x416 blob (pixel values scaled by 1/255, R and B channels swapped)
blob = cv2.dnn.blobFromImage(image, scalefactor=1/255.0, size=(416, 416), swapRB=True, crop=False)
net.setInput(blob)

# Run YOLO model
layer_outputs = net.forward(output_layers)

# Process detections.
# In each detection row the first four values are the box centre and size as
# fractions of the image (they are scaled by the image size below); values from
# index 5 onward are the per-class scores.
boxes, confidences, class_ids = [], [], []

for output in layer_outputs:
    for detection in output:
        scores = detection[5:]
        class_id = int(np.argmax(scores))  # plain int so it is safe as a list index
        confidence = float(scores[class_id])

        if confidence > 0.5:
            center_x, center_y, w, h = (detection[:4] * np.array([width, height, width, height])).astype("int")
            # Convert centre/size to the top-left corner expected by cv2.rectangle.
            x, y = int(center_x - w / 2), int(center_y - h / 2)

            boxes.append([x, y, int(w), int(h)])
            confidences.append(confidence)
            class_ids.append(class_id)

# Draw bounding boxes without NMS (every detection above the threshold is drawn)
for i in range(len(boxes)):
    x, y, w, h = boxes[i]
    label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
    color = (0, 255, 0)  # Green box
    cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
    # Keep the label inside the image when the box touches the top edge.
    cv2.putText(image, label, (x, max(y - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Show output image (OpenCV stores BGR, matplotlib expects RGB)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Hide axis
plt.show()

# Applying non maximum suppression

In [None]:
import matplotlib.pyplot as plt

# Load image
image_path = "/kaggle/input/cat-and-dogs/dataset/test_set/cats/cat.4003.jpg"
image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")
height, width = image.shape[:2]

# Convert image to a 416x416 blob (pixel values scaled by 1/255, R and B channels swapped)
blob = cv2.dnn.blobFromImage(image, scalefactor=1/255.0, size=(416, 416), swapRB=True, crop=False)
net.setInput(blob)

# Run YOLO model
layer_outputs = net.forward(output_layers)

# Process detections.
# In each detection row the first four values are the box centre and size as
# fractions of the image (they are scaled by the image size below); values from
# index 5 onward are the per-class scores.
boxes, confidences, class_ids = [], [], []

for output in layer_outputs:
    for detection in output:
        scores = detection[5:]
        class_id = int(np.argmax(scores))  # plain int so it is safe as a list index
        confidence = float(scores[class_id])

        if confidence > 0.5:
            center_x, center_y, w, h = (detection[:4] * np.array([width, height, width, height])).astype("int")
            # Convert centre/size to the top-left corner expected by cv2.rectangle.
            x, y = int(center_x - w / 2), int(center_y - h / 2)

            boxes.append([x, y, int(w), int(h)])
            confidences.append(confidence)
            class_ids.append(class_id)

# Apply Non-Maximum Suppression (NMS) to drop redundant overlapping boxes:
# score_threshold=0.5 -> only boxes with confidence above 0.5 are considered.
# nms_threshold=0.4   -> a box is suppressed when its IoU with a higher-scoring
#                        kept box exceeds 0.4.
indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)

# Draw the surviving bounding boxes.
# Depending on the OpenCV version NMSBoxes returns an Nx1 array, a flat array,
# or an empty tuple; np.array(...).flatten() normalises all three.
for i in np.array(indices).flatten():
    x, y, w, h = boxes[i]
    label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
    color = (0, 255, 0)  # Green for bounding boxes
    cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
    # Keep the label inside the image when the box touches the top edge.
    cv2.putText(image, label, (x, max(y - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Show the image (OpenCV stores BGR, matplotlib expects RGB)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis('off')  # Hide axis
plt.show()

Que: Why is NMS important?

Ans- Non-Maximum Suppression (NMS) is essential in object detection to eliminate redundant and overlapping bounding boxes. During detection, multiple bounding boxes may be predicted for a single object, often with different confidence scores. NMS helps by keeping the box with the highest confidence score and suppressing others that have a significant overlap (usually defined by the Intersection over Union, IoU threshold). This ensures that each object is detected with only one bounding box, reducing false positives and improving the precision and efficiency of the detection system. Without NMS, multiple detections for the same object can degrade the model’s performance.

## Que3 Training on Custom dataset

In [None]:
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5 
!pip install -r requirements.txt

In [None]:
import yaml

# Parse the dataset description file and print it for a quick visual check.
with open("/kaggle/input/yaml-file/data.yaml", 'r') as yaml_file:
    dataset_yaml = yaml.safe_load(yaml_file)

print(dataset_yaml)

In [None]:
# Train the model.
# Starts from the yolov5s.pt checkpoint and trains on the dataset described by
# data.yaml with 640-pixel images, batch size 8 and 100 epochs, caching images
# and using device 0.

!python train.py --img 640 --batch 8 --epochs 100 --data /kaggle/input/yaml-file/data.yaml --weights yolov5s.pt --cache --device 0

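F1 score (used here as the accuracy measure): $F_1 = \dfrac{2 \cdot p \cdot r}{p + r}$, where $p$ is precision and $r$ is recall.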

In [None]:
# Evaluate the trained weights on the dataset described by data.yaml.
# NOTE(review): the run directory `exp5` is hard-coded; confirm it matches the directory the training run actually wrote to.
!python val.py --weights runs/train/exp5/weights/best.pt --data /kaggle/input/yaml-file/data.yaml --img 640


In [None]:
import cv2
import torch
from matplotlib import pyplot as plt
from PIL import Image

# Run YOLOv5 detection
!python detect.py --weights runs/train/exp5/weights/best.pt --img 640 --conf 0.5 --source /kaggle/input/yolov5-custom-dataset/test/images/cat_864_jpg.rf.a511d6f078a24ff41971e84a8a5d9922.jpg

# Load and display the output image
result_path = "runs/detect/exp/cat_864_jpg.rf.a511d6f078a24ff41971e84a8a5d9922.jpg"

# Read and display the image using OpenCV
image = cv2.imread(result_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for proper display

# Display using Matplotlib
plt.figure(figsize=(8, 6))
plt.imshow(image)
plt.axis("off")  # Hide axes
plt.show()


## Que 4 On live video feed

In [None]:
import cv2
import torch
from yolov5 import detect  # Ensure YOLOv5 is in your working directory

# Load YOLOv5 model
model_path = "runs/train/exp5/weights/best.pt"  # Path to your trained weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # currently unused below

# Fixed output location for the annotated frame. detect.run() otherwise writes
# each call to a new auto-incremented directory (exp, exp2, ...), so the file
# read back inside the loop would never be the current frame.
output_dir = "runs/detect"
run_name = "exp"
result_path = f"{output_dir}/{run_name}/temp.jpg"

# Open webcam (0 for default webcam)
cap = cv2.VideoCapture(0)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Exit if no frame is captured

        # Save the frame temporarily so detect.run() can read it as its source
        cv2.imwrite("temp.jpg", frame)

        # Run YOLO detection, overwriting the same output directory every frame
        detect.run(
            weights=model_path,
            source="temp.jpg",
            conf_thres=0.5,
            save_txt=False,
            save_conf=False,
            project=output_dir,
            name=run_name,
            exist_ok=True,
        )

        # Load the processed image with detections; fall back to the raw frame
        # if the annotated file could not be read (imread returns None).
        result_img = cv2.imread(result_path)
        if result_img is None:
            result_img = frame
        cv2.imshow("YOLOv5 Live Detection", result_img)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release resources even if detection raises mid-loop
    cap.release()
    cv2.destroyAllWindows()


Que 5

In [None]:
import torch
import torchvision
from torchvision.transforms import functional as F
import cv2
import matplotlib.pyplot as plt
from PIL import Image

# Load Faster R-CNN pre-trained model and switch it to evaluation (inference) mode
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# COCO class names, used below to turn a predicted label id into a readable
# name by list index. Index 0 is the '__background__' entry.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
    'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe',
    'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 'toilet',
    'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Load and preprocess image
image_path = "/kaggle/input/yolov5-custom-dataset/test/images/cat_910_jpg.rf.c61d1b561b0da07cf271edecd4b6f6d5.jpg"
image = Image.open(image_path).convert("RGB")
image_tensor = F.to_tensor(image).unsqueeze(0)  # Convert to tensor and add batch dimension

# Perform inference (no gradients needed)
with torch.no_grad():
    predictions = model(image_tensor)

# Extract detections for the single image in the batch
boxes = predictions[0]['boxes']
labels = predictions[0]['labels']
scores = predictions[0]['scores']

# Load the same image with OpenCV for drawing, converted to RGB for matplotlib
image_cv = cv2.imread(image_path)
if image_cv is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")
image_cv = cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB)

# Draw detections on image
for box, label_id, score in zip(boxes, labels, scores):
    confidence = score.item()
    if confidence <= 0.5:  # Confidence threshold
        continue

    # Convert to plain Python ints: some OpenCV builds reject numpy integer
    # scalars as point coordinates.
    x1, y1, x2, y2 = (int(v) for v in box.tolist())
    label = COCO_INSTANCE_CATEGORY_NAMES[int(label_id)]  # Convert label index to name

    # Draw bounding box and caption
    cv2.rectangle(image_cv, (x1, y1), (x2, y2), (255, 0, 0), 2)
    cv2.putText(image_cv, f"{label}: {confidence:.2f}", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

# Show the image with detections
plt.figure(figsize=(8, 6))
plt.imshow(image_cv)
plt.axis("off")
plt.show()


# Que 6

In [None]:
!pip install torch torchvision pycocotools

In [None]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.datasets import CocoDetection
from torchvision import transforms
import os


In [None]:
# Loading the dataset

class CustomCocoDataset(CocoDetection):
    """COCO-format detection dataset that yields Faster R-CNN style targets.

    Each item is ``(image, target)`` where ``target`` is a dict with:
      * ``"boxes"``  - float32 tensor of shape (N, 4) in [xmin, ymin, xmax, ymax]
      * ``"labels"`` - int64 tensor of shape (N,) holding the COCO category ids

    Args:
        img_folder: Directory containing the images.
        ann_file: Path to the COCO annotation JSON file.
        transforms: Optional callable applied to the image only.
    """

    def __init__(self, img_folder, ann_file, transforms=None):
        # Hand the image-only transform to the parent as `transform`.
        # Assigning it to `self.transforms` would overwrite the parent's joint
        # (image, target) transform hook, which is called with two arguments.
        super().__init__(img_folder, ann_file, transform=transforms)

    def __getitem__(self, idx):
        # The parent returns the (already transformed) image and the raw list
        # of COCO annotation dicts for that image.
        img, annotations = super().__getitem__(idx)

        boxes, labels = [], []
        for ann in annotations:
            # COCO stores boxes as [x, y, width, height]; the detector expects corners.
            x, y, w, h = ann["bbox"]
            if w <= 0 or h <= 0:
                continue  # degenerate boxes are rejected by the detection model
            boxes.append([x, y, x + w, y + h])
            # NOTE(review): assumes category ids already lie in [1, num_classes - 1] - confirm against the annotation file.
            labels.append(ann["category_id"])

        target = {
            # reshape(len(boxes), 4) keeps the (0, 4) shape for images without objects
            "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
        }
        return img, target

# Image preprocessing pipeline: a single ToTensor step.
transform = transforms.Compose([transforms.ToTensor()])

# Training split of the cat/dog dataset in COCO format.
train_dataset = CustomCocoDataset(
    '/kaggle/input/catanddog-dataset-coco-format/train/images',
    '/kaggle/input/catanddog-dataset-coco-format/train/_annotations.coco.json',
    transforms=transform,
)

# The collate function regroups a batch of (image, target) pairs into a tuple
# of images and a tuple of targets instead of stacking them.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch)),
)


In [None]:
# Start from the pre-trained Faster R-CNN (ResNet-50 FPN) detector
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the box-classification head with one sized for this dataset:
# background + cat + dog
num_classes = 3
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)



In [None]:
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        # Move every tensor in each target dict to the model's device; leaving
        # them on the CPU causes a device-mismatch error when training on GPU.
        targets = [
            {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in t.items()}
            for t in targets
        ]

        # In training mode the detection model returns a dict of losses
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

print("Training complete! ✅")


Faster R-CNN requires images along with bounding box annotations in Pascal VOC (XML) or COCO (JSON) format. Since I am using YOLO format (TXT), I need to convert it to the required format.

In [None]:
import xml.etree.ElementTree as ET

def yolo_to_voc(yolo_label, img_width, img_height):
    """Convert a normalised YOLO box to absolute Pascal VOC corner coordinates.

    Args:
        yolo_label: Sequence of numbers or numeric strings, either the full
            YOLO row ``[class_id, x_center, y_center, width, height]`` or just
            the four box values ``[x_center, y_center, width, height]``. Box
            values are fractions of the image size.
        img_width: Image width in pixels.
        img_height: Image height in pixels.

    Returns:
        Tuple ``(xmin, ymin, xmax, ymax)`` of ints in pixels.

    Raises:
        ValueError: If ``yolo_label`` does not have 4 or 5 values or a value
            is not numeric.
    """
    values = [float(v) for v in yolo_label]
    if len(values) not in (4, 5):
        raise ValueError(
            f"Expected 4 or 5 values in a YOLO label, got {len(values)}"
        )

    # The box is always the last four values; a leading class id is ignored.
    x_center, y_center, width, height = values[-4:]

    # Scale the normalised values to pixels
    x_center *= img_width
    y_center *= img_height
    width *= img_width
    height *= img_height

    # Centre/size -> corners (truncated to whole pixels)
    xmin = int(x_center - width / 2)
    ymin = int(y_center - height / 2)
    xmax = int(x_center + width / 2)
    ymax = int(y_center + height / 2)

    return xmin, ymin, xmax, ymax


Now, let’s define a Custom Dataset Class for Faster R-CNN:

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Detection dataset backed by .jpg images and YOLO-format .txt label files.

    Each item is ``(image, target)`` where ``target`` is a dict with:
      * ``"boxes"``  - float32 tensor of shape (N, 4) in [xmin, ymin, xmax, ymax] pixels
      * ``"labels"`` - int64 tensor of shape (N,) with the YOLO class id shifted by +1

    Args:
        image_dir: Directory containing the ``.jpg`` images.
        label_dir: Directory containing one ``.txt`` label file per image,
            with the same base name as the image.
        transforms: Optional callable applied to the image only.
    """

    def __init__(self, image_dir, label_dir, transforms=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transforms = transforms
        # Sorted so that an index always refers to the same image across runs
        # (os.listdir returns entries in arbitrary order).
        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))
        # Swap only the final extension; str.replace would also rewrite any
        # earlier '.jpg' occurring inside the file name.
        self.label_files = [os.path.splitext(f)[0] + '.txt' for f in self.image_files]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        # Load image
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        img = Image.open(img_path).convert("RGB")
        img_width, img_height = img.size

        # Load annotations
        label_path = os.path.join(self.label_dir, self.label_files[idx])
        boxes = []
        labels = []

        # An image without a label file is treated as containing no objects.
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    label_data = line.strip().split()
                    if len(label_data) != 5:
                        continue  # skip blank or malformed lines

                    # Shift by one: label 0 is reserved for the background class
                    class_id = int(label_data[0]) + 1
                    # Pass the full 5-value row (class id + box) to the converter
                    xmin, ymin, xmax, ymax = yolo_to_voc(label_data, img_width, img_height)
                    if xmax <= xmin or ymax <= ymin:
                        continue  # degenerate boxes are rejected by the detection model

                    boxes.append([xmin, ymin, xmax, ymax])
                    labels.append(class_id)

        # reshape(len(boxes), 4) keeps the (0, 4) shape for images without objects
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4)
        labels = torch.tensor(labels, dtype=torch.int64)
        target = {"boxes": boxes, "labels": labels}

        if self.transforms:
            img = self.transforms(img)

        return img, target

# Apply image transformations
transform = transforms.Compose([transforms.ToTensor()])

# Load datasets
train_dataset = CustomDataset("/kaggle/input/yolov5-custom-dataset/train/images", "/kaggle/input/yolov5-custom-dataset/train/labels", transforms=transform)
val_dataset = CustomDataset("/kaggle/input/yolov5-custom-dataset/valid/images", "/kaggle/input/yolov5-custom-dataset/valid/labels", transforms=transform)

# DataLoader is referenced through torch.utils.data because the notebook never
# imports the bare name. The collate function regroups a batch of
# (image, target) pairs into a tuple of images and a tuple of targets.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))


In [None]:
# Loading faster rcnn model 

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Start from the pre-trained Faster R-CNN (ResNet-50 FPN) detector
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the box-classification head with one sized for this dataset:
# background, cat, dog
num_classes = 3
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


In [None]:
# Training

import torch.optim as optim

# SGD with momentum and weight decay over all model parameters
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)


def train_model(model, train_loader, val_loader, num_epochs=10):
    """Train a detection model and print the summed loss of every epoch.

    Uses the module-level ``optimizer`` and ``device``.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)`` in training mode.
        train_loader: Iterable yielding ``(images, targets)`` batches.
        val_loader: Accepted for interface compatibility; not used.
        num_epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch_idx in range(num_epochs):
        running_loss = 0
        for batch_images, batch_targets in train_loader:
            # Move the whole batch to the training device
            batch_images = [img.to(device) for img in batch_images]
            batch_targets = [
                {key: value.to(device) for key, value in tgt.items()}
                for tgt in batch_targets
            ]

            # Total loss is the sum of the individual loss terms
            losses = model(batch_images, batch_targets)
            loss = sum(losses.values())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch_idx+1}/{num_epochs}], Loss: {running_loss:.4f}")


train_model(model, train_loader, val_loader, num_epochs=10)


In [None]:
# NOTE(review): this is a standalone debugging sketch of the label-parsing loop.
# It is defined at module level (not inside a class) and the `...` stands in for
# omitted setup, so `label_path`, `img_width`, `img_height`, `boxes` and `labels`
# are not defined within this function as written.
def __getitem__(self, idx):
    ...
    with open(label_path, "r") as f:
        for line in f:
            label_data = line.strip().split()
            print(f"DEBUG: Label data -> {label_data}")  # Debug print

            # A label row is expected to have exactly 5 whitespace-separated fields
            if len(label_data) != 5:
                print(f"ERROR: Invalid label format in {label_path}")
                continue  # Skip the faulty label

            # Shift the class id by one and convert the remaining four fields to corner coordinates
            class_id = int(label_data[0]) + 1
            xmin, ymin, xmax, ymax = yolo_to_voc(label_data[1:], img_width, img_height)
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_id)


## End! Thank You....