# EE 769 Project: Object Detection using Faster R-CNN

Students: *Kasi Reddy Sreeman Reddy* (190070029), *Jahnavi Devangula* (20d070025) and *Kshitiz Susawat* (19D070030)

In this project we demonstrate how to use TensorFlow to perform object detection on images. It involves building a deep learning model based on the Faster R-CNN architecture and training it on the Pascal VOC dataset. The trained model is then used to make predictions on new images and mark the detected objects with bounding boxes and class labels. The project provides a step-by-step guide on how to prepare the dataset, train the model, and use it for object detection on new images.

Dataset links:
- http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
- http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar

**Note**: Originally our project was "Oriented R-CNN for Object Detection" but due to lack of time we did a simpler one.

In [1]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import cv2
import xml.etree.ElementTree as ET

!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar -xf VOCtrainval_06-Nov-2007.tar
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
!tar -xf VOCtest_06-Nov-2007.tar

# Constants
VOC_DIR = "VOCdevkit/VOC2007"
IMAGE_DIR = os.path.join(VOC_DIR, "JPEGImages")
ANNOTATION_DIR = os.path.join(VOC_DIR, "Annotations")
IMAGE_SETS_DIR = os.path.join(VOC_DIR, "ImageSets/Main")
CLASSES = ["aeroplane", "bicycle", "bird", "boat", "bottle",
           "bus", "car", "cat", "chair", "cow", "diningtable",
           "dog", "horse", "motorbike", "person", "pottedplant",
           "sheep", "sofa", "train", "tvmonitor"]
NUM_CLASSES = len(CLASSES)
BATCH_SIZE = 32

--2023-04-29 18:20:48--  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460032000 (439M) [application/x-tar]
Saving to: ‘VOCtrainval_06-Nov-2007.tar.5’


2023-04-29 18:21:10 (20.2 MB/s) - ‘VOCtrainval_06-Nov-2007.tar.5’ saved [460032000/460032000]

--2023-04-29 18:21:15--  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 451020800 (430M) [application/x-tar]
Saving to: ‘VOCtest_06-Nov-2007.tar.4’


2023-04-29 18:21:35 (22.5 MB/s) - ‘VOCtest_06-Nov-2007.tar.4’ saved [451020800/451020800]



In [2]:
def read_data(split):
    assert split in ["trainval", "test"], f"Invalid split: {split}"

    # Define paths to image and annotation directories
    image_dir = f"VOCdevkit/VOC2007{split}/JPEGImages"
    annotation_dir = f"VOCdevkit/VOC2007{split}/Annotations"

    # Get list of image and annotation filenames
    image_filenames = sorted(os.listdir(image_dir))
    annotation_filenames = sorted(os.listdir(annotation_dir))

    # Get full paths to images and annotations
    image_paths = [os.path.join(image_dir, filename) for filename in image_filenames]
    annotation_paths = [os.path.join(annotation_dir, filename) for filename in annotation_filenames]

    return image_paths, annotation_paths


In [3]:
def preprocess_data(image, annotation):
    # Convert the image to a tensor
    image_tensor = tf.convert_to_tensor(image)
    image_tensor = tf.image.convert_image_dtype(image_tensor, tf.float32)
    image_tensor = tf.image.resize(image_tensor, (IMAGE_SIZE, IMAGE_SIZE))
    
    # Get the bounding boxes and labels from the annotations
    boxes = [ann["bbox"] for ann in annotation]
    labels = [ann["label"] for ann in annotation]
    
    # Convert the boxes and labels to tensors
    box_tensors = [tf.convert_to_tensor(box) for box in boxes]
    label_tensors = [tf.convert_to_tensor(label) for label in labels]
    
    # Convert the tensors to a dictionary
    target = {"bbox": tf.stack(box_tensors), "label": tf.stack(label_tensors)}
    
    return image_tensor, target


In [4]:
def create_model():
    # Load pre-trained VGG-16 model
    base_model = tf.keras.applications.VGG16(include_top=False, weights="imagenet")

    # Create Fast R-CNN model
    inputs = tf.keras.layers.Input(shape=(224, 224, 3))
    x = base_model(inputs)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(4096, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(4096, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    box_outputs = tf.keras.layers.Dense(NUM_CLASSES * 4, activation="linear")(x)
    label_outputs = tf.keras.layers.Dense(NUM_CLASSES + 1, activation="softmax")(x)
    outputs = tf.keras.layers.Concatenate()([box_outputs, label_outputs])
    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    return model


In [5]:
def train_model(model, train_images, train_boxes, train_labels, val_images, val_boxes, val_labels):
    # Create optimizer and loss function
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()

    # Compile model
    model.compile(optimizer=optimizer, loss=loss_fn)

    # Train model
    history = model.fit(
        x=train_images,
        y={"box_outputs": train_boxes, "label_outputs": train_labels},
        validation_data=(val_images, {"box_outputs": val_boxes, "label_outputs": val_labels}),
        batch_size=BATCH_SIZE,
        epochs=10,
    )

    return history


In [6]:
def main():
    # Read data
    train_images, train_annotations = read_data("trainval")
    val_images, val_annotations = read_data("test")

    # Set hyperparameters
    NUM_CLASSES = 20 # The PASCAL VOC 2007 dataset has 20 object classes
    BATCH_SIZE = 16  # You can set this to a suitable value based on your available memory and GPU resources

    # Preprocess data
    train_images, train_boxes, train_labels = preprocess_data(train_images, train_annotations)
    val_images, val_boxes, val_labels = preprocess_data(val_images, val_annotations)

    # Create model
    model = create_model(NUM_CLASSES)

    # Load the weights
    model.load_weights(weights_path)

    # Make predictions on the images
    predictions = model.predict(images)

    # Train model
    history = train_model(model, train_images, train_boxes, train_labels, val_images, val_boxes, val_labels, batch_size=BATCH_SIZE)

    # Plot training history
    plt.plot(history.history["loss"], label="Training Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.legend()
    plt.show()

In [None]:
def load_annotations(annotation_path):
    tree = ET.parse(annotation_path)
    root = tree.getroot()
    
    boxes = []
    classes = []
    
    for obj in root.findall('object'):
        class_name = obj.find('name').text
        xmin = int(obj.find('bndbox/xmin').text)
        ymin = int(obj.find('bndbox/ymin').text)
        xmax = int(obj.find('bndbox/xmax').text)
        ymax = int(obj.find('bndbox/ymax').text)
        boxes.append([xmin, ymin, xmax, ymax])
        classes.append(class_name)
        
    return boxes, classes


# Define the path to the annotations file
annotation_path = 'VOCdevkit/VOC2007/Annotations'

# Load the annotations for each image
annotations = [load_annotations(os.path.join(annotation_path, f'{os.path.splitext(os.path.basename(path))[0]}.xml')) for path in image_paths]

# Load and preprocess each image
images = [preprocess_data(cv2.imread(path), annot) for path, annot in zip(image_paths, annotations)]

# Make predictions on the images
predictions = model.predict(images)

# Process the predictions
boxes, classes = process_predictions(predictions, confidence_threshold=0.5, overlap_threshold=0.5)

# Display each image with the detected objects marked
for path, box, cls in zip(image_paths, boxes, classes):
    display_image_with_boxes(path, box, cls)


In [None]:
def display_image_with_boxes(image_path, boxes, classes):
    # Load image
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    # Create figure and axes
    fig, ax = plt.subplots(1)
    
    # Display the image
    ax.imshow(img)
    
    # Plot each bounding box and class label
    for box, cls in zip(boxes, classes):
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        color = np.random.uniform(0, 1, size=3)
        
        # Add bounding box
        rect = plt.Rectangle((x1, y1), w, h, fill=False, linewidth=2.5, edgecolor=color)
        ax.add_patch(rect)
        
        # Add class label
        text = f"{cls}"
        ax.text(x1, y1, text, fontsize=10, bbox=dict(facecolor=color, alpha=0.5))
    
    # Show the plot
    plt.show()

# Define paths to some sample images
image_paths = ["VOCdevkit/VOC2007/JPEGImages/000001.jpg",
               "VOCdevkit/VOC2007/JPEGImages/000002.jpg",
               "VOCdevkit/VOC2007/JPEGImages/000003.jpg",
               "VOCdevkit/VOC2007/JPEGImages/000004.jpg",
               "VOCdevkit/VOC2007/JPEGImages/000005.jpg"]

# Load and preprocess each image
images = [preprocess_data(cv2.imread(path), annotations) for path in image_paths]

# Make predictions on the images
predictions = model.predict(images)

# Convert the predictions into human-readable labels
boxes, scores, classes = convert_predictions(predictions, ANCHORS, 
                                             NUM_CLASSES, 
                                             input_dims=(IMAGE_HEIGHT, IMAGE_WIDTH))
# Display each image with the detected objects marked
for path, box, cls in zip(image_paths, boxes, classes):
    display_image_with_boxes(path, box, cls)

from PIL import Image, ImageDraw, ImageFont

# Define function to draw boxes and labels on the image
def draw_boxes(image_path, boxes, classes):
    # Load image
    image = Image.open(image_path)
    # Define font
    font = ImageFont.truetype("arial.ttf", 12)
    # Create drawing object
    draw = ImageDraw.Draw(image)
    # Draw boxes and labels on image
    for box, cls in zip(boxes, classes):
        draw.rectangle(box, outline="red")
        draw.text((box[0], box[1] - 12), cls, font=font)
    # Display image
    image.show()

# Display each image with the detected objects marked
for path, box, cls in zip(image_paths, boxes, classes):
    draw_boxes(path, box, cls)