In [1]:
import tensorflow as tf
from utils import bbox_utils, data_utils, drawing_utils, io_utils, train_utils, eval_utils
from models.decoder import get_decoder_model
from helper import label_generator
from models.ssd_mobilenet_v2 import get_model, init_model

import os
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pandas as pd

# Run configuration shared by the cells below.
batch_size = 8  # used both for padded_batch and for computing the number of predict steps
backbone = 'mobilenet_v2'
from_folder = True  # True: read test images from a folder; False: read them from the test TFRecord
# NOTE(review): the meaning of the second positional argument (True) is defined in
# train_utils.get_hyper_params and is not visible here — confirm.
hyper_params = train_utils.get_hyper_params(backbone, True)

In [2]:
# TFRecord with the test split; only read when from_folder is False.
test_tfrecord_path = r"F:\Minor Data Collection\Final Image Data\Monument Original 512\Augmented 512 v1\test_aug_bg_512_v4.tfrecord"
# Weights file that is loaded into the SSD model further down.
trained_model = "ssd_mobilenet_v2_model_weights.h5"
model_path = os.path.join("./Trained Models/","Trained_Instance_512_all_1", trained_model)

In [3]:
# Feature schema handed to tf.io.parse_single_example for every serialized record.
# Scalar features (height, width, filename, encoded image) are fixed-length; the
# per-object features are declared as sequences with allow_missing=True, so their
# length may differ from record to record.
image_feature_description = {
    'image/height': tf.io.FixedLenFeature(shape = (), dtype = np.int64),
    'image/width' : tf.io.FixedLenFeature(shape = (), dtype = np.int64),
    'image/filename' : tf.io.FixedLenFeature(shape = (), dtype = tf.string),
    'image/encoded' : tf.io.FixedLenFeature(shape = (), dtype = tf.string),
    'image/object/bbox/xmin': tf.io.FixedLenSequenceFeature(shape = (), dtype = np.float32, allow_missing = True),
    'image/object/bbox/xmax': tf.io.FixedLenSequenceFeature(shape = (), dtype = np.float32, allow_missing = True),
    'image/object/bbox/ymin': tf.io.FixedLenSequenceFeature(shape = (), dtype = np.float32, allow_missing = True),
    'image/object/bbox/ymax': tf.io.FixedLenSequenceFeature(shape = (), dtype = np.float32, allow_missing = True),
    'image/object/class/text':tf.io.FixedLenSequenceFeature(shape = (), dtype = tf.string, allow_missing = True),
    'image/object/class/label':tf.io.FixedLenSequenceFeature(shape = (), dtype = np.int64, allow_missing = True)
}

def _parse_data(unparsed_example):
    """Parse one serialized record into a dict of tensors.

    Args:
        unparsed_example: a single serialized example, as yielded by the
            TFRecord dataset.

    Returns:
        The dict produced by tf.io.parse_single_example, keyed by the names
        declared in image_feature_description.
    """
    return tf.io.parse_single_example(unparsed_example, image_feature_description)

def _bytestring(parsed_example):
    """Decode the encoded image and repackage a parsed record.

    Args:
        parsed_example: dict returned by `_parse_data`; the keys read here are
            'image/encoded', the four 'image/object/bbox/*' coordinates and
            'image/object/class/label'.

    Returns:
        dict with
            'image'   - the decoded image reshaped to [512, 512, 3]
            'objects' - {'bbox': coordinates stacked on the last axis in
                         (ymin, xmin, ymax, xmax) order,
                         'label': the class ids from the record}
    """
    # channels=3 forces a three-channel result and expand_animations=False keeps
    # the tensor 3-D (animated formats would otherwise decode to 4-D), so the
    # reshape below only has to pin the static shape.
    image = tf.io.decode_image(
        parsed_example['image/encoded'], channels=3, expand_animations=False)
    image = tf.reshape(image, [512, 512, 3])

    bbox = tf.stack(
        [
            parsed_example['image/object/bbox/ymin'],
            parsed_example['image/object/bbox/xmin'],
            parsed_example['image/object/bbox/ymax'],
            parsed_example['image/object/bbox/xmax'],
        ],
        axis=-1,
    )

    return {
        'image': image,
        'objects': {
            'bbox': bbox,
            'label': parsed_example['image/object/class/label'],
        },
    }

def get_dataset(path):
    """Load a TFRecord file as a dataset of decoded examples.

    Args:
        path: location of the TFRecord file, passed to tf.data.TFRecordDataset.

    Returns:
        (dataset, size_info): the dataset after `_parse_data` and `_bytestring`
        have been mapped over it, and the number of records in the file.
    """
    records = tf.data.TFRecordDataset(path)
    # Count the raw serialized records: both maps below are one-to-one, so the
    # count is the same, but this avoids decoding every image just to count it.
    size_info = records.reduce(0, lambda count, _: count + 1).numpy()
    dataset = records.map(_parse_data).map(_bytestring)
    return dataset, size_info

In [4]:
# Class names; the list position is the class id. Index 0 is 'bg', and the
# drawing cells look names up with labels[int(predicted_label)].
labels = ['bg', 'badrinath temple', 'basantapur tower', 'bhagavati temple', 'bhairavnath temple', 'bhaktapur tower', 'bhimeleshvara', 'bhimsen temple', 'bhupatindra malla column', 'bhuvana lakshmeshvara', 'chasin dega', 'chayasilin mandap', 'dattatreya temple', 'degu tale temple_KDS', 'fasidega temple', 'gaddi durbar', 'garud', 'golden gate', 'gopinath krishna temple', 'hanuman idol', 'indrapura', 'jagannatha temple', 'kala-bhairava', 'kasthamandap', 'kavindrapura sattal', 'kedamatha tirtha', 'kirtipur tower', 'kumari ghar', 'lalitpur tower', 'mahadev temple', 'narayan temple', 'national gallery', 'nyatapola temple', 'palace of the 55 windows', 'panchamukhi hanuman', 'pratap malla column', 'shiva temple', 'shveta bhairava', 'siddhi lakshmi temple', 'simha sattal', 'taleju bell_BDS', 'taleju bell_KDS', 'taleju temple', 'trailokya mohan', 'vastala temple', 'vishnu temple', 'bhimsen temple_PDS', 'char narayan temple', 'chyasim deval', 'garud statue', 'harishankar temple', 'krishna mandir', 'mani ganesh temple', 'mani mandap', 'royal palace_PDS', 'taleju bell_PDS', 'taleju temple north', 'taleju temple south', 'vishwanath temple', 'yognarendra malla statue']

# The label count (including 'bg') is stored in hyper_params before the model is built.
hyper_params["total_labels"] = len(labels)
img_size = hyper_params["img_size"]

# Element types, shapes and padding values come from data_utils and are reused
# for the generator-backed dataset and for padded_batch.
data_types = data_utils.get_data_types()
data_shapes = data_utils.get_data_shapes()
padding_values = data_utils.get_padding_values()

if from_folder:
    # Folder mode: images are listed once and fed through a Python generator.
    # NOTE(review): this is the same directory as JPEG_DIR defined in a later cell.
    img_paths = data_utils.get_custom_imgs(r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Examples\JPEGImages")
    total_items = len(img_paths)
    test_data = tf.data.Dataset.from_generator(lambda: data_utils.custom_data_generator(
                                           img_paths, img_size, img_size), data_types, data_shapes)
else:
    # TFRecord mode: get_dataset also returns the record count, which becomes total_items.
    test_data, size_info = get_dataset(test_tfrecord_path)
    total_items = size_info
    test_data = test_data.map(lambda x : data_utils.preprocessing(x, img_size, img_size))
    
# Both modes end up batched and padded identically before prediction.
test_data = test_data.padded_batch(batch_size, padded_shapes=data_shapes, padding_values=padding_values)

In [5]:
# Build the SSD model, load the trained weights, and build the decoder model
# that is used for prediction in the next cell.
ssd_model = get_model(hyper_params)
# NOTE(review): ssd_model_path is not read again in the visible cells; the weights
# are loaded from model_path. Kept because get_model_path may have side effects — confirm.
ssd_model_path = io_utils.get_model_path(backbone)
ssd_model.load_weights(model_path)
# Prior boxes are generated from the feature-map shapes and aspect ratios in hyper_params.
prior_boxes = bbox_utils.generate_prior_boxes(hyper_params["feature_map_shapes"], hyper_params["aspect_ratios"])
ssd_decoder_model = get_decoder_model(ssd_model, prior_boxes, hyper_params)



In [6]:
# Number of predict() steps for total_items at the configured batch size
# (computed by train_utils.get_step_size — presumably a ceiling division; confirm).
step_size = train_utils.get_step_size(total_items, batch_size)
# The prediction result is unpacked into boxes, labels and scores; the drawing
# cells below index each of them by image position.
pred_bboxes, pred_labels, pred_scores = ssd_decoder_model.predict(test_data, steps=step_size, verbose=1)



In [7]:
# drawing_utils.draw_predictions(test_data, pred_bboxes, pred_labels, pred_scores, labels, batch_size)

## First draw bounding boxes on the original images

In [7]:
import xml.etree.ElementTree as ET
from PIL import Image
import cv2
from tqdm import tqdm

In [6]:
# Test images, their annotation XML files (matched by file stem), and the folder
# that receives the ground-truth renderings (*_ORI.jpg / *_BG_ORI.jpg).
JPEG_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Examples\JPEGImages"
ANNO_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Examples\Annotations"
OUTPUT_IMG_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Results"

In [9]:
# Draw the annotated (ground-truth) boxes of every test image and save the result
# to OUTPUT_IMG_DIR as <stem>_ORI.jpg, or as <stem>_BG_ORI.jpg when the annotation
# lists no <object> element (presumably a background-only image).
for image_file in tqdm(os.listdir(JPEG_DIR)):
    file_stem = image_file.split('.')[0]

    # cv2.imread does not raise on failure, it returns None. Skip entries that
    # are not decodable images instead of failing later inside a drawing call.
    img = cv2.imread(os.path.join(JPEG_DIR, image_file))
    if img is None:
        tqdm.write(f"Skipping {image_file}: OpenCV could not read it as an image")
        continue

    corr_xml_file = os.path.join(ANNO_DIR, file_stem + '.xml')
    root = ET.parse(corr_xml_file).getroot()
    objects = root.findall('object')

    for member in objects:
        # Fields are looked up by tag name rather than by child position, so the
        # result does not depend on the order in which the tool wrote them.
        # Assumes Pascal VOC tag names (name, bndbox/xmin|ymin|xmax|ymax).
        class_name = member.find('name').text
        bndbox = member.find('bndbox')
        x1 = int(bndbox.find('xmin').text)
        y1 = int(bndbox.find('ymin').text)
        x2 = int(bndbox.find('xmax').text)
        y2 = int(bndbox.find('ymax').text)

        # draw the bounding box on the image given
        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), thickness = 2)

        (text_width, text_height), baseline = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.50, 1)

        # Label strip: a filled black rectangle just above the box's top-left corner.
        rect_x1 = x1
        rect_y1 = y1 - 15
        rect_x2 = x1 + text_width
        rect_y2 = y1 + 2

        # Slide the strip left when the text would run past x = 512.
        if rect_x2 > 512:
            rect_x1 -= (rect_x2 - 512)

        # Move strip and text baseline down when the strip would start above the image.
        if rect_y1 < 0:
            corr_factor = 15 - rect_y1
            rect_y1 += corr_factor
            rect_y2 += corr_factor
            y1 += corr_factor

        cv2.rectangle(img, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 0, 0), thickness = -1)
        cv2.putText(img, class_name, (rect_x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.50, (255, 255, 255), 1, cv2.LINE_AA)

    # Same output file names as before: the suffix encodes whether any object was annotated.
    out_suffix = '_ORI.jpg' if len(objects) > 0 else '_BG_ORI.jpg'
    cv2.imwrite(os.path.join(OUTPUT_IMG_DIR, file_stem + out_suffix), img)
        

100%|████████████████████████████████████████████████████████████████████████████████| 129/129 [00:01<00:00, 67.40it/s]


## Draw bounding boxes on the test images with the model output

In [13]:
import random

In [10]:
# Destination of the "<stem>_JOINED.jpg" images (ground truth next to the SSD prediction).
FINAL_OUT_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Offline Model Joined"

In [11]:
# Rebuild the folder-backed dataset WITHOUT batching, so the drawing loops can
# iterate image by image; element i is matched to img_paths[i] and to row i of
# the prediction arrays.
# NOTE(review): the predictions come from an earlier cell; they only line up with
# this dataset if that cell also ran in folder mode on the same directory — confirm.
img_paths = data_utils.get_custom_imgs(r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Examples\JPEGImages")
total_items = len(img_paths)
test_data = tf.data.Dataset.from_generator(lambda: data_utils.custom_data_generator(
                                       img_paths, img_size, img_size), data_types, data_shapes)

In [14]:
# For every test image: draw the SSD predictions, put the result next to the
# ground-truth rendering saved in OUTPUT_IMG_DIR, and write "<stem>_JOINED.jpg"
# to FINAL_OUT_DIR. Each panel gets a 40 px white header with a caption and the
# two panels are separated by a 50 px white strip.

# Loop-invariant white canvases (vconcat/hconcat copy their inputs, and the
# captions are drawn on the concatenated copies, so these are never modified).
upper_deck = np.full((40, 512, 3), 255, dtype = np.uint8)
bridge = np.full((552, 50, 3), 255, dtype = np.uint8)

for index, ele in tqdm(enumerate(test_data)):
    # The dataset image is scaled by 255 and converted RGB -> BGR for OpenCV.
    image_array = np.uint8(ele[0].numpy() * 255)
    image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)

    # os.path.basename instead of splitting on '\\', which only works for Windows paths.
    image_name = os.path.basename(img_paths[index])
    image_stem = image_name.split('.')[0]

    for count, bbox in enumerate(pred_bboxes[index]):
        # Rows whose four coordinates are all <= 0 are skipped (presumably padding).
        if bbox[0] > 0 or bbox[1] > 0 or bbox[2] > 0 or bbox[3] > 0:
            # Boxes are (ymin, xmin, ymax, xmax) fractions, scaled to 512 px.
            y1 = int(bbox[0] * 512)
            x1 = int(bbox[1] * 512)
            y2 = int(bbox[2] * 512)
            x2 = int(bbox[3] * 512)
            cv2.rectangle(image_array, (x1, y1), (x2, y2), (0, 0, 255), thickness = 2)

            # Show the score exactly as the model returned it. The previous code
            # subtracted a random 0.05-0.11 from it, which misreported the
            # confidence and made the rendered images non-reproducible.
            class_name = labels[int(pred_labels[index][count])]
            class_name += f" {pred_scores[index][count]:.3f}"

            (text_width, text_height), baseline = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.50, 1)

            # Label strip just above the box's top-left corner.
            rect_x1 = x1
            rect_y1 = y1 - 15
            rect_x2 = x1 + text_width
            rect_y2 = y1 + 2

            # Slide the strip left when the text would run past x = 512.
            if rect_x2 > 512:
                rect_x1 -= (rect_x2 - 512)

            # Move strip and text baseline down when the strip would start above the image.
            if rect_y1 < 0:
                corr_factor = 15 - rect_y1
                rect_y1 += corr_factor
                rect_y2 += corr_factor
                y1 += corr_factor

            cv2.rectangle(image_array, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 0, 0), thickness = -1)
            cv2.putText(image_array, class_name, (rect_x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.50, (255, 255, 255), 1, cv2.LINE_AA)

    # Ground-truth rendering: try <stem>_ORI.jpg first, then <stem>_BG_ORI.jpg.
    # cv2.imread returns None for a missing file, so the fallback is an explicit
    # check rather than a bare except around vconcat.
    corr_ori_img = cv2.imread(os.path.join(OUTPUT_IMG_DIR, image_stem + '_ORI.jpg'))
    if corr_ori_img is None:
        corr_ori_img = cv2.imread(os.path.join(OUTPUT_IMG_DIR, image_stem + '_BG_ORI.jpg'))
    if corr_ori_img is None:
        tqdm.write(f"Skipping {image_name}: no ground-truth rendering found in OUTPUT_IMG_DIR")
        continue
    corr_ori_img = cv2.vconcat([upper_deck, corr_ori_img])

    # Caption centred in the 512 px wide header of the ground-truth panel.
    ori_display_text = "Original Annotated Image"
    (text_width, text_height), baseline = cv2.getTextSize(ori_display_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(corr_ori_img, ori_display_text, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    # Same header and caption treatment for the prediction panel.
    image_array = cv2.vconcat([upper_deck, image_array])
    pred_display_text = "MobileNetV2 SSDLite Model Prediction"
    (text_width, text_height), baseline = cv2.getTextSize(pred_display_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(image_array, pred_display_text, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    hconcat_img = cv2.hconcat([corr_ori_img, bridge, image_array])

    cv2.imwrite(os.path.join(FINAL_OUT_DIR, image_stem + '_JOINED.jpg'), hconcat_img)

146it [00:06, 23.06it/s]


## Join Original Image and YOLOv5 Output Images

In [8]:
# ORI_IMG_BBOX: ground-truth renderings (same folder as OUTPUT_IMG_DIR above).
# YOLO_IMG_BBOX: images written by the YOLOv5 detect run that is being compared.
# OUTPUT_JOINED_DIR: destination of the ground-truth vs. YOLOv5 "<stem>_JOINED.jpg" images.
ORI_IMG_BBOX = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Test Results"
YOLO_IMG_BBOX = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\YOLO Trained\yolov5-all-dataset\runs\detect\exp9"
OUTPUT_JOINED_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\Online Model Joined"

In [9]:
# For every image in the YOLOv5 output folder: put it next to the matching
# ground-truth rendering from ORI_IMG_BBOX and write "<stem>_JOINED.jpg" to
# OUTPUT_JOINED_DIR. Each panel gets a 40 px white header with a caption and the
# two panels are separated by a 50 px white strip.

# Loop-invariant white canvases (vconcat/hconcat copy their inputs, and the
# captions are drawn on the concatenated copies, so these are never modified).
upper_deck = np.full((40, 512, 3), 255, dtype = np.uint8)
bridge = np.full((552, 50, 3), 255, dtype = np.uint8)

for image_file in tqdm(os.listdir(YOLO_IMG_BBOX)):
    # cv2.imread returns None for anything it cannot decode (sub-directories,
    # text files, ...); skip those entries instead of failing inside vconcat.
    yolo_img = cv2.imread(os.path.join(YOLO_IMG_BBOX, image_file))
    if yolo_img is None:
        tqdm.write(f"Skipping {image_file}: OpenCV could not read it as an image")
        continue

    image_stem = image_file.split('.')[0]

    # Ground-truth rendering: <stem>_ORI.jpg if present, otherwise <stem>_BG_ORI.jpg.
    # The variable is reset on every iteration so that a missing file can never
    # silently reuse the image loaded for the previous one.
    ori_bbox_img = None
    for suffix in ('_ORI.jpg', '_BG_ORI.jpg'):
        candidate = os.path.join(ORI_IMG_BBOX, image_stem + suffix)
        if os.path.exists(candidate):
            ori_bbox_img = cv2.imread(candidate)
            break
    if ori_bbox_img is None:
        tqdm.write(f"Skipping {image_file}: no ground-truth rendering found in ORI_IMG_BBOX")
        continue

    ori_bbox_img = cv2.vconcat([upper_deck, ori_bbox_img])

    # Caption centred in the 512 px wide header of the ground-truth panel.
    ori_display_text = "Original Annotated Image"
    (text_width, text_height), baseline = cv2.getTextSize(ori_display_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(ori_bbox_img, ori_display_text, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    # Same header and caption treatment for the YOLOv5 panel.
    yolo_img = cv2.vconcat([upper_deck, yolo_img])
    pred_display_text = "YOLOv5s Model Prediction"
    (text_width, text_height), baseline = cv2.getTextSize(pred_display_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(yolo_img, pred_display_text, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    hconcat_img = cv2.hconcat([ori_bbox_img, bridge, yolo_img])

    cv2.imwrite(os.path.join(OUTPUT_JOINED_DIR, image_stem + '_JOINED.jpg'), hconcat_img)


100%|████████████████████████████████████████████████████████████████████████████████| 129/129 [00:03<00:00, 34.83it/s]


## MobileNetV2 SSDLite Model Prediction vs YOLOv5s Model Prediction

In [187]:
# Destination of the "<stem>_VS_JOINED.jpg" images (SSD prediction next to YOLOv5 prediction).
ALL_JOINED_DIR = r"C:\Users\parzi\OneDrive - Tribhuvan University\Desktop\Minor Project\Monument Detection with CNN\Monument Object Detection\Assets\JOINED YOLO vs MobileNetv2"

In [190]:
# For every test image: draw the SSD predictions, put the result next to the
# YOLOv5 output image of the same file name, and write "<stem>_VS_JOINED.jpg" to
# ALL_JOINED_DIR. Each panel gets a 40 px white header with a caption and the two
# panels are separated by a 50 px white strip.

# Loop-invariant white canvases (vconcat/hconcat copy their inputs, and the
# captions are drawn on the concatenated copies, so these are never modified).
upper_deck = np.full((40, 512, 3), 255, dtype = np.uint8)
bridge = np.full((552, 50, 3), 255, dtype = np.uint8)

for index, ele in tqdm(enumerate(test_data)):
    # The dataset image is scaled by 255 and converted RGB -> BGR for OpenCV.
    image_array = np.uint8(ele[0].numpy() * 255)
    image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)

    # os.path.basename instead of splitting on '\\', which only works for Windows paths.
    image_name = os.path.basename(img_paths[index])
    image_stem = image_name.split('.')[0]

    for count, bbox in enumerate(pred_bboxes[index]):
        # Rows whose four coordinates are all <= 0 are skipped (presumably padding).
        if bbox[0] > 0 or bbox[1] > 0 or bbox[2] > 0 or bbox[3] > 0:
            # Boxes are (ymin, xmin, ymax, xmax) fractions, scaled to 512 px.
            y1 = int(bbox[0] * 512)
            x1 = int(bbox[1] * 512)
            y2 = int(bbox[2] * 512)
            x2 = int(bbox[3] * 512)
            cv2.rectangle(image_array, (x1, y1), (x2, y2), (0, 0, 255), thickness = 2)

            # Show the score exactly as the model returned it. The previous code
            # subtracted a random 0.05-0.11 from it, which misreported the
            # confidence (in a model-vs-model comparison) and made the rendered
            # images non-reproducible.
            class_name = labels[int(pred_labels[index][count])]
            class_name += f" {pred_scores[index][count]:.3f}"

            (text_width, text_height), baseline = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.50, 1)

            # Label strip just above the box's top-left corner.
            rect_x1 = x1
            rect_y1 = y1 - 15
            rect_x2 = x1 + text_width
            rect_y2 = y1 + 2

            # Slide the strip left when the text would run past x = 512.
            if rect_x2 > 512:
                rect_x1 -= (rect_x2 - 512)

            # Move strip and text baseline down when the strip would start above the image.
            if rect_y1 < 0:
                corr_factor = 15 - rect_y1
                rect_y1 += corr_factor
                rect_y2 += corr_factor
                y1 += corr_factor

            cv2.rectangle(image_array, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 0, 0), thickness = -1)
            cv2.putText(image_array, class_name, (rect_x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.50, (255, 255, 255), 1, cv2.LINE_AA)

    # YOLOv5 output for the same file name. cv2.imread returns None when the file
    # is missing or unreadable; check it here instead of failing inside vconcat.
    corr_yolo_img = cv2.imread(os.path.join(YOLO_IMG_BBOX, image_name))
    if corr_yolo_img is None:
        tqdm.write(f"Skipping {image_name}: no readable YOLOv5 output found in YOLO_IMG_BBOX")
        continue

    # for mobilenetv2 ssdlite prediction
    mobilenetv2_txt = "MobileNetV2 SSDLite Prediction"
    (text_width, text_height), baseline = cv2.getTextSize(mobilenetv2_txt, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    image_array = cv2.vconcat([upper_deck, image_array])
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(image_array, mobilenetv2_txt, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    # for yolov5s model prediction
    corr_yolo_img = cv2.vconcat([upper_deck, corr_yolo_img])
    pred_display_text = "YOLOv5s Prediction"
    (text_width, text_height), baseline = cv2.getTextSize(pred_display_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
    txt_x = 256 - int(text_width / 2)
    txt_y = 28
    cv2.putText(corr_yolo_img, pred_display_text, (txt_x, txt_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (1, 1, 1), 1, cv2.LINE_AA)

    hconcat_img = cv2.hconcat([image_array, bridge, corr_yolo_img])

    cv2.imwrite(os.path.join(ALL_JOINED_DIR, image_stem + '_VS_JOINED.jpg'), hconcat_img)

108it [00:03, 31.94it/s]
