## **Data Preprocessing**

In [0]:
# Download VOC2012 tar file
!wget 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar'

# Unzip the tar file VOCtrainval_11-May-2012.tar,
!tar -xvf 'VOCtrainval_11-May-2012.tar'


In [0]:
# Import packages needed for creation of the dataset and descriptive analysis 

import shutil
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import csv
import os

In [0]:
# Create a subset of images to use, using the images specified in trainval.txt.

vocpath = 'VOCdevkit/VOC2012/'

# Extract the names of the images that will be used for object detection from
# 'VOCdevkit/VOC2012/ImageSets/Layout/trainval.txt'.
# The image name is taken as the first whitespace-separated field of each line,
# which does not depend on how wide the trailing field is.
image_names_tv = list()
with open(vocpath + 'ImageSets/Layout/trainval.txt', 'r', encoding="utf8") as trainval_file:
    for line in trainval_file:
        fields = line.split()
        if fields:  # ignore blank lines
            image_names_tv.append(fields[0])

# Create the directories that store the images and annotations needed.
# exist_ok=True keeps the cell re-runnable.
for directory in ('dataset/images', 'dataset/annotations'):
    os.makedirs(directory, exist_ok=True)

# Copy every selected image and its annotation once, even when the same
# image name appears on several lines of trainval.txt.
for item in dict.fromkeys(image_names_tv):
    shutil.copyfile(vocpath + 'JPEGImages/' + str(item) + '.jpg', 'dataset/images/' + str(item) + '.jpg')
    shutil.copyfile(vocpath + 'Annotations/' + str(item) + '.xml', 'dataset/annotations/' + str(item) + '.xml')

In [0]:
# Create a csv file containing all the annotations for the dataset by extracting them from the xml annotation files.

def _int_field(element, tag):
    """Return child ``tag`` of ``element`` as an int, or None when it is missing or empty."""
    text = element.findtext(tag) if element is not None else None
    if text is None or not text.strip():
        return None
    # Parse through float() so a value written as e.g. "45.7" does not raise;
    # the fractional part is truncated.
    return int(float(text))


def _read_box(bndbox):
    """Return (xmin, ymin, xmax, ymax) of a <bndbox> element, or None if any coordinate is missing."""
    coords = tuple(_int_field(bndbox, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
    return None if None in coords else coords


def xml_to_csv(path, folder):
    """Collect every bounding box found in the XML files of ``path`` into a DataFrame.

    One row is produced for each <object> that has a <name> and a complete
    <bndbox>, and one for each of its <part> children that has a <name> and a
    complete <bndbox>. Elements are recognised by tag, so other children of an
    <object> never produce rows. Entries without a name or with an incomplete
    box are skipped.

    Args:
        path: directory containing the ``*.xml`` annotation files.
        folder: name of the sibling directory (next to ``path``) holding the
            images; it is used to build the ``filename`` column.

    Returns:
        pandas.DataFrame with columns filename, width, height, class, xmin,
        ymin, xmax, ymax. Files are processed in sorted order so the row order
        is reproducible.

    Raises:
        ValueError: if a file that contains boxes lacks <filename> or the
            <size>/<width>/<height> values.
    """
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    # The image directory is a sibling of the annotation directory.
    images_root = os.path.join(os.path.dirname(os.path.normpath(path)), folder)
    xml_list = []
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()

        # Gather (class name, box) pairs in document order.
        entries = []
        for object_ in root.findall('object'):
            for child in object_:
                if child.tag == 'bndbox':
                    name, bndbox = object_.findtext('name'), child
                elif child.tag == 'part':
                    name, bndbox = child.findtext('name'), child.find('bndbox')
                else:
                    continue
                box = _read_box(bndbox)
                if name is not None and box is not None:
                    entries.append((name, box))
        if not entries:
            continue

        # Per-image values are read once per file.
        filename = root.findtext('filename')
        size = root.find('size')
        width = _int_field(size, 'width')
        height = _int_field(size, 'height')
        if filename is None or width is None or height is None:
            raise ValueError('%s is missing <filename> or <size>/<width>/<height>' % xml_file)

        path_name = os.path.join(images_root, filename)
        for name, box in entries:
            xml_list.append((path_name, width, height, name) + box)

    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

def main():
    """Convert the XML annotations in ./dataset/annotations to data/images_labels.csv.

    The output directory 'data' is created when missing, so this function does
    not depend on a separate mkdir step having been run first.
    """
    image_path = os.path.join(os.getcwd(), ('dataset/annotations'))
    folder = 'images'
    xml_df = xml_to_csv(image_path, folder)
    os.makedirs('data', exist_ok=True)
    xml_df.to_csv(('data/' + folder + '_labels.csv'), index=False)
    print('Successfully converted xml to csv.')

# Make sure the output directory exists (no error when the cell is re-run), then convert.
os.makedirs('data', exist_ok=True)
main()

Successfully converted xml to csv.


## **Data Descriptive Analysis**

In [0]:
# Descriptive Analysis:

def _count_in_first_seen_order(values):
    """Return (labels, counts): distinct items of ``values`` in first-seen order and how often each occurs."""
    labels = []
    tally = {}
    for value in values:
        if value not in tally:
            labels.append(value)
            tally[value] = 0
        tally[value] += 1
    return labels, [tally[label] for label in labels]


def _print_class_counts(labels, counts):
    """Print one 'label : count' line per class under a small header."""
    print('Class:  Count')
    print(" ")
    for label, count in zip(labels, counts):
        print(str(label) + ' :', count)


def _print_separator():
    """Print the dashed separator used between the report sections."""
    print(" ")
    print("---------------------------")
    print(" ")


data_csv = pd.read_csv('data/images_labels.csv')

# Find all the different classes and count the number of each (single pass).
class_csv = data_csv.loc[:, "class"]
classes, count_classes = _count_in_first_seen_order(class_csv)
_print_class_counts(classes, count_classes)

# Outputs the labels, person, head, hand, foot, chair, sofa, diningtable and pottedplant.

_print_separator()

# Remove the labels for the classes chair, sofa, diningtable and pottedplant and create a new csv file.
excluded_classes = ['sofa', 'diningtable', 'chair', 'pottedplant']
column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']

df = data_csv.loc[~class_csv.isin(excluded_classes), column_name].reset_index(drop=True)
# Kept as a list of row tuples, as before.
remove_classes_list = list(df.itertuples(index=False, name=None))
df.to_csv(('data/dataset_labels_1.csv'), index=False)


# Count the number of instances of each class in the new csv file:

new_data_csv = pd.read_csv('data/dataset_labels_1.csv')

# These notebook-level Series are read later by people.extract_boxes, so their
# names must stay as they are.
class_csv = new_data_csv.loc[:, "class"]
filenames_csv = new_data_csv.loc[:, "filename"]
xmin = new_data_csv.loc[:, "xmin"]
xmax = new_data_csv.loc[:, "xmax"]
ymin = new_data_csv.loc[:, "ymin"]
ymax = new_data_csv.loc[:, "ymax"]
width_csv = new_data_csv.loc[:, "width"]
height_csv = new_data_csv.loc[:, "height"]

classes, count_classes = _count_in_first_seen_order(class_csv)
_print_class_counts(classes, count_classes)

# Outputs the labels, person, head, hand and foot.

_print_separator()

# Count the number of objects in each image (one entry per distinct filename, first-seen order):

filename_list, count_list = _count_in_first_seen_order(filenames_csv)

# Create a csv file containing the number of objects in each image.

xml_df = pd.DataFrame(count_list, columns=['count'])
xml_df.to_csv(('data/count_objects.csv'), index=False)

# Create a list of the number of images containing n objects for n = 1..49.
# The range grows automatically if some image holds more than 49 objects, so
# no image is left out of the histogram.

bin_count_list = [0] * max([49] + count_list)
for n_objects in count_list:
    bin_count_list[n_objects - 1] += 1

print('#Objects: #Images')
print("")
for i in range(0, len(bin_count_list)):
    print(str(i + 1) + ' :', bin_count_list[i])


In [0]:
data_count_objects = pd.read_csv('data/count_objects.csv')

# Show statistics for the number of objects per image.

pd.set_option('display.width', 100)
# Use the fully qualified option name: the bare pattern 'precision' can match
# more than one option in newer pandas releases, which makes set_option raise.
pd.set_option('display.precision', 2)
description_count_objects = data_count_objects.describe()
print(description_count_objects)

## **Implementation of Mask R-CNN Model**

In [0]:
# Initial Setup:
#   Download mask_rcnn_coco weights
#   Install tensorflow version 1.15 and keras 2.2.5
#   CloneMatterport's Mask R-CNN repository from Github.
#   Install all necessary packages.

!wget https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5

%tensorflow_version 1.15
import tensorflow

!pip install q keras==2.2.5

!git clone https://github.com/matterport/Mask_RCNN.git
%cd Mask_RCNN/
!python setup.py install
!pip show mask-rcnn

from os import listdir
from numpy import zeros
from numpy import asarray
from mrcnn.utils import Dataset
from mrcnn.config import Config
from mrcnn.model import MaskRCNN
from matplotlib import pyplot
import pandas as pd
from mrcnn.visualize import display_instances
from mrcnn.visualize import plot_precision_recall
from mrcnn.utils import extract_bboxes
from mrcnn.utils import compute_ap
from mrcnn.utils import compute_ap_range
from mrcnn.utils import compute_recall
from mrcnn.model import load_image_gt
from mrcnn.model import mold_image
from numpy import expand_dims
from numpy import mean
from mrcnn.model import mold_image
from matplotlib.patches import Rectangle
from numpy import expand_dims

%cd ..

In [0]:
class people(Dataset):
    """Image dataset with four box classes: person, head, hand and foot.

    Box annotations are not parsed from the XML files here. ``extract_boxes``
    reads the notebook-level Series ``filenames_csv``, ``class_csv``,
    ``width_csv``, ``height_csv``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``,
    which must be defined before ``load_mask`` is called.
    """

    def load_dataset(self, dataset_dir, is_train=True, train_count=455):
        """Register the classes and the images of one split.

        Images are taken from ``dataset_dir/images`` in sorted filename order.
        The first ``train_count`` files form the training split and the
        remaining files form the test/validation split.

        Args:
            dataset_dir: directory containing 'images/' and 'annotations/'.
            is_train: True for the training split, False for the test/val split.
            train_count: number of leading images assigned to the training split.
        """
        self.add_class("dataset", 1, "person")
        self.add_class("dataset", 2, "head")
        self.add_class("dataset", 3, "hand")
        self.add_class("dataset", 4, "foot")
        # define data locations
        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annotations/'
        sorted_images = sorted(os.listdir(images_dir))
        if is_train:
            selected_images = sorted_images[:train_count]
        else:
            selected_images = sorted_images[train_count:]
        for filename in selected_images:
            # the image id is the filename without its 4-character extension
            image_name = filename[:-4]
            img_path = images_dir + filename
            ann_path = annotations_dir + image_name + '.xml'
            # add to dataset
            self.add_image('dataset', image_id=image_name, path=img_path, annotation=ann_path)

    def extract_boxes(self, filename):
        """Return (boxes, width, height, class names) for the image id ``filename``.

        Rows are matched by comparing the 11 characters that precede the
        4-character extension of each CSV path with the last 15 characters of
        ``filename`` (i.e. this assumes 11-character image ids).

        Raises:
            ValueError: if no annotation row matches ``filename``.
        """
        boxes = list()
        clss_list = list()
        width = height = None
        for i, item in enumerate(filenames_csv):
            if item[-15:-4] == filename[-15:]:
                boxes.append([xmin[i], ymin[i], xmax[i], ymax[i]])
                clss_list.append(class_csv[i])
                width = width_csv[i]
                height = height_csv[i]
        if not boxes:
            # Previously this surfaced as an UnboundLocalError on the return line.
            raise ValueError('no annotations found for image %r' % (filename,))
        return boxes, width, height, clss_list

    def load_mask(self, image_id):
        """Build rectangular masks from the bounding boxes of one image.

        Returns:
            (masks, class_ids): ``masks`` is a uint8 array of shape
            [height, width, number of boxes] with ones inside each box, and
            ``class_ids`` is an int32 array of indices into ``self.class_names``.
        """
        # get details of image
        info = self.image_info[image_id]
        image_name = info['id']
        # extract: boxes, width, height and list of classes
        boxes, w, h, clss_list = self.extract_boxes(image_name)
        # create one array for all masks, each on a different channel
        masks = zeros([h, w, len(boxes)], dtype='uint8')
        # create masks from bounding boxes (box layout is [xmin, ymin, xmax, ymax])
        for i, box in enumerate(boxes):
            row_s, row_e = box[1], box[3]
            col_s, col_e = box[0], box[2]
            masks[row_s:row_e, col_s:col_e, i] = 1
        class_ids = [self.class_names.index(clss) for clss in clss_list]
        return masks, asarray(class_ids, dtype='int32')

    def image_reference(self, image_id):
        """Return the file path recorded for ``image_id``."""
        info = self.image_info[image_id]
        return info['path']

# define a configuration for the model
# NOTE: a class with this same name is defined again in the model-training
# cell; that later definition is the one instantiated as `config`.
class mrcnnConfig(Config):
	# define the name of the configuration
	NAME = "mask_rcnn_cfg"
	# number of classes (background + person, head, hand, foot)
	NUM_CLASSES = 1 + 4
	# number of training steps per epoch, this is equal to the number of images in the Training set
	STEPS_PER_EPOCH = 455

def _build_split(is_train):
    """Load and prepare one split of the 'dataset' directory as a people dataset."""
    split = people()
    split.load_dataset('dataset', is_train=is_train)
    split.prepare()
    return split


# training split
train_set = _build_split(True)
print('Train: %d' % len(train_set.image_ids))

# test/validation split
test_set = _build_split(False)
print('Test: %d' % len(test_set.image_ids))


In [0]:
# Optional: show the first five training images with their masks and boxes.

for image_id in range(5):
    # pixels of the image
    image = train_set.load_image(image_id)
    # one mask channel per annotated box, plus the matching class ids
    mask, class_ids = train_set.load_mask(image_id)
    # bounding boxes are derived back from the masks
    bbox = extract_bboxes(mask)
    # draw image, masks, boxes and class labels
    display_instances(image, bbox, mask, class_ids, train_set.class_names)

In [0]:
# Model Training:

# Create configuration for the model

class mrcnnConfig(Config):
    """Training configuration; overrides the listed attributes of the base Config."""
    # define the name of the configuration
    # (the checkpoint-folder and weight-file names used later in the notebook start with this name)
    NAME = "mrcnn_final_model"
    # number of classes (background + person, head, hand, foot)
    NUM_CLASSES = 1 + 4
    # number of training steps per epoch
    STEPS_PER_EPOCH = 455
    # one image per step on a single GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    # optimiser settings
    LEARNING_RATE = 0.005
    LEARNING_MOMENTUM = 0.7
    WEIGHT_DECAY = 0.00005
    # same backbone as PredictionConfig
    BACKBONE = 'resnet101'
    # the class and bounding-box head losses are weighted 3.5x; the other losses keep weight 1
    LOSS_WEIGHTS = {
        "rpn_class_loss": 1.,
        "rpn_bbox_loss": 1.,
        "mrcnn_class_loss": 3.5,
        "mrcnn_bbox_loss": 3.5,
        "mrcnn_mask_loss": 1.
    }

# Instantiate the training configuration and print its values
config = mrcnnConfig()
config.display()

# Build the model in training mode; checkpoints are written below the current directory
model = MaskRCNN(mode='training', model_dir='./', config=config)

# Start from the MS COCO weights, leaving out the output layers listed here
excluded_output_layers = ["mrcnn_class_logits", "mrcnn_bbox_fc",  "mrcnn_bbox", "mrcnn_mask"]
model.load_weights('mask_rcnn_coco.h5', by_name=True, exclude=excluded_output_layers)

# Train only the 'heads' layers for 10 epochs, validating on the test set
model.train(
    train_set,
    test_set,
    learning_rate=config.LEARNING_RATE,
    epochs=10,
    layers='heads',
)

#Rename output folder of trained model checkpoints
def rename_latest_run_dir(base_dir='.', target='mrcnn_final_model'):
    """Rename the newest checkpoint folder in ``base_dir`` to ``target``.

    Candidate folders are directories whose name starts with ``target`` and is
    at least two characters longer than it (the same name filter as before).
    When several candidates exist, the one that sorts last by name is used.
    # NOTE(review): assumes the suffix is a timestamp that sorts chronologically - confirm.

    If ``base_dir/target`` already exists nothing is moved, because moving onto
    an existing directory would nest the run folder inside it.

    Args:
        base_dir: directory to search; defaults to the current directory,
            which is also where the model writes its checkpoints (model_dir='./').
        target: final folder name.

    Returns:
        The original name of the folder that was renamed, or None if nothing was renamed.
    """
    candidates = sorted(
        name for name in os.listdir(base_dir)
        if name.startswith(target)
        and len(name) > len(target) + 1
        and os.path.isdir(os.path.join(base_dir, name))
    )
    if not candidates:
        return None
    destination = os.path.join(base_dir, target)
    if os.path.exists(destination):
        print('%s already exists; leaving %s in place' % (destination, candidates[-1]))
        return None
    latest = candidates[-1]
    shutil.move(os.path.join(base_dir, latest), destination)
    return latest


rename_latest_run_dir()

## **Model Performance**

In [0]:
# define the prediction configuration
class PredictionConfig(Config):
	"""Inference configuration; overrides the listed attributes of the base Config."""
	# define the name of the configuration
	NAME = "mrcnn_prediction_cfg"
	# same backbone as the training configuration
	BACKBONE = 'resnet101'
	# number of classes (background + person, head, hand, foot)
	NUM_CLASSES = 1 + 4
	# simplify GPU config: one image per call on a single GPU
	GPU_COUNT = 1
	IMAGES_PER_GPU = 1

def evaluate_model(dataset, model, cfg):
    """Return the mean of the per-image average precision of ``model`` on ``dataset``.

    Args:
        dataset: prepared dataset whose ``image_ids`` are evaluated one by one.
        model: model exposing ``detect``.
        cfg: configuration passed to ``load_image_gt`` and ``mold_image``.

    Returns:
        The mean of the AP values returned by ``compute_ap`` for every image.
    """
    average_precisions = []
    for image_id in dataset.image_ids:
        # ground-truth image, boxes, class ids and full-size masks
        image, image_meta, gt_class_id, gt_bbox, gt_mask = load_image_gt(
            dataset, cfg, image_id, use_mini_mask=False)
        # scale the pixel values, then add a leading batch axis of size one
        # NOTE(review): model.detect may normalise its inputs itself - confirm
        # that mold_image is not effectively applied twice.
        batch = expand_dims(mold_image(image, cfg), 0)
        # prediction for the single image in the batch
        detection = model.detect(batch, verbose=0)[0]
        # only the AP (first returned value) is kept
        ap_value, _, _, _ = compute_ap(
            gt_bbox, gt_class_id, gt_mask,
            detection["rois"], detection["class_ids"], detection["scores"], detection['masks'])
        average_precisions.append(ap_value)
    # mean AP across all images
    return mean(average_precisions)

# create config
cfg = PredictionConfig()
# define the model
model = MaskRCNN(mode='inference', model_dir='./', config=cfg)

# Two-digit epoch labels used to build the checkpoint file names.
numblist = ['01','02','03','04','05','06','07','08','09','10','11','12']

# Indices into numblist of the checkpoints to evaluate: 4..9, i.e. epochs 5..10.
eval_indices = range(4, 10)

# best_epoch is an index into numblist (epoch number - 1). It starts at the
# first evaluated checkpoint, and the running maximum starts at -inf, so the
# selected checkpoint is always one that was actually evaluated, even when no
# test mAP is greater than 0.
max_test_mAP = float('-inf')
best_epoch = eval_indices[0]
for i in eval_indices:
    # load model weights
    model.load_weights('mrcnn_final_model/mask_rcnn_mrcnn_final_model_00' + str(numblist[i]) +'.h5', by_name=True)
    # evaluate model on training dataset
    train_mAP = evaluate_model(train_set, model, cfg)
    print('Train mAP for epoch ' + str(i+1) + ' :')
    print("%.3f" % train_mAP)
    print("")
    # evaluate model on test dataset
    test_mAP = evaluate_model(test_set, model, cfg)
    print('Test mAP for epoch ' + str(i+1) + ' :')
    print("%.3f" % test_mAP)
    print("")
    # keep the checkpoint with the highest test mAP
    if test_mAP > max_test_mAP:
        max_test_mAP = test_mAP
        best_epoch = i


In [0]:
# Kept for backward compatibility; plot_actual_vs_predicted reads its labels
# from dataset.class_names instead of this notebook-level list.
classes = ['person', 'head', 'hand', 'foot']

# plot a number of photos with ground truth and predictions
def plot_actual_vs_predicted(dataset, model, cfg, image_numb):
    """Show one image twice: with its ground-truth masks, then with the predicted boxes.

    Predicted class ids are translated to names through ``dataset.class_names``,
    the same list ``load_mask`` uses to produce class ids, so the labels do not
    depend on a notebook-level variable that other cells also assign.

    Args:
        dataset: prepared dataset providing ``load_image``, ``load_mask`` and ``class_names``.
        model: model exposing ``detect``.
        cfg: configuration passed to ``mold_image``.
        image_numb: index of the image inside ``dataset``.
    """
    # load the image and mask
    image = dataset.load_image(image_numb)
    mask, _ = dataset.load_mask(image_numb)
    # convert pixel values (e.g. center)
    scaled_image = mold_image(image, cfg)
    # convert image into one sample
    sample = expand_dims(scaled_image, 0)
    # make prediction
    yhat = model.detect(sample, verbose=0)[0]

    # first figure: the image with every ground-truth mask overlaid
    pyplot.imshow(image)
    pyplot.title('Actual')
    for j in range(mask.shape[2]):
        pyplot.imshow(mask[:, :, j], cmap='gray', alpha=0.3)
    pyplot.show()

    # second figure: the image with the predicted boxes and labels
    pyplot.imshow(image)
    pyplot.title('Predicted')
    ax = pyplot.gca()
    for box, class_id in zip(yhat['rois'], yhat["class_ids"]):
        # rois are ordered (y1, x1, y2, x2)
        y1, x1, y2, x2 = box
        width, height = x2 - x1, y2 - y1
        rect = Rectangle((x1, y1), width, height, fill=False, color='blue')
        label = dataset.class_names[class_id]
        ax.text(x1, y1 + 10, label, color='red', size=11, backgroundcolor="none")
        # draw the box
        ax.add_patch(rect)
    # show the figure
    pyplot.show()

# Plot 8 random images from the test set, along with the actual and predicted bounding boxes

import random
# best_epoch is an index into numblist, selected by the evaluation cell.
model_path = 'mrcnn_final_model/mask_rcnn_mrcnn_final_model_00' + str(numblist[best_epoch]) + '.h5'
model.load_weights(model_path, by_name=True)

# Sample distinct indices from the actual size of the test set: every image
# (including index 0) can be picked, no index can fall outside the set, and no
# image is shown twice. Fewer than 8 images are shown if the set is smaller.
test_image_total = len(test_set.image_ids)
for i in random.sample(range(test_image_total), min(8, test_image_total)):
    plot_actual_vs_predicted(test_set, model, cfg, i)