In [None]:
# Mount Google Drive so the dataset, split file and checkpoint paths under
# /content/drive (configured in the next cell) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# --- Notebook configuration: edit these values to retarget the run ---
# Directory prefix prepended to each image 'file_name' from the annotation file
# (paths are built by plain string concatenation, so keep the trailing slash).
IMG_DIR = '/content/drive/MyDrive/Colab Notebooks/comp9517/'
# COCO-format annotation file.
ANN_FILE = '/content/drive/MyDrive/Colab Notebooks/comp9517/annotations.json'
# CSV providing the 'split_open' column used for the train/valid/test split.
SPLIT_FILE = '/content/drive/MyDrive/Colab Notebooks/comp9517/metadata_splits.csv'
# Where the state_dict with the lowest validation loss is saved.
BEST_PATH = '/content/drive/MyDrive/maskrcnn_model.pth'
# Maximum number of training epochs.
EPOCHS = 10
# Initial learning rate passed to the Adam optimiser.
INIT_LR = 0.0001
# Weight decay (L2 regularisation) passed to the Adam optimiser.
WEIGHT_DECAY = 0.0001


# Data Preparation
Images without annotations, or with corrupted annotation counts, are replaced by a sample that has valid annotations. Images are sharpened using unsharp masking [1] to highlight significant features and reduce ambiguity. A custom dataset class builds the bounding boxes, labels, and masks in the target format expected by Mask R-CNN. The dataset is split into training, validation, and test sets using the `split_open` labels from `metadata_splits.csv` [2], which ensures open-set splitting.

In [None]:
import skimage.io as io
from torchvision import datasets, transforms
from pycocotools.coco import COCO
from torch.utils.data import Dataset, Subset, DataLoader
import pandas as pd
import cv2 as cv
import random
import torch
import matplotlib.pyplot as plt
from pprint import pprint
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import torchvision.ops as ops
from tqdm.auto import tqdm
import torchvision
from torchvision.utils import draw_segmentation_masks


# File paths (lower-case aliases of the notebook-level configuration constants)
img_dir = IMG_DIR
ann_file = ANN_FILE
split_file = SPLIT_FILE
best_model_path = BEST_PATH
# Build the COCO annotation index once; the dataset and helper functions below share it.
coco = COCO(ann_file)

# Hyperparameters
# Batch size of the training DataLoader (validation/test loaders use batch_size=1).
batch = 4

# Image transform: ToTensor converts an HxWxC uint8 array into a CxHxW float tensor
# with values scaled to [0, 1]. No resizing is performed here.
transform = transforms.Compose([
    transforms.ToTensor(),
])

import os
# Debugging aid: forces synchronous CUDA kernel launches so errors are reported at
# the offending call. NOTE(review): this slows GPU execution; consider removing it
# for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Category name -> integer label. The metric helpers below use the same 1/2/3 ids;
# presumably these match 'category_id' in the annotation file - verify against the data.
category_labels = {
    "turtle": 1,
    "flipper": 2,
    "head": 3
}

# Helper function
# Return sample with valid annotations
def dummy_class(coco, transform):
    """Load image id 1 as a stand-in sample with valid annotations.

    Used as a replacement for images whose own annotations are unusable.
    The image path is built from the notebook-level ``img_dir`` prefix.

    Args:
        coco: COCO annotation index providing ``loadImgs``, ``getAnnIds``,
            ``loadAnns`` and ``annToMask``.
        transform: Callable applied to the sharpened RGB image, or a falsy
            value to return the raw RGB array without sharpening.

    Returns:
        Tuple ``(img, target)`` where ``target`` holds 'boxes' (float32,
        ``[x1, y1, x2, y2]``), 'labels' (int64), 'masks' (uint8) and 'id' (int64).

    Raises:
        FileNotFoundError: If the image file cannot be read.
    """
    img_id = 1
    img_path = img_dir + coco.loadImgs([img_id])[0]['file_name']
    img = cv.imread(img_path)
    # cv.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image file: {img_path}")
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

    # Unsharp masking: 4 * img - 3 * blurred, then convert with the transform
    if transform:
        blur = cv.GaussianBlur(img, (7, 7), 0)
        img = cv.addWeighted(img, 4, blur, -3, 0)
        img = transform(img)

    boxes = []
    labels = []
    masks = []

    # Mask RCNN needs boxes, labels and masks
    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    for ann in anns:
        # COCO boxes are [x, y, width, height]; convert to corner format
        x, y, width, height = ann['bbox']
        boxes.append([x, y, x + width, y + height])
        labels.append(ann['category_id'])
        masks.append(coco.annToMask(ann))

    # Transform into boxes, labels and masks
    target = {
        'boxes': torch.as_tensor(boxes, dtype=torch.float32),
        'labels': torch.as_tensor(labels, dtype=torch.int64),
        'masks': torch.as_tensor(np.array(masks), dtype=torch.uint8),
        # int64 rather than uint8: image ids are not limited to the 0-255 range
        'id': torch.as_tensor([img_id], dtype=torch.int64),
    }

    return img, target


# Customised dataset
class TurtleDataset(Dataset):
    """COCO-backed dataset yielding ``(image, target)`` pairs for Mask R-CNN.

    Each target is a dict with 'boxes' (float32, ``[x1, y1, x2, y2]``),
    'labels' (int64), 'masks' (uint8, one mask per annotation) and
    'id' (int64, the COCO image id).
    """

    # Image ids that are replaced by the fallback sample from dummy_class().
    INVALID_IMG_IDS = (3793, 7772, 8436)

    def __init__(self, img_dir, coco, transform=None):
        """Store the image directory prefix, COCO index and optional transform."""
        self.img_dir = img_dir
        self.coco = coco
        self.transform = transform
        self.img_ids = list(self.coco.getImgIds())
        self.cat_ids = self.coco.getCatIds()

    def __len__(self):
        """Return the number of images listed in the COCO index."""
        return len(self.img_ids)

    def __getitem__(self, i):
        """Return the sharpened image and its Mask R-CNN target at position ``i``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.img_ids[i]
        if img_id in self.INVALID_IMG_IDS:
            # Use this dataset's own index/transform rather than notebook globals
            return dummy_class(self.coco, self.transform)

        img_path = self.img_dir + self.coco.loadImgs([img_id])[0]['file_name']
        img = cv.imread(img_path)
        # cv.imread signals failure by returning None rather than raising.
        if img is None:
            raise FileNotFoundError(f"Could not read image file: {img_path}")
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        # Unsharp masking: 4 * img - 3 * blurred, then convert with the transform.
        # Guarded by the instance's transform (not the module-level one) so a
        # dataset built with transform=None returns the raw RGB array.
        if self.transform:
            blur = cv.GaussianBlur(img, (7, 7), 0)
            img = cv.addWeighted(img, 4, blur, -3, 0)
            img = self.transform(img)

        boxes = []
        labels = []
        masks = []

        # Mask RCNN needs boxes, labels and masks
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
        for ann in anns:
            # COCO boxes are [x, y, width, height]; convert to corner format
            x, y, width, height = ann['bbox']
            boxes.append([x, y, x + width, y + height])
            labels.append(ann['category_id'])
            masks.append(self.coco.annToMask(ann))

        # Transform into boxes, labels and masks
        target = {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            'labels': torch.as_tensor(labels, dtype=torch.int64),
            'masks': torch.as_tensor(np.array(masks), dtype=torch.uint8),
            # int64 rather than uint8: ids above 255 (e.g. 3793) would wrap around
            'id': torch.as_tensor([img_id], dtype=torch.int64),
        }

        return img, target

# Helper function
# Report images whose annotations are absent or incomplete.
def check_invalid(coco):
    """Print a message for every image with missing or incomplete annotations.

    An image is reported when it has no annotation ids, when loading its ids
    yields no annotations, or when any annotation lacks one of the 'bbox',
    'category_id' or 'segmentation' keys (reported at most once per image).
    Nothing is removed or returned; this is a diagnostic only.
    """
    required_fields = ('bbox', 'category_id', 'segmentation')

    for img_id in list(coco.getImgIds()):
        annotation_ids = coco.getAnnIds(imgIds=img_id)
        if len(annotation_ids) == 0:
            print(f"Image {img_id} does not contain any annotation.")
            continue

        annotations = coco.loadAnns(annotation_ids)
        if len(annotations) == 0:
            print(f"Image {img_id} does not contain any content in annotation.")
            continue

        # One report per image is enough, however many annotations are incomplete.
        incomplete = any(
            field not in annotation
            for annotation in annotations
            for field in required_fields
        )
        if incomplete:
            print(f"Image {img_id} contains missing annotation.")

# Print a report of images with missing or incomplete annotations (diagnostic only).
check_invalid(coco)

# Helper function
# Show one dataset sample: the image next to its label-coloured mask
def sample_image(dataset, id):
    """Display the image at index ``id`` beside a combined label mask.

    The combined mask starts at zero and each instance mask paints its pixels
    with that instance's label, in annotation order (later instances overwrite
    earlier ones where they overlap).
    """
    print(f"Sample image and mask ({id}):")
    image, annotation = dataset[id]

    # Build the combined label map first; it does not depend on the figure.
    instance_masks = annotation['masks']
    label_map = np.zeros_like(instance_masks[0].numpy(), dtype=np.uint8)
    for position, instance_mask in enumerate(instance_masks):
        label_map[instance_mask.numpy() == 1] = annotation['labels'][position].item()

    plt.figure(figsize=(10, 5))
    panels = [(image.permute(1, 2, 0), "Image"), (label_map, "Mask")]
    for slot, (content, caption) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.imshow(content)
        plt.title(caption)
        plt.axis('off')
    plt.show()

# Regroup a batch of (image, target) samples into parallel lists
def collate_fn(batch):
    """Turn ``[(img, target), ...]`` into ``([img, ...], [target, ...])``.

    Samples are kept in lists rather than stacked.
    """
    image_group, target_group = zip(*batch)
    return [*image_group], [*target_group]

dataset = TurtleDataset(img_dir=img_dir, coco=coco, transform=transform)

# sample image
sample_image(dataset, 321)

# Split dataset - Ensure Open-set splitting
# NOTE(review): the CSV row positions are used as dataset indices, which assumes
# the rows of the split file are in the same order as coco.getImgIds() - confirm.
split = pd.read_csv(split_file)
train_data = Subset(dataset, list(split[(split['split_open'] == 'train')].index))
test_data = Subset(dataset, list(split[(split['split_open'] == 'test')].index))
valid_data = Subset(dataset, list(split[split['split_open'] == 'valid'].index))

# Vary proportion and test_prop to adjust sample proportion
# (proportion applies to training; test_prop applies to both validation and test)
proportion = 1
test_prop = 1
sub_ind = random.sample(range(len(train_data)), int(proportion * len(train_data)))
train_dataset = DataLoader(Subset(train_data, sub_ind), batch_size=batch, shuffle=True, collate_fn=collate_fn)

sub_ind = random.sample(range(len(valid_data)), int(test_prop * len(valid_data)))
valid_dataset = DataLoader(Subset(valid_data, sub_ind), batch_size=1, collate_fn=collate_fn)

sub_ind = random.sample(range(len(test_data)), int(test_prop * len(test_data)))
test_dataset = DataLoader(Subset(test_data, sub_ind), batch_size=1, collate_fn=collate_fn)

print("Full training set size: ", len(train_data))
print("Full validation set size: ", len(valid_data))
print("Full test set size: ", len(test_data))
# Report sample counts from the wrapped subsets: len(loader) counts batches, so
# multiplying it by the batch size over-counts when the last batch is partial.
print("Reduced training dataset size:", len(train_dataset.dataset))
print("Reduced validation dataset size:", len(valid_dataset.dataset))
print("Reduced test dataset size:", len(test_dataset.dataset))

# Model adjustments
The Mask R-CNN model [3] uses ResNet50 with a Feature Pyramid Network (FPN) [4] as its backbone, initialized with pre-trained weights. The FastRCNNPredictor and MaskRCNNPredictor heads are replaced so that they output four classes: background, turtle, flipper, and head. As an experiment, dropout is applied after the linear layers in the box head (`roi_heads.box_head`) to reduce overfitting, as the model tends to converge quickly.


In [None]:
from torchvision.models.detection import maskrcnn_resnet50_fpn_v2

def new_model(classes, dropout=0):
    """Build a pre-trained Mask R-CNN (ResNet50-FPN v2) adapted to ``classes`` outputs.

    The box and mask predictors are replaced with fresh heads sized for
    ``classes``, and an ``nn.Dropout(dropout)`` layer is inserted directly
    after every ``nn.Linear`` child of the box head (experimental
    regularisation).
    """
    net = maskrcnn_resnet50_fpn_v2(weights="DEFAULT")
    heads = net.roi_heads

    # Swap in predictors whose output size matches this task
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
        box_in_features, classes)

    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(
        mask_in_channels, 256, classes)

    # Experimental: rebuild the box head with dropout following each linear layer
    rebuilt_layers = [
        piece
        for module in heads.box_head.children()
        for piece in ((module, nn.Dropout(dropout)) if isinstance(module, nn.Linear) else (module,))
    ]
    heads.box_head = nn.Sequential(*rebuilt_layers)
    return net

# Number of output classes: background + turtle, flipper and head
classes = 4
# Dropout probability for the layers inserted into the box head by new_model()
dropout = 0.3
model = new_model(classes, dropout=dropout)

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


In [None]:
# Helper function
# Side-by-side view of the input image, the predicted masks and the ground-truth masks
# Predicted masks whose confidence score is not above the threshold are not drawn
def compare_visualise(img, targets, preds, thres):
    """Plot the image, the predicted masks and the expected masks in one row.

    Args:
        img: Image tensor in channel-first layout (permuted to HxWxC for display).
        targets: Ground-truth dict with 'id', 'labels' and 'masks'.
        preds: Prediction dict with 'scores', 'labels' and 'masks'.
        thres: Only predictions with a score strictly above this value are drawn.
    """
    print('The image id is', targets['id'].item())

    # One matplotlib colour map per category label
    cat_colour = {1: 'Reds', 2: 'Greens', 3: 'Blues'}

    def overlay(mask_array, category):
        # Semi-transparent overlay so several masks can share one panel
        plt.imshow(mask_array, alpha=0.5, cmap=cat_colour[category], vmin=0, vmax=1)

    plt.figure(figsize=(15, 10))

    plt.subplot(1, 3, 1)
    plt.imshow(img.permute(1, 2, 0).cpu().numpy())
    plt.title("Image")
    plt.axis("off")

    plt.subplot(1, 3, 2)
    for idx, predicted_mask in enumerate(preds['masks']):
        if preds['scores'][idx].item() > thres:
            # Index [0] selects the 2-D mask from the prediction's leading dimension
            overlay(predicted_mask[0].cpu().numpy(), preds['labels'][idx].item())
    plt.title("Predicted")
    plt.axis("off")

    plt.subplot(1, 3, 3)
    for idx, expected_mask in enumerate(targets['masks']):
        overlay(expected_mask.cpu().numpy(), targets['labels'][idx].item())
    plt.title("Expected")
    plt.axis("off")

    plt.show()


In [None]:
# Helper function
# Calculate precision, recall and F1-score for one sample
# Prediction masks with a confidence score not above the threshold are not counted
def metric_calculation(pred, target, thres):
    """Compute per-category pixel precision, recall and F1 for one sample.

    All kept masks of a category are merged into a single mask on each side
    before counting pixels.

    Args:
        pred: Prediction dict with 'labels', 'scores' and 'masks'.
        target: Ground-truth dict with 'labels' and 'masks'.
        thres: Predictions are kept only if their score is strictly above this.

    Returns:
        Dict mapping "turtle", "flipper" and "head" to
        ``[precision, recall, f1_score, 1.0]``. The trailing 1.0 is a sample
        counter that callers sum to average over a dataset. A category absent
        from both prediction and target scores 1.0 on all three metrics; a
        category present on only one side scores 0.0.
    """
    category_labels_rev = {
        1: "turtle",
        2: "flipper",
        3: "head",
    }
    # Keys are taken from the local mapping so the function is self-contained.
    # Precision, Recall, F1-score, and num accordingly
    record = {name: [0.0, 0.0, 0.0, 0.0] for name in category_labels_rev.values()}
    pred_masks = {cat: [] for cat in category_labels_rev}
    target_masks = {cat: [] for cat in category_labels_rev}

    for i, label in enumerate(pred['labels']):
        cat = label.item()
        if cat in pred_masks and pred['scores'][i] > thres:
            pred_masks[cat].append(pred['masks'][i] > 0.5)

    for i, label in enumerate(target['labels']):
        cat = label.item()
        if cat in target_masks:
            target_masks[cat].append(target['masks'][i] > 0.5)

    # Calculate metrics for different classes
    for cat, name in category_labels_rev.items():
        has_pred = bool(pred_masks[cat])
        has_target = bool(target_masks[cat])

        if has_pred and has_target:
            # Merge the instance masks of this category (logical OR), matching mask_iou
            combine_pred = torch.any(torch.stack(pred_masks[cat]), dim=0)
            combine_target = torch.any(torch.stack(target_masks[cat]), dim=0)

            # count number of pixels
            true_positive = torch.sum(combine_pred & combine_target).item()
            false_positive = torch.sum(combine_pred & (~combine_target)).item()
            false_negative = torch.sum((~combine_pred) & combine_target).item()

            precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0
            recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0
            f1_score = (2 * precision * recall) / (recall + precision) if (recall + precision) else 0.0
            record[name] = [precision, recall, f1_score, 1.0]
        elif not has_pred and not has_target:
            # Class correctly absent on both sides
            record[name] = [1.0, 1.0, 1.0, 1.0]
        else:
            # Class present on exactly one side
            record[name] = [0.0, 0.0, 0.0, 1.0]

    return record


# Validation and Testing
Validation occurs after each training epoch. For visualization, some samples are displayed with comparisons between the original image, predicted masks, and true masks. If a model achieves the best validation performance (lowest validation loss), its parameters and settings are saved.
Testing is conducted on both the last trained model and the best-performing saved model. Plots [5] of epoch vs. validation IoU and epoch vs. training/validation loss are shown, and the test loss and mean IoU per category are reported.
Finally, the precision, recall, and F1-score of each category are calculated during testing.


In [None]:
# Helper function
# Calculate the IoU of each class for one sample
# Prediction masks with a confidence score not above the threshold are not counted
def mask_iou(targets, pred, thres=0.5):
    """Compute the per-category mask IoU for one sample.

    All kept masks of a category are merged (logical OR) on each side before
    the IoU is taken.

    Args:
        targets: Ground-truth dict with 'labels' and 'masks'.
        pred: Prediction dict with 'labels', 'scores' and 'masks'.
        thres: Predictions are kept only if their score is strictly above
            this value (default 0.5).

    Returns:
        Dict mapping "turtle", "flipper" and "head" to ``[iou, 1]``: the IoU
        for this sample and a sample counter that callers sum to average.
    """
    category_labels_rev = {
        1: "turtle",
        2: "flipper",
        3: "head",
    }
    # Keys are taken from the local mapping so the function is self-contained.
    record = {name: [0.0, 0.0] for name in category_labels_rev.values()}
    pred_masks = {cat: [] for cat in category_labels_rev}
    target_masks = {cat: [] for cat in category_labels_rev}

    for i, label in enumerate(pred['labels']):
        cat = label.item()
        if cat in pred_masks and pred['scores'][i] > thres:
            pred_masks[cat].append(pred['masks'][i] > 0.5)

    for i, label in enumerate(targets['labels']):
        cat = label.item()
        if cat in target_masks:
            target_masks[cat].append(targets['masks'][i] > 0.5)

    # Combine each category masks and calculate category iou
    for cat, name in category_labels_rev.items():
        iou = 0.0
        if pred_masks[cat] and target_masks[cat]:
            total_pred_mask = torch.any(torch.stack(pred_masks[cat]), dim=0)
            total_target_mask = torch.any(torch.stack(target_masks[cat]), dim=0)

            intersection = torch.sum(total_pred_mask & total_target_mask).item()
            union = torch.sum(total_pred_mask | total_target_mask).item()

            # Both merged masks can be empty after thresholding; avoid 0/0 -> NaN,
            # which would poison the dataset-level mean.
            iou = intersection / union if union else 0.0
        # NOTE(review): a category absent from both prediction and target also
        # contributes an IoU of 0 here, whereas metric_calculation scores that
        # case as 1.0 - confirm which convention is intended.
        record[name][0] += iou
        record[name][1] += 1

    return record


# Helper function
# Summed Mask R-CNN loss for one batch, computed without altering the model state
def _evaluation_loss(model, imgs, targets):
    """Return the total loss for a batch during validation/testing.

    The detection model only returns its loss dict in training mode, so the
    model is switched to ``train()`` for this forward pass. BatchNorm and
    Dropout layers are put back into eval mode first: otherwise BatchNorm
    running statistics would be updated with validation/test data and dropout
    would add noise to the reported loss. The model is left in eval mode.
    """
    model.train()
    for module in model.modules():
        if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.modules.dropout._DropoutNd)):
            module.eval()
    loss_info = model(imgs, targets)
    model.eval()
    return sum(loss for loss in loss_info.values()).item()


# Helper function
# Average that tolerates an empty loader
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


# Validation
# Apply validation function after each training epoch to monitor learning trend
def validation(model, valid_dataset):
    """Evaluate ``model`` on the validation loader.

    Returns:
        Tuple ``(avg_loss, mean_iou)``: the mean total loss per batch and a
        dict of mean IoU per category name. The model is left in eval mode.
    """
    model.eval()
    total_loss = 0.0

    # First for iou sum, second for count sum
    record = {cat: [0.0, 0.0] for cat in category_labels}
    with torch.no_grad():
        for imgs, targets in tqdm(valid_dataset, desc="Validation in progress"):
            imgs = [img.to(device) for img in imgs]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            total_loss += _evaluation_loss(model, imgs, targets)

            pred = model(imgs)
            for i in range(len(imgs)):
                curr_iou = mask_iou(targets[i], pred[i])
                for cat in curr_iou:
                    record[cat][0] += curr_iou[cat][0]
                    record[cat][1] += curr_iou[cat][1]

            # Random visualisation (roughly 1% of batches)
            prob = random.random()
            if prob < 0.01:
                for i in range(len(imgs)):
                    compare_visualise(imgs[i], targets[i], pred[i], thres=0.5)

    # Average loss and miou of each class
    avg_loss = _mean_or_nan(total_loss, len(valid_dataset))
    mean_iou = {cat: _mean_or_nan(record[cat][0], record[cat][1]) for cat in category_labels}

    return avg_loss, mean_iou

# Testing
# Apply testing after finishing training to evaluate performance.
def testing(model, test_dataset):
    """Evaluate ``model`` on the test loader.

    Returns:
        Tuple ``(avg_loss, mean_iou, final_metric)``: the mean total loss per
        batch, a dict of mean IoU per category name, and a dict mapping each
        category name to ``[precision, recall, f1]`` averaged over samples.
        The model is left in eval mode.
    """
    model.eval()
    total_loss = 0.0
    # First for iou sum, second for count sum
    record = {cat: [0.0, 0.0] for cat in category_labels}
    # Precision sum, recall sum, F1 sum, sample count
    metric = {cat: [0.0, 0.0, 0.0, 0.0] for cat in category_labels}

    with torch.no_grad():
        for imgs, targets in tqdm(test_dataset, desc="Testing in progress"):
            imgs = [img.to(device) for img in imgs]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            total_loss += _evaluation_loss(model, imgs, targets)

            pred = model(imgs)
            for i in range(len(imgs)):
                curr_iou = mask_iou(targets[i], pred[i])
                for cat in curr_iou:
                    record[cat][0] += curr_iou[cat][0]
                    record[cat][1] += curr_iou[cat][1]

                temp_metric = metric_calculation(pred[i], targets[i], 0.5)
                for cat in temp_metric.keys():
                    for slot in range(4):
                        metric[cat][slot] += temp_metric[cat][slot]

            # Random visualisation (roughly 1% of batches)
            prob = random.random()
            if prob < 0.01:
                for i in range(len(imgs)):
                    compare_visualise(imgs[i], targets[i], pred[i], thres=0.5)

    # Average loss and miou of each class
    avg_loss = _mean_or_nan(total_loss, len(test_dataset))
    mean_iou = {cat: _mean_or_nan(record[cat][0], record[cat][1]) for cat in category_labels}
    final_metric = {
        cat: [_mean_or_nan(metric[cat][slot], metric[cat][3]) for slot in range(3)]
        for cat in category_labels
    }

    return avg_loss, mean_iou, final_metric

# Training Process
During training, the training and validation average losses and the mean Intersection over Union (IoU) for each category in the validation set are recorded to monitor accuracy and performance trends.
The Adam optimizer is used with an initial learning rate of 0.0001 and an L2 regularisation (weight decay) of 0.0001 to prevent rapid convergence into a local minimum.
A multi-loss function [6] is applied to Mask R-CNN, and the total loss is the sum of the classification loss, the mask loss, the bounding box regression loss, and the two region-proposal losses (proposal objectness and proposal box regression).
To avoid overfitting, an early stopping mechanism will terminate training if no improvement occurs within 5 epochs, and a learning rate scheduler reduces the learning rate by half every 3 epochs. [7]


In [None]:
# Training Process with validation
def training(model, train_dataset, valid_dataset, opti, num_epochs,
             max_patience=5, step_size=3, gamma=0.5, save_path=None):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model: Detection model that returns a dict of losses in training mode.
        train_dataset: DataLoader yielding ``(imgs, targets)`` training batches.
        valid_dataset: DataLoader passed to ``validation`` after every epoch.
        opti: Optimiser updating ``model``'s parameters.
        num_epochs: Maximum number of epochs to run.
        max_patience: Stop after this many consecutive epochs whose validation
            loss is worse than the best seen so far (default 5).
        step_size: Epoch interval of the StepLR scheduler (default 3).
        gamma: Learning-rate multiplier applied by StepLR (default 0.5).
        save_path: Where to save the best state_dict; defaults to the
            notebook-level ``best_model_path``.

    Returns:
        Tuple ``(avg_loss_train, avg_loss_valid, miou_valid, epochs_run)`` where
        the three lists have one entry per completed epoch and ``epochs_run``
        is the number of epochs actually executed (0 if ``num_epochs`` is 0).
    """
    if save_path is None:
        save_path = best_model_path

    avg_loss_train = []
    avg_loss_valid = []
    miou_valid = []
    best_valid_loss = float('inf')
    patience = 0
    # Tracked explicitly so the return value is defined even when the loop never runs
    epochs_run = 0
    scheduler = optim.lr_scheduler.StepLR(opti, step_size, gamma=gamma)

    for epoch in range(num_epochs):
        epochs_run = epoch + 1
        # validation() leaves the model in eval mode, so re-enable training each epoch
        model.train()
        total_train_loss = 0

        for imgs, targets in tqdm(train_dataset, desc=f"Epoch {epoch + 1}/{num_epochs} Training in progress"):
            imgs = [img.to(device) for img in imgs]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_info = model(imgs, targets)
            batch_loss = sum(l for l in loss_info.values())
            total_train_loss += batch_loss.item()

            opti.zero_grad()
            batch_loss.backward()
            opti.step()

        avg_train_loss = total_train_loss / len(train_dataset)
        avg_loss_train.append(avg_train_loss)

        # Validation step
        avg_valid_loss, curr_miou = validation(model, valid_dataset)
        avg_loss_valid.append(avg_valid_loss)
        miou_valid.append(curr_miou)

        # Trigger early stop if the loss has been worse than the best for max_patience epochs in a row
        if best_valid_loss < avg_valid_loss:
            patience += 1
            if patience >= max_patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break
        else:
            patience = 0

        # Save the model if it has the best validation loss
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), save_path)
            print(f"Saved Best Model at Epoch {epoch + 1} with Validation Loss: {best_valid_loss:.4f}")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_valid_loss:.4f}")

        # Apply scheduler to adjust learning rate
        scheduler.step()
        print(f"Learning rate at epoch {epoch+1}: {scheduler.get_last_lr()[0]}")

    return avg_loss_train, avg_loss_valid, miou_valid, epochs_run




In [None]:
# Hyperparameter
# Maximum number of epochs; training() may stop earlier through early stopping.
epoch = EPOCHS
# Adam optimiser with weight decay, configured from the notebook-level constants.
opti = optim.Adam(model.parameters(), lr=INIT_LR, weight_decay=WEIGHT_DECAY)
# final_epoch is the last value returned by training(); the plotting cells below
# use it as the length of the x axis for the per-epoch loss and IoU histories.
avg_loss_train, avg_loss_valid, miou_valid, final_epoch = training(model, train_dataset, valid_dataset, opti, epoch)


In [None]:
# Plot the average training and validation loss for every completed epoch
epoch_list = list(range(1, final_epoch + 1))

# (values, marker, line style, colour, legend label) for each curve
loss_curves = (
    (avg_loss_train, 'o', '-', 'b', 'Train'),
    (avg_loss_valid, 's', '--', 'r', 'Validation'),
)
for curve_values, curve_marker, curve_style, curve_colour, curve_label in loss_curves:
    plt.plot(epoch_list, curve_values, marker=curve_marker, linestyle=curve_style,
             color=curve_colour, label=curve_label)

# Axis labels, title and cosmetics
plt.xlabel("Epochs")
plt.ylabel("Average Loss")
plt.title("Epoch vs Loss")
plt.xticks(epoch_list)
plt.xlim(1, epoch_list[-1])
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plot the validation IoU of each category, and their average, for every epoch
miou_valid_turtle = [epoch_miou['turtle'] for epoch_miou in miou_valid]
miou_valid_flipper = [epoch_miou['flipper'] for epoch_miou in miou_valid]
miou_valid_head = [epoch_miou['head'] for epoch_miou in miou_valid]
miou_valid_avg = [sum(epoch_miou.values()) / 3 for epoch_miou in miou_valid]

# (values, marker, line style, colour, legend label) for each curve
iou_curves = (
    (miou_valid_turtle, 'o', '--', 'b', 'Turtle'),
    (miou_valid_flipper, 's', '--', 'r', 'Flipper'),
    (miou_valid_head, '*', '--', 'g', 'Head'),
    (miou_valid_avg, '.', '-', 'y', 'Average'),
)
for curve_values, curve_marker, curve_style, curve_colour, curve_label in iou_curves:
    plt.plot(epoch_list, curve_values, marker=curve_marker, linestyle=curve_style,
             color=curve_colour, label=curve_label)

plt.xlabel("Epochs")
plt.ylabel("IoU")
plt.title("Validation Epochs vs IoU")
plt.xticks(epoch_list)
plt.xlim(1, epoch_list[-1])
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Evaluate the in-memory model as it stands after the last training epoch
avg_loss_test, miou_test, final_metric = testing(model, test_dataset)

# Loss and per-category mean IoU
print(f"The average loss for test is {avg_loss_test:.4f}")
print(f"The miou for head, flipper and turtle are: {miou_test['head']:.4f}, {miou_test['flipper']:.4f}, {miou_test['turtle']:.4f}.")
print(f'The overall miou is {(sum(miou_test.values())/3):.4f}.')

# Precision / recall / F1 per category, in the order testing() returns them
for cat in final_metric:
    print(f'The metric for {cat} category is:')
    for caption, score in zip(('Precision:', 'Recall:', 'F1-score:'), final_metric[cat]):
        print(caption, score)

In [None]:
# Load the best model (lowest validation loss) saved during training
best_model = new_model(classes, dropout=dropout)
# map_location remaps the checkpoint onto the current device, so weights saved
# from a GPU run can still be loaded when this cell runs on a CPU-only runtime.
best_model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
best_model.to(device)
avg_loss_test, miou_test, final_metric = testing(best_model, test_dataset)
print(f"The average loss for test is {avg_loss_test:.4f}")
print(f"The miou for head, flipper and turtle are: {miou_test['head']:.4f}, {miou_test['flipper']:.4f}, {miou_test['turtle']:.4f}.")
print(f'The overall miou is {(sum(miou_test.values())/3):.4f}.')
# Precision / recall / F1 per category, in the order testing() returns them
for cat in final_metric:
    print(f'The metric for {cat} category is:')
    print('Precision:', final_metric[cat][0])
    print('Recall:', final_metric[cat][1])
    print('F1-score:', final_metric[cat][2])

# Reference
1. Gonzalez, R. C., & Woods, R. E. (2002). Digital Image Processing https://dl.ebooksworld.ir/motoman/Digital.Image.Processing.3rd.Edition.www.EBooksWorld.ir.pdf
2. Wildlifedatasets (2024). SEATURTLEID2022. https://www.kaggle.com/datasets/wildlifedatasets/seaturtleid2022
3. Mask R-CNN - Torchvision main documentation. https://pytorch.org/vision/main/models/mask_rcnn.html
4. Lin, T.-Y., Dollár, P., Girshick, R., He, K., Hariharan, B., & Belongie, S. (2017). Feature Pyramid Networks for Object Detection. https://arxiv.org/abs/1612.03144
5. Visualization with Python. Matplotlib. https://matplotlib.org/
6. He, K., Gkioxari, G., Dollár, P., & Girshick, R. (2018). Mask R-CNN. https://arxiv.org/abs/1703.06870
7. Nielsen, M. A. (2015). Neural networks and deep learning. http://neuralnetworksanddeeplearning.com/
