### Faster RCNN Installation
* In production, we want to install the custom Faster R-CNN package without cloning the whole source repository. The (commented-out) cell below shows how to do that with pip.

In [None]:
# Install the customized linc-detector faster-rcnn package directly from GitHub,
# pinned to a specific commit, so the repository does not have to be cloned.
# Background on installing from a git repository with pip:
# https://adamj.eu/tech/2019/03/11/pip-install-from-a-git-repository/
# The command is left commented out; uncomment it to run the installation.
# !pip install --upgrade --force-reinstall git+https://github.com/linc-lion/LINC-detector.git@ba36a5bfa5ba7b9035977c02b1d8ed253f074e8d

### Working Directory
* This Jupyter notebook lives inside the source repository, so we switch the working directory to the notebook's parent directory to make the project imports work.

In [1]:
# Work out the parent of the current working directory, but only the first
# time this cell runs: once the working directory has been switched, running
# the computation again would climb one level too far.
try:
    parent_dir
except NameError:
    import os
    current_dir = os.getcwd()
    parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
parent_dir

'/root/LINC-detector'

In [2]:
cd $parent_dir

/root/LINC-detector


### Requirements installation

In [3]:
pip install -r requirements.txt

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
# Install pycocotools
!pip install --upgrade --force-reinstall cython
!pip install --upgrade --force-reinstall -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
!pip install setuptools==59.5.0 # https://github.com/pytorch/pytorch/issues/69894

Collecting cython
  Using cached Cython-0.29.32-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Installing collected packages: cython
  Attempting uninstall: cython
    Found existing installation: Cython 0.29.32
    Uninstalling Cython-0.29.32:
      Successfully uninstalled Cython-0.29.32
Successfully installed cython-0.29.32
[0mCollecting git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI
  Cloning https://github.com/cocodataset/cocoapi.git to /tmp/pip-req-build-ku3h01o0
  Running command git clone --filter=blob:none --quiet https://github.com/cocodataset/cocoapi.git /tmp/pip-req-build-ku3h01o0
  Resolved https://github.com/cocodataset/cocoapi.git to commit 8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting setuptools>=18.0
  Using cached setuptools-65.4.1-py3-none-any.whl (1.2 MB)
Collecting cython>=0.27.3
  Using cached Cython-0.29.32-cp38-cp38-manylinux_2_17_x86_64.ma

### Create training datasets

In [9]:
# Runs the dataset creation script as a shell command.
# Left commented out (presumably to avoid regenerating the dataset on every
# run); uncomment to (re)create it.
# NOTE(review): this presumably produces the COCO-formatted folder that
# `data_path` points to in the training configuration - confirm.
# !python datasets/create_all_but_ws_and_fb_dataset.py

Parsing xml files in ../images/Annotated/Verified_Annotation............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### Resolve SageMaker-specific bug (can be skipped when not running on SageMaker)
* https://github.com/aws/amazon-sagemaker-examples/issues/3154

In [5]:
%%capture
import IPython
import sys

!{sys.executable} -m pip install ipywidgets
IPython.Application.instance().kernel.do_shutdown(True)  # has to restart kernel so changes are used

### Imports

In [5]:
import datetime
import os
import time
import shutil
import subprocess
import sys

import torch
import torch.utils.data
from linc.detector.models import detection
from torch.utils.tensorboard import SummaryWriter

from linc.detector.helper.coco_utils import get_coco  # get_coco_kp
from linc.detector.helper.group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from linc.detector.helper.engine import train_one_epoch, evaluate
from linc.detector.helper import utils

%load_ext tensorboard

### Model Training
* The base training code is extracted from [pytorch example](https://github.com/pytorch/vision/blob/528651a031a08f9f97cc75bd619a326387708219/references/detection/train.py)
* The training takes around 3 hours to complete on a `ml.g4dn.2xlarge` AWS instance (USD $0.94 per hour)

In [8]:
# Location of the COCO formatted object detection dataset
data_path = '../coco_all_but_ws_and_fb/'

# Everything below already has a usable value and can be left as is.

# Training run
epochs = 35
batch_size = 3
workers = 2  # num_workers of both data loaders
aspect_ratio_group_factor = 0  # `k` passed to create_aspect_ratio_groups

# SGD optimizer
lr = 0.01
momentum = 0.9
weight_decay = 1e-4

# MultiStepLR schedule
lr_steps = [10, 11]  # milestones (epochs)
lr_gamma = 0.1

# Checkpointing and evaluation cadence
save_every_num_epochs = None  # Optional
evaluate_every_num_epochs = 2

# Tensorboard / evaluation output
run_name = None  # Optional, str used to name Tensorboard summaries
num_draw_predictions = 5  # forwarded to evaluate()
draw_threshold = 0.5  # forwarded to evaluate()

In [9]:
print("Create summary writer for Tensorboard")
# Without a run name the writer receives log_dir=None and picks its own folder.
# NOTE(review): a custom run_name is used as the log directory as-is, while the
# Tensorboard cell at the end of the notebook reads "./runs" - confirm that
# named runs are meant to live outside "./runs".
log_dir_path = str(run_name) if run_name else None

if log_dir_path is not None and os.path.isdir(log_dir_path):
    delete = input(f"Summary folder '{log_dir_path}' already exists. Overwrite it [yes, y / no, n]?")
    if delete.strip().lower() in ('yes', 'y'):
        shutil.rmtree(log_dir_path)
    else:
        # Raise instead of calling exit(): inside a notebook, exit() asks the
        # kernel to shut down rather than simply aborting this cell, so it is
        # not a reliable way to stop before the writer below is created.
        raise FileExistsError(
            f"Summary folder '{log_dir_path}' already exists. "
            "Choose another run name or delete the folder then!"
        )

writer = SummaryWriter(log_dir=log_dir_path)

# Add some useful text summaries (Tensorboard uses markdown to render text).
# writer.add_text('Command executed', f"python {' '.join(sys.argv)}")
# writer.add_text('Arguments', str(args).replace(", ", ",  \n").replace("Namespace(", "").replace(")", ""))

print("Create datasets")
# The training split also provides the class count and the label names.
dataset, num_classes, label_names = get_coco(data_path, image_set='train')
print(f"Categorizing into {num_classes} classes")
# From the validation split only the dataset itself is kept.
dataset_test, _, _ = get_coco(data_path, image_set='val')

print("Create samplers")
# Random order for training, fixed order for evaluation.
train_sampler = torch.utils.data.RandomSampler(dataset)
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
# Training batches are drawn per aspect-ratio group.
group_ids = create_aspect_ratio_groups(dataset, k=aspect_ratio_group_factor)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size)

print("Create dataloaders")
# Training loader: batching is delegated entirely to the grouped batch sampler.
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_sampler=train_batch_sampler,
    num_workers=workers,
    collate_fn=utils.collate_fn,
)

# Evaluation loader: one image per batch, in sequential order.
data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    sampler=test_sampler,
    num_workers=workers,
    collate_fn=utils.collate_fn,
)

print("Create model")
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
# Faster R-CNN (ResNet-50 FPN) built with pretrained=False.
model = detection.fasterrcnn_resnet50_fpn(num_classes=num_classes, pretrained=False)
model.to(device)
# No distributed wrapper is used here, so both names refer to the same model.
model_without_ddp = model

# Optimize only the parameters that require gradients.
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

# Multiply the learning rate by `lr_gamma` at each epoch listed in `lr_steps`.
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=lr_steps,
    gamma=lr_gamma,
)

print("Start training")
start_time = time.time()
try:
    for epoch in range(epochs):
        start_epoch = time.time()
        # The literal 20 matches the logging interval seen in the training
        # output (one log line every 20 iterations).
        train_one_epoch(
            model, optimizer, data_loader, device, epoch, 20, writer, label_names
        )
        print(f"Epoch time {time.time() - start_epoch}")
        # Log the learning rate used for this epoch before the scheduler advances it.
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch)
        lr_scheduler.step()

        # Optional intermediate checkpoints. The checkpoint holds the same
        # entries as the final one below; there is no `args` object in this
        # notebook, so referencing it here raised a NameError.
        if save_every_num_epochs and epoch % save_every_num_epochs == 0:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'label_names': label_names},
                os.path.join(writer.log_dir, f'model_{epoch}.pth')
            )

        # Periodic evaluation; a value of None or 0 disables it instead of
        # failing on the modulo.
        if evaluate_every_num_epochs and epoch % evaluate_every_num_epochs == 0:
            evaluate(
                model, data_loader_test, epoch, writer, draw_threshold,
                label_names, num_draw_predictions, device=device
            )

    # Save after training is done
    utils.save_on_master({
        'model': model_without_ddp.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'label_names': label_names},
        os.path.join(writer.log_dir, 'model_finished.pth')
    )
finally:
    # Close the writer even when training fails or is interrupted, so that
    # the summaries written so far are not lost.
    writer.close()

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))


Create summary writer for Tensorboard
Create datasets
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Categorizing into 32 classes
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Create samplers
Using [0, 1.0, inf] as bins for aspect ratio quantization
Count of instances per bin: [614 245]
Create dataloaders
Create model
Device: cuda
Start training
Epoch: [0]  [  0/286]  eta: 0:18:25  lr: 0.000045  loss: 4.1504 (4.1504)  loss_classifier: 3.3507 (3.3507)  loss_box_reg: 0.0352 (0.0352)  loss_objectness: 0.6949 (0.6949)  loss_rpn_box_reg: 0.0696 (0.0696)  time: 3.8653  data: 2.8682  max mem: 6067
Epoch: [0]  [ 20/286]  eta: 0:05:18  lr: 0.000746  loss: 3.2944 (3.0890)  loss_classifier: 2.5002 (2.2449)  loss_box_reg: 0.0549 (0.0621)  loss_objectness: 0.6905 (0.6694)  loss_rpn_box_reg: 0.0757 (0.1127)  time: 1.0649  data: 0.2327  max mem: 6678
Epoch: [0]  [ 40/286]  eta: 0:04:18  lr: 0.001447  loss: 1.2066 (2.1822)  loss_

In [None]:
# Show Tensorboard inline, reading summaries from "./runs".
# NOTE(review): when run_name is None the writer is created with log_dir=None,
# which per the PyTorch documentation defaults to a folder under "./runs";
# a custom run_name is used as the log directory as-is and would not show up
# here - confirm.
%tensorboard --logdir "./runs"