# Demo training code

This is a simplified version of the training code, intended as a demo. The complete version, which exposes more options, is the `train.py` CLI script. Checkpoints and TensorBoard summaries are saved in `notebooks/runs/`.

## Arguments

In [1]:
# --- Required -------------------------------------------------------------
# Location of the object-detection dataset in COCO format
data_path = '../data/all_but_ws_and_fb_fixed/'

# --- Everything below has a workable default and can be left untouched -----

# Run length and how often to checkpoint / evaluate (both counted in epochs)
epochs = 35
evaluate_every_num_epochs = 2
save_every_num_epochs = None  # Optional; falsy means no intermediate checkpoints

# SGD optimizer settings
lr = 0.01
momentum = 0.9
weight_decay = 1e-4

# Learning-rate schedule: `lr_steps` are the MultiStepLR milestones and
# `lr_gamma` is the factor applied at each of them
lr_steps = [10, 11]
lr_gamma = 0.1

# Data loading
batch_size = 3
workers = 8

# Tensorboard logging
run_name = None  # Optional, str used to name the summary folder under runs/
# Both values are forwarded to `evaluate`; presumably how many predictions get
# drawn and the minimum score for drawing them -- confirm against engine.py
num_draw_predictions = 5
draw_threshold = 0.5

## Code

In [2]:
# Notebooks are stored in 'notebooks/' which breaks my imports
import sys
sys.path.insert(0, '..')

import os
import datetime
import time
import shutil

import torch
import torch.utils.data
from models import detection
from torch.utils.tensorboard import SummaryWriter

from coco_utils import get_coco  # get_coco_kp

from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate

import utils

In [None]:
# Create summary writer for Tensorboard
# With no run name, `log_dir=None` lets SummaryWriter pick its own default
# directory; otherwise summaries go to runs/<run_name>.
log_dir_path = f"runs/{run_name}" if run_name else None

# Never silently mix summaries from two runs: ask before wiping an existing
# folder that belongs to the same run name.
if log_dir_path is not None and os.path.isdir(log_dir_path):
    answer = input(f"Summary folder '{log_dir_path}' already exists. Overwrite it [yes, y / no, n]?")
    # Accept the answer regardless of case or surrounding whitespace
    if answer.strip().lower() in ('yes', 'y'):
        shutil.rmtree(log_dir_path)
    else:
        print("Chose another run name or delete the folder then!")
        # `exit()` is a helper meant for the interactive interpreter only;
        # raising SystemExit is the programmatic way to stop here.
        raise SystemExit

writer = SummaryWriter(log_dir=log_dir_path)

# Load both dataset splits; the training split also yields the class count
# and the label names used further down.
dataset, num_classes, label_names = get_coco(data_path, image_set='train')
print(f"Categorizing into {num_classes} classes")
dataset_test, _, _ = get_coco(data_path, image_set='val')

# Options shared by the training and the validation loader
_loader_kwargs = dict(num_workers=workers, collate_fn=utils.collate_fn)

# Training pipeline: random sampling, batched by aspect-ratio group
train_sampler = torch.utils.data.RandomSampler(dataset)
group_ids = create_aspect_ratio_groups(dataset)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size)
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_sampler=train_batch_sampler,
    **_loader_kwargs,
)

# Validation pipeline: one image at a time, in dataset order
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    sampler=test_sampler,
    **_loader_kwargs,
)

# Create model
# Use the GPU only when CUDA is actually usable at runtime. `torch.has_cuda`
# is a deprecated flag that reports whether this PyTorch build was compiled
# with CUDA, so it can be true on a machine without a working GPU or driver.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = detection.fasterrcnn_resnet50_fpn(num_classes=num_classes, pretrained=False)
model.to(device)

# Hand only the parameters that require gradients to the optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params, lr=lr, momentum=momentum, weight_decay=weight_decay)

# Scale the learning rate by `lr_gamma` at every epoch listed in `lr_steps`
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=lr_steps, gamma=lr_gamma
)

# Train
def _save_checkpoint(filename):
    """Write model, optimizer and scheduler state plus label names.

    The file is stored inside the summary writer's log directory under
    `filename`.
    """
    # There is no DistributedDataParallel wrapper in this demo, so the state
    # is taken from `model` directly (`model_without_ddp` was never defined
    # here and raised a NameError).
    utils.save_on_master({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'label_names': label_names},
        os.path.join(writer.log_dir, filename)
    )


print("Start training")
start_time = time.time()
try:
    for epoch in range(epochs):
        start_epoch = time.time()
        # The literal 20 is presumably the console logging interval (the
        # captured output prints every 20 iterations) -- confirm against
        # engine.train_one_epoch.
        train_one_epoch(
            model, optimizer, data_loader, device, epoch, 20, writer, label_names
        )
        print(f"Epoch time {time.time() - start_epoch}")
        # Log the rate of the first parameter group before the scheduler
        # advances it for the next epoch
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch)
        lr_scheduler.step()

        # Optional intermediate checkpoints; skipped when the setting is falsy
        if save_every_num_epochs and epoch % save_every_num_epochs == 0:
            _save_checkpoint(f'model_{epoch}.pth')

        # Guarded like the checkpoint setting so that 0 or None disables
        # evaluation instead of crashing on the modulo
        if evaluate_every_num_epochs and epoch % evaluate_every_num_epochs == 0:
            evaluate(
                model, data_loader_test, epoch, writer, draw_threshold,
                label_names, num_draw_predictions, device=device
            )

    # Save final checkpoint after training is done
    _save_checkpoint('model_finished.pth')
finally:
    # Close the writer even if training is interrupted or fails
    writer.close()

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Categorizing into 32 classes
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Using [0, 1.0, inf] as bins for aspect ratio quantization
Count of instances per bin: [613 247]
Start training
Epoch: [0]  [  0/286]  eta: 0:15:35  lr: 0.000045  loss: 4.1957 (4.1957)  loss_classifier: 3.4428 (3.4428)  loss_box_reg: 0.0293 (0.0293)  loss_objectness: 0.6919 (0.6919)  loss_rpn_box_reg: 0.0316 (0.0316)  time: 3.2692  data: 2.2239  max mem: 4725
Epoch: [0]  [ 20/286]  eta: 0:03:35  lr: 0.000746  loss: 4.1410 (4.1238)  loss_classifier: 3.3417 (3.2988)  loss_box_reg: 0.0476 (0.0523)  loss_objectness: 0.6903 (0.6899)  loss_rpn_box_reg: 0.0789 (0.0829)  time: 0.6876  data: 0.1550  max mem: 6133
Epoch: [0]  [ 40/286]  eta: 0:02:51  lr: 0.001447  loss: 1.2647 (2.9218)  loss_classifier: 0.6217 (2.1657)  loss_box_reg: 0.0928 (0.0760)  loss_objectness: 0.4158 (0.5900)  loss_rpn_box_reg: 0.0

Epoch: [1]  [180/286]  eta: 0:01:04  lr: 0.010000  loss: 0.9668 (1.1218)  loss_classifier: 0.6106 (0.7097)  loss_box_reg: 0.2535 (0.2913)  loss_objectness: 0.0262 (0.0456)  loss_rpn_box_reg: 0.0618 (0.0752)  time: 0.5624  data: 0.0440  max mem: 6148
Epoch: [1]  [200/286]  eta: 0:00:52  lr: 0.010000  loss: 1.0323 (1.1173)  loss_classifier: 0.6329 (0.7040)  loss_box_reg: 0.3007 (0.2917)  loss_objectness: 0.0323 (0.0456)  loss_rpn_box_reg: 0.0642 (0.0760)  time: 0.6119  data: 0.0664  max mem: 6148
Epoch: [1]  [220/286]  eta: 0:00:40  lr: 0.010000  loss: 0.9862 (1.1080)  loss_classifier: 0.5820 (0.6947)  loss_box_reg: 0.2737 (0.2895)  loss_objectness: 0.0394 (0.0455)  loss_rpn_box_reg: 0.0712 (0.0784)  time: 0.5876  data: 0.0494  max mem: 6148
Epoch: [1]  [240/286]  eta: 0:00:27  lr: 0.010000  loss: 0.7620 (1.0819)  loss_classifier: 0.4767 (0.6769)  loss_box_reg: 0.2095 (0.2836)  loss_objectness: 0.0274 (0.0441)  loss_rpn_box_reg: 0.0604 (0.0772)  time: 0.5790  data: 0.0532  max mem: 6148


Epoch: [3]  [ 60/286]  eta: 0:02:46  lr: 0.010000  loss: 0.7426 (0.7277)  loss_classifier: 0.4051 (0.4294)  loss_box_reg: 0.2150 (0.2137)  loss_objectness: 0.0175 (0.0207)  loss_rpn_box_reg: 0.0591 (0.0639)  time: 0.6419  data: 0.0921  max mem: 6230
Epoch: [3]  [ 80/286]  eta: 0:02:21  lr: 0.010000  loss: 0.6468 (0.7160)  loss_classifier: 0.3745 (0.4188)  loss_box_reg: 0.2050 (0.2147)  loss_objectness: 0.0154 (0.0199)  loss_rpn_box_reg: 0.0434 (0.0626)  time: 0.5442  data: 0.0446  max mem: 6230
Epoch: [3]  [100/286]  eta: 0:02:07  lr: 0.010000  loss: 0.6650 (0.7110)  loss_classifier: 0.3967 (0.4146)  loss_box_reg: 0.2147 (0.2136)  loss_objectness: 0.0152 (0.0198)  loss_rpn_box_reg: 0.0548 (0.0630)  time: 0.6832  data: 0.1604  max mem: 6230
Epoch: [3]  [120/286]  eta: 0:01:52  lr: 0.010000  loss: 0.6538 (0.7029)  loss_classifier: 0.3736 (0.4083)  loss_box_reg: 0.2145 (0.2129)  loss_objectness: 0.0124 (0.0193)  loss_rpn_box_reg: 0.0507 (0.0623)  time: 0.6405  data: 0.1156  max mem: 6230
