In [1]:
%load_ext autoreload
%autoreload 2

+---------+------+
| categoy | AP50 |
+---------+------+
| cat     | 0.01 |
| dog     | 0.02 |
| mouse   | nan  |
+---------+------+


In [3]:
# Imports
import torch
from utils.data import build_dataset,build_xview_dataset, unwrap_collate_fn
from attrdict import AttrDict
from utils.group_by_aspect_ratio import create_aspect_ratio_groups, GroupedBatchSampler
from utils.fcos import fcos_resnet50_fpn
# from torchvision.models.detection import fcos_resnet50_fpn
from torchvision.models.detection import ssd300_vgg16

import datetime
import time
from tqdm import tqdm

from utils.engine import train_and_eval,eval_model
import torchvision
from pycocotools import mask as coco_mask
from pycocotools.coco import COCO
import math
from lr_schedulers import WarmupWrapper
from torch.optim.lr_scheduler import MultiStepLR

from utils.model import make_custom_object_detection_model_fcos, build_frcnn_model
import matplotlib.pyplot as plt
from train import load_dataset
import numpy as np

from determined.experimental import Determined

# remove warnings
import warnings
warnings.filterwarnings('ignore')

# Set up .detignore file so the checkpoints directory is not packaged into future experiments
!echo checkpoints > .detignore

  from .autonotebook import tqdm as notebook_tqdm


TORCHVISION_VERSION:  0.13.1+cu102 /opt/conda/lib/python3.8/site-packages/torchvision/__init__.py
TORCH_VERSION:  1.12.1+cu102 /opt/conda/lib/python3.8/site-packages/torch/__init__.py


In [26]:
def define_exp(lr=None,momentum=None,epochs=None):
    '''
    '''
    model = build_frcnn_model(dataset.num_classes)
    model.to(device)

    optimizer = torch.optim.SGD(
            model.parameters(),
            lr=lr,
            momentum=momentum,
            weight_decay=1e-4,
            nesterov="nesterov",
        )

    scheduler_cls = WarmupWrapper(MultiStepLR)
    scheduler = scheduler_cls(
        'linear',  # warmup schedule
        100,  # warmup_iters
        0.001,  # warmup_ratio
        optimizer,
        [177429, 236572],  # milestones
        0.1,  # gamma
    )
    print("Start training")
    start_time = time.time()

    losses, model = train_and_eval(model,data_loader,data_loader_test,optimizer,scheduler,device,cpu_device,epochs=epochs)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")

def visualize_pred(inv_tensor,res):
    '''
    '''
    img = Image.fromarray((255.*inv_tensor.permute((1,2,0)).numpy()).astype(np.uint8))
    draw = ImageDraw.Draw(img)
    for ind,(b,s,l) in enumerate(zip(res[1]['boxes'],res[1]['scores'],res[1]['labels'])):
        # print(b.detach().numpy(), s.detach().numpy())
        x,y,x2,y2 = b.detach().numpy()
        print( x,y,x2,y2,s.item(),l.item())
        draw.rectangle([x,y,x2,y2])
        if ind > 3:
            break

    plt.imshow(img)
def predict(model,images_t):
    '''
    '''
    cpu_device = torch.device('cpu')
    outputs = model(images_t)

    outputss = []
    for t in outputs:
        outputss.append({k: v.to(cpu_device) for k, v in t.items()})
    # model_time = time.time() - model_time
    res = {target["image_id"].item(): output for target, output in zip(targets_t, outputss)}
    visualize_pred(images_t,res)
    return img, res

<img src="https://raw.githubusercontent.com/determined-ai/determined/master/determined-logo.png" align='right' width=150 />

# Building a Geospatial Detection Model with Determined

<img src="https://www.cis.upenn.edu/~jshi/ped_html/images/PennPed00071_1.png" width=400 />


This notebook will walk through the benefits of building a Deep Learning model with Determined.  We will build an object detection model trained on the [Penn-Fudan Database for Pedestrian Detection and Segmentation](https://www.cis.upenn.edu/~jshi/ped_html/).


# Table of Contents


<font size="3">
<ol>
  <li>What Modeling looks like Today</li>
  <li>Building a model with Determined
    <ol>
      <li>Single GPU training</li>
      <li>Cluster-scale multi-GPU training</li>
      <li>Adapative hyperparameter search</li>
    </ol>
  </li>
</ol>
</font>

# What modeling looks like without Determined

<font size="4">First let's look at the kind of work modelers do today.  Below, we train a model we found on Github and modified, printing validation set metrics after each epoch.</font>

In [4]:
DEVICE='cuda'
#Data loading code
device = torch.device(DEVICE)
cpu_device = torch.device(DEVICE)
print("Loading data")
TRAIN_DATA_DIR='determined-ai-xview-coco-dataset/train_sliced_no_neg/train_images_300_02/'
VAL_DATA_DIR='determined-ai-xview-coco-dataset/val_sliced_no_neg/val_images_300_02/'

dataset, num_classes, dataset_test,data_loader, data_loader_test= load_dataset(TRAIN_DATA_DIR=TRAIN_DATA_DIR,VAL_DATA_DIR=VAL_DATA_DIR,train_batch_size=8,test_batch_size=8)
print("Create Model")
model = build_frcnn_model(dataset.num_classes)
model.to(device)

optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov="nesterov",
    )

scheduler_cls = WarmupWrapper(MultiStepLR)
scheduler = scheduler_cls(
    'linear',  # warmup schedule
    100,  # warmup_iters
    0.001,  # warmup_ratio
    optimizer,
    [177429, 236572],  # milestones
    0.1,  # gamma
)


Loading data
PATHS:  {'train': ('determined-ai-xview-coco-dataset/train_sliced_no_neg/train_images_300_02/', '/tmp/train_sliced_no_neg/train_300_02_1k.json'), 'val': ('determined-ai-xview-coco-dataset/train_sliced_no_neg/train_images_300_02/', '/tmp/val_sliced_no_neg/val_300_02_1k.json')}
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
self.catIdtoCls:  {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 32, 32: 33, 33: 34, 34: 35, 35: 36, 36: 37, 37: 38, 38: 39, 39: 40, 40: 41, 41: 42, 42: 43, 43: 44, 44: 45, 45: 46, 46: 47, 47: 48, 48: 49, 49: 50, 50: 51, 51: 52, 52: 53, 53: 54, 54: 55, 55: 56, 56: 57, 57: 58, 58: 59, 59: 60}
--num_classes:  61
PATHS:  {'train': ('determined-ai-xview-coco-dataset/val_sliced_no_neg/val_images_300_02/', '/tmp/train_sliced_no_neg/train_300_

In [16]:
print("Start training")
start_time = time.time()

losses, model = train_and_eval(model,data_loader,data_loader_test,optimizer,scheduler,device,cpu_device,epochs=2)

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print(f"Training time {total_time_str}")

Start training
Epochs: 2
Epoch #: 0


100%|██████████| 125/125 [08:15<00:00,  3.96s/it, loss=['loss: 0.027', 'loss_classifier: 0.016', 'loss_box_reg: 0.008', 'loss_objectness: 0.002', 'loss_rpn_box_reg: 0.000']]
100%|██████████| 125/125 [03:43<00:00,  1.79s/it]

Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.231
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.802
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.161
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.231
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.260
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.380
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.380
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area


100%|██████████| 125/125 [08:16<00:00,  3.97s/it, loss=['loss: 0.056', 'loss_classifier: 0.032', 'loss_box_reg: 0.018', 'loss_objectness: 0.003', 'loss_rpn_box_reg: 0.002']]

Training time 0:20:17





<font size="4">We might also roll our own simple hyperparameter tuning:</font>

In [None]:
def hp_grid_search():
    for lr in np.logspace(-4, -2, num=10):
        for m in np.linspace(0.7, 0.95, num=10):
            print(f"Training model with learning rate {lr} and momentum {m}")
            define_exp(lr=lr,momentum=m,epochs=1)


try:
    hp_grid_search()
except KeyboardInterrupt:
    pass

# What's Missing?

<font size="4">This approach works in theory -- we could get a good model, save it, and use it for predictions.  But we're missing a lot from the ideal state:</font>
<font size="4">
<ul style="margin-top: 15px">
  <li style="margin-bottom: 10px">Distributed training</li>
  <li style="margin-bottom: 10px">Parallel search</li>
  <li style="margin-bottom: 10px">Intelligent checkpointing</li>
  <li style="margin-bottom: 10px">Interruptibility and fault tolerance</li>
  <li                            >Logging of experiment configurations and results </li>
</ul>
</font>

<font size=6><b>Scaled Experimentation with Determined</b></font>

With less work than setting up a limited random search, you can get started with Determined.

## Our First Experiment

For our first example, we run a simple single-GPU training job with fixed hyperparameters.

<img src="https://raw.githubusercontent.com/determined-ai/public_assets/main/images/StartAnExperiment.png" align=left width=330/>

In [18]:
!det e create const-distributed.yaml .

Preparing files (/run/determined/workdir/xview-torchvision-coco) to send to master... 23.1KB and 6 files 
Traceback (most recent call last):
  File "/run/determined/pythonuserbase/lib/python3.8/site-packages/determined/cli/cli.py", line 261, in main
    parsed_args.func(parsed_args)
  File "/run/determined/pythonuserbase/lib/python3.8/site-packages/determined/cli/experiment.py", line 204, in create
    submit_experiment(args)
  File "/run/determined/pythonuserbase/lib/python3.8/site-packages/determined/common/api/authentication.py", line 403, in f
    return func(namespace)
  File "/run/determined/pythonuserbase/lib/python3.8/site-packages/determined/cli/experiment.py", line 136, in submit_experiment
    model_context = context.read_legacy_context(args.model_def)
  File "/run/determined/pythonuserbase/lib/python3.8/site-packages/determined/common/context.py", line 171, in read_legacy_context
    return [v1File_to_dict(f) for f in read_v1_context(local_path, limit)]
  File "/run/determi

And evaluate its performance:

In [30]:
experiment_id = 663


In [32]:
checkpoint = Determined().get_experiment(experiment_id).top_checkpoint()
model = checkpoint.load().model

Let's see the prediction of the trained model

In [27]:
images_t, targets_t= list(data_loader_test)[0]
predict(model,images_t)

AssertionError: targets should not be none when in training mode

In [None]:
# predict(model, 'test.jpg', 0.5)

## Scaling up to Distributed Training

Determined makes it trivial to move from single-GPU to multi-GPU (and even multi-node) training. Here we'll simply modify the config above to request 8 GPUs instead of 1, and increase the global batch size to increase the data throughput 

In [19]:
!cat const-distributed.yaml

# name: resnet_fpn_fcos_coco_dist_warmup_2_agents
name: resnet_fpn_fcos_xview_dist_warmup
profiling:
 enabled: true
 begin_on_batch: 0
 end_after_batch: null
hyperparameters:
    # These settings match that for the 150 epoch run provided in the original repo:
    #   https://github.com/facebookresearch/detr
    lr: 0.01
    # lr: 0.02
    momentum: 0.9
    # global_batch_size: 32
    global_batch_size: 16
    weight_decay: 1.0e-4
    gamma: 0.1
    warmup: linear
    warmup_iters: 1000
    warmup_ratio: 0.001
    step1: 177429 # 3 epochs: 3*59143 == 177429
    step2: 236572 # 4 epochs: 4*59143 == 236572
    # step1: 946288 # 16 epochs: 16*59143 == 946,288
    # step2: 1301146 # 22 epochs: 22*59143 == 1,301,146
    # model: mv3_fcos
    model: resnet_fcos
    # Dataset
    dataset_file: coco
    backend: aws # specifiy the backend you want to use.  one of: gcs, aws, fake, local
    data_dir: determined-ai-coco-dataset # bucket name if using gcs or aws, otherwise directory to dataset
   

In [None]:
!det experiment create distributed.yaml .

<img src="https://raw.githubusercontent.com/determined-ai/public_assets/main/images/4GPUexperiment.png" align=left width=530 />

## Run Distributed Hyperparameter Tuning

By simply building a config file and adapting our code to meet the determined trial interface, we can conduct a sophisticated hyperparamter search.  Instructions for how to configure different types of experiments [can be found in the Determined documentation.](https://docs.determined.ai/latest/how-to/index.html)

In [None]:
!cat search.yaml

## Create your Experiment

Now that you've described your experiment, you'll simply need to use the command line interface to submit it to the Determined Cluster.  

In [None]:
!det experiment create search.yaml .

<img src="https://raw.githubusercontent.com/determined-ai/public_assets/main/images/12GPUexperiment.png" align=left width=800 />

# Model Registry

After training, we'll want to actually use our model in some sort of system.  Determined provides a model registry to version your trained models, making them easy to retrieve for inference.

In [None]:
experiment_id = <Enter Experiment ID>
MODEL_NAME = "pedestrian-detection"

In [None]:
# Get the best checkpoint from the training
checkpoint = Determined().get_experiment(experiment_id).top_checkpoint()

In [None]:
model = check_model(MODEL_NAME)

In [None]:
model.register_version(checkpoint.uuid)

# Inference

Once your model is versioned in the model registry, using that model for inference is straightforward:

In [None]:
# Retrieve latest checkpoint for a given model name
latest_version = model.get_version()

In [None]:
# Load the model checkpoint into memory
inference_model = latest_version.checkpoint.load().model

In [None]:
# Run inference as before
predict(inference_model, 'test.jpg')