In [None]:
# Prerequisites
# https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects
#
# Run from the directory that contains container/. Every path is anchored on
# data_dir, so no step depends on the working directory left by a previous one
# (chained relative `cd container/...` calls cannot resolve after the first).
data_dir=container/local_test/test_dir/input/data

# Training images and their YOLO label files
cp ~/SageMaker/myAWSStudyBlog/yolo/5-COCO-to-Yolo/train/*.* "${data_dir}/train/"

# Pre-trained convolutional weights used as the starting point for training
wget -P "${data_dir}/weights" \
    https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137

# Start the custom network config from the yolov4-custom template
cp "${data_dir}/cfg/yolov4-custom.cfg" "${data_dir}/cfg/yolo-obj.cfg"

## Build the Docker image locally

In [None]:
# Log in to the registry that hosts the base image referenced by the Dockerfile.
# `aws ecr get-login` is deprecated (removed in AWS CLI v2) and puts the password
# on the docker command line; piping get-login-password into --password-stdin
# avoids both. The region matches the registry host used in the Dockerfile's FROM.
aws ecr get-login-password --region us-west-2 \
    | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# Build the image from the container/ directory and tag it `yolo`
cd container
docker build -t yolo .

# ## cpu to debug the train
# docker run -it yolo:latest /bin/bash

# ## if gpu
# nvidia-docker run -it yolo:latest /bin/bash

In [None]:
## local test
## Mount test_dir as /opt/ml and open a shell in the container.
## The volume spec is quoted so a working directory containing spaces still works.
cd local_test
nvidia-docker run -v "$(pwd)/test_dir:/opt/ml" -it yolo:latest /bin/bash

## Inside the container, from /opt/ml/code:
## create symbolic links (mapping data and model into /opt/ml/code).
## -f/-n replace an existing link instead of nesting a new one inside its
## target, so the commands can be re-run safely.
ln -sfn /opt/ml/input/data/ data
ln -sfn /opt/ml/model/ model

```
root@d6a5f8ef55c3:/opt/ml/code# ls data/
cfg  names  train  valid  weights
```

In [None]:
# Launch training:
#   darknet detector train <data file> <network cfg> <initial weights>
darknet detector train \
    data/cfg/obj.data \
    data/cfg/yolo-obj.cfg \
    data/weights/yolov4.conv.137

# Debug

In [None]:
## Darknet was compiled without libopencv (see the log below), so set mosaic=0 first

root@199b19ff7907:/opt/ml/code# darknet detector train data/cfg/obj.data data/cfg/yolo-obj.cfg data/weights/yolov4.conv.137
 CUDA-version: 10000 (10020), GPU count: 1
 OpenCV isn't used - data augmentation will be slow
valid: Using default 'data/train/train.txt'
yolo-obj
 0 : compute_capability = 700, cudnn_half = 0, GPU: Tesla V100-SXM2-16GB
net.optimized_memory = 0

......
 608 x 608
 Create 64 permanent cpu-threads

 mosaic=1 - compile Darknet with OpenCV for using mosaic=1

In [None]:
## training

1143: 0.486333, 0.441965 avg loss, 0.001000 rate, 5.488336 seconds, 73152 images, 0.946526 hours left
1151: 0.402091, 0.425088 avg loss, 0.001000 rate, 5.053686 seconds, 73664 images, 0.974063 hours left
1217: 0.260138, 0.296006 avg loss, 0.001000 rate, 4.625333 seconds, 77888 images, 0.960187 hours left
2000: 0.125233, 0.148810 avg loss, 0.000010 rate, 5.519563 seconds, 128000 images, 0.108507 hours left

## Build and push to ECR

In [1]:
!cat container/Dockerfile

# SageMaker TensorFlow training image (GPU, CUDA 10.0, Ubuntu 18.04) used as the base
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.15.2-gpu-py36-cu100-ubuntu18.04

# Put the SageMaker code directory on PATH
ENV PATH="/opt/ml/code:${PATH}"

## Replace the packaged cmake with 3.17 and install OpenCV
# apt-get with -y: `apt` is meant for interactive use and would stop at the
# confirmation prompt during a non-interactive build.
RUN apt-get remove -y cmake
# NOTE(review): python3-opencv presumably does not ship the OpenCV development
# files; the training log reports "OpenCV isn't used", so darknet's build likely
# needs libopencv-dev to detect OpenCV - confirm before relying on mosaic=1.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    python3-opencv \
 && rm -rf /var/lib/apt/lists/*
# -f makes curl fail on an HTTP error instead of piping an error page into tar
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.17.2/cmake-3.17.2-Linux-x86_64.tar.gz | tar -xzC /opt \
 && mv /opt/cmake-3.17.2-Linux-x86_64 /opt/cmake \
 && ln -sf /opt/cmake/bin/cmake /usr/bin/cmake

## fix /usr/local/cuda-10.0/compat/libcuda.so
# Register the CUDA compat directory with the dynamic linker and refresh its cache
RUN echo "/usr/local/cuda-10.0/compat" > /etc/ld.so.conf.d/cuda.conf \
 && ldconfig

## Build darknet
# Clone to an absolute path so the step does not depend on the image's WORKDIR
RUN git clone https://github.com/AlexeyAB/darknet /opt/darknet \
 && cd /opt/darknet \
 && ./build.sh

# /opt/ml and all subdirectories are utilized by SageMaker, we use the /code subdirectory to store our user code.
#COPY /darknet /opt/ml/code

# this environment variable is

In [2]:
!cat ./build-and-push.sh

#!/bin/bash
# Prepare ECR for the algorithm image: resolve the account and region, make sure
# the target repository exists, and log Docker in to both our own registry and
# the registry that hosts the base image.

# The name of our algorithm
algorithm_name=yolo-tensorflow

# Everything below assumes the container/ build context; stop if it is missing
cd container || exit 1

# The account id is part of every registry host name, so it must resolve
account=$(aws sts get-caller-identity --query Account --output text) || {
    echo "Unable to determine the AWS account id" >&2
    exit 1
}

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Log in to our own ECR registry. `aws ecr get-login` is deprecated (removed in
# AWS CLI v2); get-login-password piped to --password-stdin replaces it and keeps
# the password off the command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${account}.dkr.ecr.${region}.amazonaws.com"

# Log in to the registry that hosts the SageMaker TensorFlow base image so it can be pulled.
# NOTE(review): the Dockerfile's FROM line hard-codes us-west-2; if ${region}
# differs, this login will not cover that registry host - confirm.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "763104351884.dkr.ecr.${region}.amazonaws.com"

# Build the docker image locally with the 

# SageMaker PyTorch Container

https://github.com/aws/sagemaker-pytorch-container/tree/master