# EfficientDet Training On ASL Dataset
---
 
## Set up on Nvidia Jetson NX

- Use the inference Docker image, which is built from `Dockerfile.yolov5` on top of the base image `nvcr.io/nvidia/l4t-pytorch:r32.6.1-pth1.9-py3`.
- Build the Docker image:

```bash
docker build -t yolov5 -f Dockerfile.yolov5 . 
```
- Run the container, mounting the data under the `/app/` folder:

```bash
docker run -ti --rm --runtime nvidia  --device /dev/video0 --network host --privileged -e DISPLAY=$DISPLAY -v /data/w251:/app/w251 yolov5
```
- Run the following line before spinning up Jupyter; it sets `LD_PRELOAD` so that `libgomp` is preloaded:

```bash
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
```

### 0. Install Requirements & Data

In [None]:
!pwd

In [None]:
## Download the annotated data from Roboflow as roboflow.zip
## (this cell only downloads; the archive is unzipped in a later cell).
## NOTE(review): the URL embeds an access key — confirm it is meant to be public.
!curl -L "https://app.roboflow.com/ds/3ZcnERAu61?key=8mQAZkmZt5" -o roboflow.zip

In [None]:
!ls

In [None]:
!mkdir -p Yet-Another-EfficientDet-Pytorch/datasets

In [None]:
!mkdir -p Yet-Another-EfficientDet-Pytorch/datasets/asl

In [None]:
!unzip roboflow.zip -d Yet-Another-EfficientDet-Pytorch/datasets/asl

In [None]:
!rm roboflow.zip

In [None]:
!ls

In [None]:
import os
import sys

# Move into the cloned repository so that its modules (added to sys.path via
# '.') and its relative paths (datasets/, logs/, projects/) resolve.
#
# The original guard only looked for "projects" in the current path. After a
# successful chdir the path ends in the repository name and normally does not
# contain "projects", so re-running the cell attempted a second chdir and
# raised FileNotFoundError. The extra basename check makes the cell re-runnable
# while keeping the original "projects" escape hatch.
repo_dir = 'Yet-Another-EfficientDet-Pytorch'
current_dir = os.getcwd()

if "projects" not in current_dir and os.path.basename(current_dir) != repo_dir:
    os.chdir(repo_dir)
    sys.path.append('.')

In [None]:
!pwd

In [None]:
# Re-arrange the unzipped Roboflow export:
#   * rename the `valid` split folder to `val`
#   * collect each split's `_annotations.coco.json` under datasets/asl/annotations
#     as instances_<split>.json
# NOTE(review): presumably this is the layout the EfficientDet repo's loaders
# expect — confirm against projects/asl.yml. These commands are not re-runnable
# as written (mkdir has no -p and the sources are moved away).
!mv datasets/asl/valid datasets/asl/val
!mkdir datasets/asl/annotations
!mv datasets/asl/train/_annotations.coco.json datasets/asl/annotations/instances_train.json
!mv datasets/asl/test/_annotations.coco.json datasets/asl/annotations/instances_test.json
!mv datasets/asl/val/_annotations.coco.json datasets/asl/annotations/instances_val.json


## 1. Prepare Custom Dataset/Pretrained Weights

In [None]:
# Create the folder for pretrained weights.
# NOTE: the download below is commented out, so this cell leaves weights/ empty
# unless the file is fetched some other way.
! mkdir weights
#! wget https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch/releases/download/1.0/efficientdet-d1.pth -O weights/efficientdet-d1.pth

# The project file projects/asl.yml must already exist;
# print its contents here for reference.
! cat projects/asl.yml

In [None]:
import torch
# Release cached GPU memory held by PyTorch's allocator before running inference.
torch.cuda.empty_cache()
# Last expression of the cell: the notebook displays whether CUDA is usable.
torch.cuda.is_available()

## 2. Evaluate & Inference on Nvidia Jetson NX

In [None]:
# Get the latest weight file.
# `ls -Art` lists the entries of logs/asl sorted by modification time with the
# order reversed (oldest first, newest last); `grep` keeps only names containing
# "efficientdet". IPython captures the output lines into `weight_file`, so
# weight_file[-1] is the most recently modified match (bare file name, without
# the logs/asl/ prefix).
%cd logs/asl
weight_file = !ls -Art | grep efficientdet
# Return to the repository root so later relative paths keep working.
%cd ../..

print(weight_file)

In [None]:
!ls

In [None]:
print(weight_file[-1])

In [None]:
# Evaluate with FP32, using the newest checkpoint found in logs/asl.
# NOTE(review): -c is presumably the compound coefficient, -p the project name
# and -w the weights path — confirm against coco_eval.py's argument parser.
!python3 coco_eval.py -c 0 -p asl -w "logs/asl/{weight_file[-1]}"

### a. Inference with images

In [None]:
import torch
from torch.backends import cudnn

from backbone import EfficientDetBackbone
import cv2
import matplotlib.pyplot as plt
import numpy as np

from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import preprocess, invert_affine, postprocess

import pathlib

# Run the trained detector over (up to) 50 images from the test split, print
# every detection, draw the first detection of each image and save the result
# to test/asl_output_<n>.png.
#
# Requires `weight_file` from the "get latest weight file" cell.

# ---- configuration -------------------------------------------------------
compound_coef = 0
force_input_size = None  # set None to use default size

threshold = 0.2
iou_threshold = 0.2

use_cuda = True
use_float16 = False
cudnn.fastest = False
cudnn.benchmark = True

obj_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
            'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'four',
            'hello', 'help', 'one', 'right', 'thanks', 'three', 'two', 'zero']

# tf bilinear interpolation is different from any other's, just make do
# One entry per compound coefficient; kept identical to the list used by the
# live-camera cell (the original list here was one entry shorter).
input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size
print('input size:', input_size,'\n')

# ---- input images --------------------------------------------------------
imgdir_path = pathlib.Path('datasets/asl/test')
# Sorted so that the 50 selected images (and their output numbers) are the
# same on every run; glob() order is otherwise filesystem dependent.
img_paths = sorted(str(path) for path in imgdir_path.glob('*.jpg'))
print('Number of pictures in test folder:', len(img_paths))
img_paths = img_paths[:50]

# cv2.imwrite does not create directories and only returns False on failure,
# so make sure the output folder exists up front.
output_dir = pathlib.Path('test')
output_dir.mkdir(parents=True, exist_ok=True)

# ---- model (built once, not once per image) ------------------------------
model = EfficientDetBackbone(
    compound_coef=compound_coef,
    num_classes=len(obj_list),
    # replace this part with your project's anchor config
    ratios=[(1.0, 1.0), (1.3, 0.8), (1.9, 0.5)],
    scales=[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)],
)
# Load onto the CPU first so the checkpoint can be read regardless of the
# device it was saved from; the model is moved to the GPU below if requested.
model.load_state_dict(torch.load('logs/asl/' + weight_file[-1], map_location='cpu'))
model.requires_grad_(False)
model.eval()

if use_cuda:
    model = model.cuda()
if use_float16:
    model = model.half()

regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()

# ---- inference loop ------------------------------------------------------
for idx, img_path in enumerate(img_paths):
    print(img_path)
    ori_imgs, framed_imgs, framed_metas = preprocess(img_path, max_size=input_size)

    x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)
    if use_cuda:
        x = x.cuda()
    # NHWC -> NCHW, in the precision the model was converted to.
    x = x.to(torch.float16 if use_float16 else torch.float32).permute(0, 3, 1, 2)

    with torch.no_grad():
        features, regression, classification, anchors = model(x)

        out = postprocess(x,
                          anchors, regression, classification,
                          regressBoxes, clipBoxes,
                          threshold, iou_threshold)

    # Map boxes from the padded/resized network input back to the original image.
    out = invert_affine(framed_metas, out)

    for i, ori_img in enumerate(ori_imgs):
        pred = out[i]
        rois = pred['rois']
        if len(rois) == 0:
            continue

        # Draw on a copy so the array returned by preprocess stays untouched.
        annotated = ori_img.copy()
        ori_imgs[i] = annotated

        objs = [obj_list[class_id] for class_id in pred['class_ids']]
        scores = [float(score) for score in pred['scores']]
        for obj, score in zip(objs, scores):
            print('{}, {:.3f}'.format(obj, score))

        # Only the first detection is drawn.
        # NOTE(review): presumably postprocess returns detections sorted by
        # score, making this the most confident one — confirm in utils.utils.
        # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent.
        x1, y1, x2, y2 = rois[0].astype(int).tolist()
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (255, 255, 0), 1)

        cv2.putText(annotated, '{}, {:.3f}'.format(objs[0], scores[0]),
                    (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255, 255, 0), 1)

        # The same array is written with cv2.imwrite, which expects BGR channel
        # order, so convert to RGB for matplotlib.
        # NOTE(review): assumes preprocess loads images with OpenCV (BGR) — confirm.
        plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
        # image saving
        cv2.imwrite('test/asl_output_{}.png'.format(idx+1), annotated)

### b. Inference with Live Camera

In [None]:
import time
import torch
import cv2
import numpy as np
from torch.backends import cudnn
from backbone import EfficientDetBackbone
from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import preprocess, invert_affine, postprocess, preprocess_video

# Video source.
# A GStreamer pipeline is used so the frame rate can be fixed: it reads
# /dev/video0 through v4l2src at 30 fps and hands BGR frames to OpenCV (appsink).
video_src ='v4l2src device=/dev/video0 ! video/x-raw,framerate=30/1 ! videoconvert ! video/x-raw, format=BGR ! appsink'
# video_src = 0  # alternative: open the first camera by index

compound_coef = 0
force_input_size = None  # set None to use default size

threshold = 0.2
iou_threshold = 0.2

use_cuda = True
use_float16 = False
cudnn.fastest = True
cudnn.benchmark = True

obj_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
            'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'four',
            'hello', 'help', 'one', 'right', 'thanks', 'three', 'two', 'zero']

# tf bilinear interpolation is different from any other's, just make do
# One entry per compound coefficient.
input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size

# load model
# The anchor ratios/scales are passed explicitly so this cell decodes boxes
# with the same anchor configuration as the image-inference cell, which loads
# the same checkpoint. The original call here fell back to the backbone's
# default anchors.
# NOTE(review): these values must match the anchors in projects/asl.yml that
# the checkpoint was trained with — confirm.
model = EfficientDetBackbone(
    compound_coef=compound_coef,
    num_classes=len(obj_list),
    ratios=[(1.0, 1.0), (1.3, 0.8), (1.9, 0.5)],
    scales=[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)],
)
# Requires `weight_file` from the "get latest weight file" cell. Load onto the
# CPU first so the checkpoint can be read regardless of the device it was
# saved from; the model is moved to the GPU below if requested.
model.load_state_dict(torch.load('logs/asl/' + weight_file[-1], map_location='cpu'))
model.requires_grad_(False)
model.eval()

if use_cuda:
    model = model.cuda()
if use_float16:
    model = model.half()

# function for display
def display(preds, imgs):
    """Draw the detections of the first frame onto it and return that frame.

    Only the first image/prediction pair is handled, matching how the capture
    loop calls this function (one frame at a time); the original loop also
    returned during its first iteration.

    Args:
        preds: sequence of per-image prediction dicts with the keys 'rois'
            (boxes as x1, y1, x2, y2), 'class_ids' (indices into the
            module-level `obj_list`) and 'scores'.
        imgs: sequence of images; the first one is annotated in place.

    Returns:
        The first image, annotated when it has detections and unchanged
        otherwise, or None when `imgs` is empty.
    """
    if len(imgs) == 0:
        return None

    frame = imgs[0]
    pred = preds[0]

    for roi, class_id, score in zip(pred['rois'], pred['class_ids'], pred['scores']):
        # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent.
        x1, y1, x2, y2 = roi.astype(int).tolist()
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 1)

        # Label each box with its own class and score. The original always
        # wrote the first detection's label on every box.
        label = '{}, {:.3f}'.format(obj_list[class_id], float(score))
        cv2.putText(frame, label,
                    (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255, 255, 0), 1)

    return frame
# Box decoding / clipping helpers (stateless, shared by every frame).
regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()

# Video capture
cap = cv2.VideoCapture(video_src)
if not cap.isOpened():
    # The loop below would otherwise end silently on the first failed read.
    print('Could not open video source:', video_src)

# try/finally guarantees that the camera and the preview window are released
# even when the cell is interrupted (the usual way to stop it in a notebook)
# or when a frame raises; otherwise the device stays locked until the kernel
# restarts.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # frame preprocessing
        ori_imgs, framed_imgs, framed_metas = preprocess_video(frame, max_size=input_size)

        x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)
        if use_cuda:
            x = x.cuda()
        # NHWC -> NCHW, in the precision the model was converted to.
        x = x.to(torch.float16 if use_float16 else torch.float32).permute(0, 3, 1, 2)

        # model predict
        with torch.no_grad():
            features, regression, classification, anchors = model(x)

            out = postprocess(x,
                              anchors, regression, classification,
                              regressBoxes, clipBoxes,
                              threshold, iou_threshold)

        # result: map boxes back to the original frame and draw them
        out = invert_affine(framed_metas, out)
        img_show = display(out, ori_imgs)

        # show frame by frame; press 'q' in the window to stop
        cv2.imshow('frame', img_show)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()