# Face detection application using the FaceDetect purpose built model from NVIDIA GPU Cloud (NGC): client app

Load the dependencies.

In [None]:
import cv2
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

## Capturing a test image

Provide an image name.

In [None]:
# File name under which the captured frame is written by cv2.imwrite and from
# which it is later read back for pre-processing and for drawing the results.
image_name = "capture.jpg"

Get an image from a webcam. A window with the camera feed will appear; press the 's' key on your keyboard to capture an image of yourself.

In [None]:
# Open webcam device 0 and show a live preview until the user presses 's';
# the frame on screen at that moment is written to `image_name` and stays
# available in `capture` for the following cells.
camera = cv2.VideoCapture(0)
try:
    if not camera.isOpened():
        raise RuntimeError("Could not open the webcam (device 0)")
    while True:
        return_value, capture = camera.read()
        if not return_value:
            # A failed read yields no usable frame; passing it on to imshow
            # would only fail with a far less helpful error.
            raise RuntimeError("Failed to read a frame from the webcam")
        cv2.imshow('capture', capture)
        # Only the low byte of the key code is compared against 's'
        if cv2.waitKey(1) & 0xFF == ord('s'):
            if not cv2.imwrite(image_name, capture):
                raise RuntimeError("Failed to write the captured frame to " + image_name)
            break
finally:
    # Always free the device and close the preview window, even on errors
    camera.release()
    cv2.destroyAllWindows()

A function to display your image.

In [None]:
def display_image(img):
    """Display an OpenCV image with matplotlib, without axes or margins.

    The figure is sized so that, at 80 dpi, one image pixel corresponds to one
    figure pixel.

    Args:
        img: Image array in OpenCV layout: either 3-D (height, width,
            channels) with BGR channel order, or 2-D (height, width)
            grayscale.
    """
    dpi = 80
    # Only the first two dimensions are needed, so 2-D grayscale input works too
    height, width = img.shape[:2]
    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)
    # matplotlib expects RGB, whereas OpenCV stores colour images as BGR
    conversion = cv2.COLOR_GRAY2RGB if img.ndim == 2 else cv2.COLOR_BGR2RGB
    # Create the figure with the same dpi that was used to compute figsize,
    # with one axes that takes up the full figure
    fig = plt.figure(figsize=figsize, dpi=dpi)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis("off")
    ax.imshow(cv2.cvtColor(img, conversion))
    plt.show()

In [None]:
# Show the frame grabbed from the webcam and print the shape of its array
display_image(capture)
print(capture.shape)

If you do not have a web camera, you can take any JPG image of a person, put it in the `client` directory of this project, and rename it to `capture.jpg`. Note that the preprocessing function of this demo has been designed assuming that the image width is larger than the image height (it works best with horizontal images).

## Image pre-processing

The following code will load and modify your original image to make it compatible with the network. The FaceDetect model expects images with width = 736 and height = 416. This information is available in the corresponding [model card](https://ngc.nvidia.com/catalog/models/nvidia:tlt_facenet).

In [None]:
def im_resize_and_pad(image, w, h):
    """Scale a PIL image to height `h` and centre it on a `w` x `h` RGB canvas.

    The aspect ratio is preserved. The scaled image is pasted horizontally
    centred: if it is narrower than `w` the canvas shows on both sides, and if
    it is wider the paste offset is negative, so the excess is cut off.
    This example assumes that image width > image height.

    Args:
        image: Source PIL image.
        w: Target canvas width in pixels.
        h: Target canvas height in pixels.

    Returns:
        A tuple `(canvas, ratio, scaled_width)`: the new RGB image, the scale
        factor applied to the source, and the width of the scaled image
        before padding/cropping (needed later to map coordinates back to the
        original image).
    """
    source_width, source_height = image.size[0], image.size[1]

    # Scale factor that brings the source height exactly to the target height
    ratio = h / source_height
    scaled = image.resize((int(source_width * ratio), h), Image.BILINEAR)
    scaled_width = scaled.size[0]

    # Centre horizontally; the vertical offset is 0 because heights match
    canvas = Image.new("RGB", (w, h))
    left_offset = (w - scaled_width) // 2
    canvas.paste(scaled, (left_offset, 0))

    return canvas, ratio, scaled_width


def process_image(image_name, w, h):
    """Load an image file and turn it into a network input tensor.

    Args:
        image_name: Path of the image file to load.
        w: Network input width in pixels.
        h: Network input height in pixels.

    Returns:
        A tuple `(im, ratio, temp_width_cache)` where `im` is a float32 array
        of shape (1, 3, h, w) with values scaled to [0.0, 1.0], and `ratio`
        and `temp_width_cache` are the scale factor and scaled width reported
        by `im_resize_and_pad`.
    """
    # Use a context manager so the underlying file is closed once the pixels
    # have been read; the resized copy does not depend on the open file.
    with Image.open(image_name) as img:
        # resize image keeping proportions
        image_resized, ratio, temp_width_cache = im_resize_and_pad(img, w, h)
    im = np.array(image_resized, dtype="float32")
    # (height, width, channels) -> (channels, height, width)
    im = np.moveaxis(im, 2, 0)
    # Add the leading batch dimension
    im = np.expand_dims(im, axis=0)
    # Normalize to [0.0, 1.0] interval (expected by model)
    im = (1.0 / 255.0) * im
    return im, ratio, temp_width_cache


# Input resolution expected by the FaceDetect model (width x height)
model_w, model_h = 736, 416

# Pre-process the captured image; `ratio` and `temp_width_cache` are kept so
# detections can be mapped back onto the original image later.
im, ratio, temp_width_cache = process_image(image_name, model_w, model_h)

# Report the tensor shape, the scale factor and the scaled width, one per line
for reported_value in (im.shape, ratio, temp_width_cache):
    print(reported_value)

Batched input (we will be using the same image to create a batch just for the purpose of performance demonstration). Note that the batch size value should not be bigger than the max value you specified in parameter `-m` when converting the model with `tlt-converter`.

In [None]:
# Number of copies of the single pre-processed image to send in one request
batch_size = 32

# Stack `batch_size` copies of the image tensor along the leading (batch) axis
im_batch = np.concatenate([im for _ in range(batch_size)], axis=0)
print(im_batch.shape)

## Executing inference

In [None]:
import sys
import tritonclient.grpc as tritongrpcclient

# gRPC endpoint of the Triton server and the name of the deployed model
url = 'localhost:8001'
model_name = 'facedetect_tlt'

try:
    triton_client = tritongrpcclient.InferenceServerClient(
        url=url,
        verbose=False
    )
except Exception as e:
    print("channel creation failed: " + str(e))
    # Exit with a non-zero status so the failure is visible to the caller
    sys.exit(1)

# Single FP32 input tensor named 'input_1' holding the whole image batch
input_tensor = tritongrpcclient.InferInput('input_1', im_batch.shape, "FP32")
input_tensor.set_data_from_numpy(im_batch)
inputs = [input_tensor]

# Request both model outputs: the box coordinates and the coverage scores
outputs = [
    tritongrpcclient.InferRequestedOutput('output_bbox/BiasAdd'),
    tritongrpcclient.InferRequestedOutput('output_cov/Sigmoid'),
]

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

## Parsing outputs and post-processing

Extract results as numpy arrays.

In [None]:
# Fetch each requested output from the inference result by its tensor name.
# Both arrays are indexed below as [batch index, channel, grid row, grid col].
bboxes = results.as_numpy('output_bbox/BiasAdd')
print(bboxes.shape)
scores = results.as_numpy('output_cov/Sigmoid')
print(scores.shape)

Set model-related parameters. Box scale and offset are parameters which were used to train the model with Transfer Learning Toolkit. These values are also listed in the [model card](https://ngc.nvidia.com/catalog/models/nvidia:tlt_facenet).

In [None]:
# Scale and offset applied to the raw box outputs when decoding them into
# pixel coordinates (training-time values, see the model card)
box_scale = 35.0   
box_offset = 0.5  

# Size of the output grid; with the 736 x 416 model input this gives
# 16 x 16 pixel cells (416 / 26 and 736 / 46)
grid_h = 26
grid_w = 46

# Threshold for detection score
threshold = 0.5

Parse the list of detected bounding boxes and translate the values into the coordinate system of the pre-processed image. Note that we are not going through the whole batch, since we just used multiple copies of the same image. You can set any value of `batch_idx` from the interval `[0, batch_size)`.

In [None]:
# Size in pixels of one output grid cell in the pre-processed image
cell_width = model_w / grid_w
cell_height = model_h / grid_h

# Decoded boxes as [xmin, ymin, xmax, ymax] and their matching scores
bboxes_list = []
scores_list = []

# Which copy of the image inside the batch to decode
batch_idx = 15

for row in range(grid_h):
    for col in range(grid_w):
        score = scores[batch_idx, 0, row, col]
        if not (score > threshold):
            # Cell is not confident enough to contribute a detection
            continue

        # Pixel position of this grid cell in the pre-processed image
        mx = col * cell_width
        my = row * cell_height

        # Channels 0/1 are subtracted from the cell position to get the
        # top-left corner; channels 2/3 are added to get the bottom-right one
        left = - (bboxes[batch_idx, 0, row, col] + box_offset) * box_scale + mx
        top = - (bboxes[batch_idx, 1, row, col] + box_offset) * box_scale + my
        right = (bboxes[batch_idx, 2, row, col] + box_offset) * box_scale + mx
        bottom = (bboxes[batch_idx, 3, row, col] + box_offset) * box_scale + my

        bboxes_list.append([int(left), int(top), int(right), int(bottom)])
        scores_list.append(float(score))

Apply non-max suppression to the list of all detected bounding boxes.

In [None]:
# IoU threshold above which overlapping boxes are suppressed
nms_threshold = 0.5

# cv2.dnn.NMSBoxes expects boxes as [x, y, width, height], whereas bboxes_list
# stores corner coordinates [xmin, ymin, xmax, ymax]; convert before calling it.
# The returned indexes still refer to positions in bboxes_list.
nms_boxes = [[x1, y1, x2 - x1, y2 - y1] for x1, y1, x2, y2 in bboxes_list]
indexes = cv2.dnn.NMSBoxes(nms_boxes, scores_list, threshold, nms_threshold)
print(indexes)

Show the detected bounding box in the original image.

In [None]:
img = cv2.imread(image_name)
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None
    raise OSError("Could not read image: " + image_name)

# Horizontal padding (negative when cropped) added on the left during
# pre-processing; it has to be removed before undoing the scaling.
x_pad = (model_w - temp_width_cache) / 2
color = [0, 255, 0]

# Depending on the OpenCV version the indexes come back as a flat sequence,
# an N x 1 array, or an empty tuple; flatten so every case iterates as scalars.
for idx in np.array(indexes).reshape(-1):
    idx = int(idx)
    xmin, ymin, xmax, ymax = bboxes_list[idx]
    # Map from pre-processed image coordinates back to the original image
    xmin = int((xmin - x_pad) / ratio)
    xmax = int((xmax - x_pad) / ratio)
    ymin = int(ymin / ratio)
    ymax = int(ymax / ratio)
    cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 4)

display_image(img)