### Real-time keypoint detection using webcam

In [2]:
import IPython.display
import torch
from io import BytesIO as StringIO
import PIL.Image
import time
from IPython.display import clear_output
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time

%matplotlib inline

In [3]:
from models import Net

# Side length (in pixels) of the square grayscale image fed to the network
img_size = 224

net = Net()

model_dir = "models"
model_name = "keypoints_model.pt"

# map_location="cpu" remaps any CUDA tensors stored in the checkpoint, so the
# weights also load on a machine without a GPU (inference below runs on CPU).
net.load_state_dict(
    torch.load(os.path.join(model_dir, model_name), map_location="cpu")
)

# Switch dropout / batch-norm layers to inference behaviour.
# As the last expression of the cell this also prints the network.
net.eval()

Net(
  (conv1): Conv2d(1, 32, kernel_size=(4, 4), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.1, inplace=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (conv3): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout3): Dropout(p=0.3, inplace=False)
  (conv4): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
  (bn4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 

In [4]:
# Load a Haar cascade classifier for detecting frontal faces
cascade_path = "detector_architectures/haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(cascade_path)

# CascadeClassifier does not raise when the file is missing or unreadable; it
# just stays empty and detectMultiScale fails later. Fail here with a clear message.
if face_cascade.empty():
    raise RuntimeError("Could not load Haar cascade from '%s'" % cascade_path)

In [5]:
def detect_faces(img):
    """Return a copy of ``img`` with a rectangle drawn around each detected face.

    Faces are found with the module-level ``face_cascade``
    (scaleFactor=1.2, minNeighbors=2). The input image is left untouched.
    """
    detections = face_cascade.detectMultiScale(img, 1.2, 2)

    # Draw on a copy so the caller's image is not modified
    annotated = img.copy()

    box_color = (255, 0, 0)
    # Fixed line width; it is not adapted to the image resolution
    box_thickness = 3

    for left, top, width, height in detections:
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(annotated, top_left, bottom_right, box_color, box_thickness)

    return annotated

In [6]:
# 'jpeg' is the default format because it is ~5 times faster than 'png'
def showarray(a, fmt="jpeg"):
    """Encode the image array ``a`` in memory as ``fmt`` and display it inline."""
    # StringIO is this notebook's alias for io.BytesIO (binary buffer)
    buffer = StringIO()

    picture = PIL.Image.fromarray(a)
    picture.save(buffer, fmt)

    encoded = buffer.getvalue()
    IPython.display.display(IPython.display.Image(data=encoded))

In [7]:
def get_frame(cam):
    """Grab one frame from ``cam`` and return it mirrored horizontally.

    Args:
        cam: capture object whose ``read()`` returns ``(success, frame)``,
            e.g. a ``cv2.VideoCapture``.

    Returns:
        The captured frame flipped around the vertical axis (mirror view).

    Raises:
        RuntimeError: if the capture reports failure or yields no frame
            (camera missing, busy, or disconnected).
    """
    # Capture frame-by-frame
    ok, frame = cam.read()

    # A failed read returns (False, None); passing None to cv2.flip would only
    # produce an obscure OpenCV assertion, so report the real cause instead.
    if not ok or frame is None:
        raise RuntimeError("Could not read a frame from the camera")

    # flip image for natural viewing
    return cv2.flip(frame, 1)

In [8]:
def _draw_face_mask(image, pts):
    """Draw the connected keypoint contours of one face onto ``image`` in place.

    ``pts`` holds 68 (x, y) points in image coordinates, indexed as
    ``pts[i][0]`` / ``pts[i][1]``.
    """
    # Last index of an open contour: no segment is drawn from it to index + 1
    open_ends = {16, 21, 26, 30, 35}
    # Closed contours: the last point connects back to the first one
    # (left eye, right eye, inner lips).
    # NOTE(review): index 59 is joined to 60 rather than back to 48 — confirm
    # whether the outer lip contour was meant to be closed instead.
    closing = {41: 36, 47: 42, 67: 60}
    # The colour changes at the first index of a group and stays until the next
    color_starts = {
        17: (0, 100, 0),  # left eyebrow
        22: (0, 100, 0),  # right eyebrow
        27: (255, 255, 0),  # nose stem
        31: (255, 255, 0),  # nose tip
        36: (0, 250, 154),  # left eye
        42: (0, 250, 154),  # right eye
        48: (255, 20, 147),  # lips
    }

    color = (0, 255, 0)
    for i in range(len(pts)):
        if i in open_ends or i >= 68:
            continue

        color = color_starts.get(i, color)
        j = closing.get(i, i + 1)

        pt1 = (int(pts[i][0]), int(pts[i][1]))
        pt2 = (int(pts[j][0]), int(pts[j][1]))
        cv2.line(image, pt1, pt2, color, thickness=5, lineType=8, shift=0)


def detect_keypoints(img, scale):
    """Detect faces in ``img`` and draw a keypoint "mask" over every one of them.

    Args:
        img: RGB image array (it is converted with ``cv2.COLOR_RGB2GRAY``).
        scale: number of pixels of padding to add around each detected face
            before it is passed to the network.

    Returns:
        ``img`` itself when no face is found, otherwise a copy of ``img`` with
        the keypoint contours of all detected faces drawn on it.
    """
    faces = face_cascade.detectMultiScale(img, 1.2, 2)

    if len(faces) == 0:
        return img

    image_copy = np.copy(img)

    # loop over the detected faces from the haar cascade
    for (x, y, w, h) in faces:

        # Don't pad outside of the frame: limit the padding to the distance to
        # the top / left border. Kept per face so that one face near the border
        # does not shrink the padding used for the others.
        pad = int(min(scale, x, y))

        # Select the region of interest from the untouched input, so lines
        # drawn for a previous face never end up in the network input.
        # (numpy truncates the slice at the bottom / right border.)
        roi = img[y - pad : y + h + pad, x - pad : x + w + pad]
        roi_h, roi_w = roi.shape[:2]

        # Convert the face region from RGB to grayscale and bring it to [0, 1]
        roi = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
        roi = roi / 255.0
        # Rescale to the square input size expected by the CNN
        roi = cv2.resize(roi, (img_size, img_size))

        # (H, W) -> (batch=1, channel=1, H, W) float tensor
        batch = torch.from_numpy(roi).type(torch.FloatTensor)
        batch = batch.unsqueeze(0).unsqueeze(0)

        # Forward pass; no gradients are needed for inference
        with torch.no_grad():
            output_pts = net(batch)

        # reshape to 68 x 2 pts
        output_pts = output_pts.view(68, -1).detach().numpy()

        # undo normalization of keypoints (coordinates in the resized crop)
        # NOTE(review): draw_raw_keypoints uses * 50 + 100 instead of
        # * (img_size / 4) + img_size / 2 — confirm which matches training.
        output_pts = output_pts * (img_size / 4) + img_size / 2

        # Map from the resized crop back to frame coordinates. The crop can be
        # non-square when truncated at the border, so x and y are scaled
        # independently (width for x, height for y).
        output_pts[:, 0] = x - pad + output_pts[:, 0] * (roi_w / img_size)
        output_pts[:, 1] = y - pad + output_pts[:, 1] * (roi_h / img_size)

        _draw_face_mask(image_copy, output_pts)

    # Return only after every face has been processed
    return image_copy

In [9]:
def draw_raw_keypoints(img, scale):
    """Detect faces in ``img`` and mark the predicted keypoints of each face.

    Args:
        img: RGB image array (it is converted with ``cv2.COLOR_RGB2GRAY``).
        scale: number of pixels of padding to add around each detected face
            before it is passed to the network.

    Returns:
        ``img`` itself when no face is found, otherwise a copy of ``img`` with
        a rectangle around every detected face and a cross marker on each of
        its 68 predicted keypoints.
    """
    faces = face_cascade.detectMultiScale(img, 1.2, 2)

    if len(faces) == 0:
        return img

    image_copy = np.copy(img)
    frame_h, frame_w = img.shape[:2]

    # loop over the detected faces from the haar cascade
    for (x, y, w, h) in faces:
        cv2.rectangle(image_copy, (x, y), (x + w, y + h), (0, 255, 0), 3)

        # Padded crop, clamped to the frame: a negative slice start would wrap
        # around to the other side of the array instead of stopping at 0.
        x0 = max(int(x) - scale, 0)
        y0 = max(int(y) - scale, 0)
        x1 = min(int(x) + int(w) + scale, frame_w)
        y1 = min(int(y) + int(h) + scale, frame_h)

        # Crop from the untouched input so the rectangle drawn above is not
        # part of what the network sees.
        roi = img[y0:y1, x0:x1]
        if roi.size == 0:
            continue
        roi_h, roi_w = roi.shape[:2]

        # Convert the face region from RGB to grayscale and bring it to [0, 1]
        roi = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
        roi = roi / 255.0

        # Rescale to the square input size expected by the CNN
        roi = cv2.resize(roi, (img_size, img_size))

        # (H, W) -> (batch=1, channel=1, H, W) float tensor
        batch = torch.from_numpy(roi.reshape(1, 1, img_size, img_size))
        batch = batch.type(torch.FloatTensor)

        # Forward pass; no gradients are needed for inference
        with torch.no_grad():
            prediction = net(batch)

        # reshape to 68 x 2 pts
        key_pts = prediction.detach().numpy()[0].reshape((68, 2))

        # undo normalization of keypoints (coordinates in the resized crop)
        # NOTE(review): detect_keypoints uses * (img_size / 4) + img_size / 2
        # instead of * 50 + 100 — confirm which matches training.
        key_pts = key_pts * 50.0 + 100.0

        # Map from the resized crop back to frame coordinates, using the same
        # origin that was used for cropping: width scales x, height scales y.
        key_pts[:, 0] = x0 + key_pts[:, 0] * (roi_w / img_size)
        key_pts[:, 1] = y0 + key_pts[:, 1] * (roi_h / img_size)

        for px, py in key_pts:
            cv2.drawMarker(
                image_copy,
                (int(px), int(py)),
                (255, 0, 0),
                markerSize=5,
                markerType=cv2.MARKER_CROSS,
            )

    # Return only after every face has been processed
    return image_copy

### Draw funny shapes on detected keypoints

In [8]:
# Stream the webcam with the keypoint mask drawn on every frame.
# Stop it with "Interrupt kernel".
cap = cv2.VideoCapture(0)
save_video = False

if save_video:
    # https://www.learnopencv.com/read-write-and-display-a-video-using-opencv-cpp-python/
    # The default frame resolution is system dependent; the capture reports it
    # as float, so convert to int.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Define the codec and create the VideoWriter object.
    # The output is stored in 'output.avi' at 4 frames per second.
    out = cv2.VideoWriter(
        "output.avi",
        cv2.VideoWriter_fourcc("M", "J", "P", "G"),
        4,
        (frame_width, frame_height),
    )

try:
    while True:
        # start_time = time.time()

        # Capture frame-by-frame
        frame = get_frame(cap)

        # Convert the image from OpenCV BGR format to RGB for display
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        frame = detect_keypoints(frame, 30)

        # write to video file (VideoWriter expects BGR)
        if save_video:
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        showarray(frame)

        # wait=True replaces the picture only once the next one is ready
        clear_output(wait=True)

        # print("FPS: ", 1.0 / (time.time() - start_time)) # FPS = 1 / time to process loop

except KeyboardInterrupt:
    print("Stream stopped")
finally:
    # Always free the camera and finalise the video file, also when the loop
    # dies from an error other than a manual interrupt; otherwise the device
    # stays locked until the kernel is restarted.
    cap.release()
    if save_video:
        out.release()

Stream stopped


### Draw raw keypoints

In [10]:
# Stream the webcam with the raw keypoints marked on every frame.
# Stop it with "Interrupt kernel".
cap = cv2.VideoCapture(0)
save_video = True

if save_video:
    # https://www.learnopencv.com/read-write-and-display-a-video-using-opencv-cpp-python/
    # The default frame resolution is system dependent; the capture reports it
    # as float, so convert to int.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Define the codec and create the VideoWriter object.
    # The output is stored in 'output.avi' at 4 frames per second.
    out = cv2.VideoWriter(
        "output.avi",
        cv2.VideoWriter_fourcc("M", "J", "P", "G"),
        4,
        (frame_width, frame_height),
    )

try:
    while True:
        # start_time = time.time()

        # Capture frame-by-frame
        frame = get_frame(cap)

        # Convert the image from OpenCV BGR format to RGB for display
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        frame = draw_raw_keypoints(frame, 30)

        # write to video file (VideoWriter expects BGR)
        if save_video:
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        showarray(frame)

        # wait=True replaces the picture only once the next one is ready
        clear_output(wait=True)
        # print("FPS: ", 1.0 / (time.time() - start_time)) # FPS = 1 / time to process loop

except KeyboardInterrupt:
    print("Stream stopped")
finally:
    # Always free the camera and finalise the video file, also when the loop
    # dies from an error other than a manual interrupt; otherwise the device
    # stays locked until the kernel is restarted.
    cap.release()
    if save_video:
        out.release()

Stream stopped


The webcam code is based on the [displaying webcam video in IPython notebook](https://github.com/ktaletsk/NCCV/blob/master/Realtime_video_ipython.ipynb) project.

In [54]:
# frame_width / frame_height are only assigned inside the `if save_video:` branch
# of the capture cells, so one of those cells must have run with save_video = True.
print(frame_width, frame_height)

640 480
