In [171]:
import time, cv2, torch, os
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision.models as models
import torchvision.transforms as transforms

from models.experimental import attempt_load
from utils.augmentations import letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords
from utils.accessory_lib import system_info
from PIPNet.networks import Pip_resnet101
from PIPNet.functions import forward_pip, get_meanface
from scipy.spatial import distance
from tqdm import tqdm

In [172]:
# ---------------------------------------------------------------------------
# Notebook configuration and mutable run state.
# ---------------------------------------------------------------------------

# Landmark network (PIPNet): names used to build the meanface / snapshot paths
# and the hyper-parameters handed to the network constructor and forward_pip.
data_name, experiment_name = 'data_300W', 'pip_32_16_60_r101_l2_l1_10_1_nb10'
net_stride, num_nb, num_lms = 32, 10, 68
face_landmark_input_size = 240

# Face boxes are enlarged by this factor before being cropped for landmarks.
det_box_scale = 1.2
# Upper bound on the eye vertical/horizontal distance ratio for a "closed" eye.
eye_det = 0.11

# Face detector (YOLOv5): requested input size and NMS thresholds.
img_size = 640
CONF_THRES, IOU_THRES = 0.4, 0.45

# Backend switches: cuDNN autotuning on, TF32 matmul off.
cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = False

# Timing / frame counters.
elapsed_time: float = 0.0
fps: float = 0.0
ref_frame: int = 0
det_frame: int = 0
prev_time, start_time, t0 = time.time(), time.time(), time.time()

# Per-eye detection state and the combined drowsiness flag.
left_eye_det_prv = right_eye_det_prv = False
left_eye_det_result = right_eye_det_result = False
left_eye_ratio: float = 0.0
right_eye_ratio: float = 0.0
drowsy_det = False

In [173]:
# Directory holding the frames to analyse (machine-specific absolute path).
img_dir = "C:\\workspace\\drowsy_image\\bus\\R_217_40_M"

# Build the list of frame paths.
# - os.path.join instead of manual separator concatenation.
# - Regular files only: a sub-directory entry would otherwise be handed to
#   cv2.imread later on, which returns None for anything it cannot decode.
# - Sorted lexicographically by path, which fixes the processing order.
img_path_list = sorted(
    path
    for path in (os.path.join(img_dir, entry) for entry in os.listdir(img_dir))
    if os.path.isfile(path)
)
len(img_path_list)

400

In [174]:
# Pick the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Facial-landmark network (PIPNet, ResNet-101 backbone) -----------------
# get_meanface reads the dataset's meanface file; its outputs are used below
# only to regroup neighbour predictions (reverse_index1/2, max_len).
meanface_path = os.path.join('PIPNet', 'data', data_name, 'meanface.txt')
meanface_indices, reverse_index1, reverse_index2, max_len = get_meanface(meanface_path, num_nb)

# NOTE(review): the ImageNet weights requested here look redundant, because
# load_state_dict below (strict by default) replaces the parameters — confirm
# before switching to weights=None to avoid the download.
resnet101 = models.resnet101(weights='ResNet101_Weights.DEFAULT')
face_landmark_net = Pip_resnet101(resnet101, num_nb=num_nb, num_lms=num_lms, input_size=face_landmark_input_size,
                                  net_stride=net_stride)

face_landmark_net = face_landmark_net.to(device)

# torch.load unpickles the file: only point this at checkpoints you trust.
face_landmark_weight_file_path = os.path.join('PIPNet', 'snapshots', data_name, experiment_name, f'epoch{60-1}.pth')
face_landmark_net.load_state_dict(torch.load(f=face_landmark_weight_file_path, map_location=device))
face_landmark_net.eval()

# Crop -> PIL -> fixed-size tensor -> per-channel normalisation.
face_landmark_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
preprocess = transforms.Compose([transforms.ToPILImage(),
                                 transforms.Resize((face_landmark_input_size, face_landmark_input_size)),
                                 transforms.ToTensor(), face_landmark_normalize])

# ---- Face detector (YOLOv5) -------------------------------------------------
face_detector_weight_file_path = os.path.join('.', 'weights', 'face_detection_yolov5s.pt')
# Divisor that maps uint8 pixel values to [0, 1]; kept on `device` so the
# division in the inference loop needs no host/device transfer.
face_det_normalize_tensor = torch.tensor(255.0).to(device)

system_info()

# Initialize
# NOTE: prev_time is set in the configuration cell, so this figure also
# includes any time spent between running that cell and this one.
print(f'[1/3] Device Initialized {time.time() - prev_time:.2f}sec')
prev_time = time.time()

# Load model
model = attempt_load(face_detector_weight_file_path, device)  # load detector checkpoint
model.eval()
stride = int(model.stride.max())  # model stride
img_size_chk = check_img_size(img_size, s=stride)  # check img_size

# Get names and colors
names = model.module.names if hasattr(model, 'module') else model.names

# Run inference
# One dummy forward pass as warm-up; wrapped in no_grad so it is explicitly
# gradient-free regardless of how the checkpoint's parameters are flagged.
with torch.no_grad():
    model(torch.zeros(1, 3, img_size_chk, img_size_chk).to(device).type_as(next(model.parameters())))  # run once
print(f'[2/3] Yolov5 Detector Model Loaded {time.time() - prev_time:.2f}sec')
prev_time = time.time()

Fusing layers... 
newYOLOv5s summary: 224 layers, 7053910 parameters, 0 gradients



 ----- Environment Status -----
Operating System Type: ('64bit', 'WindowsPE')
Python Version: 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]
Pytorch Version: 1.13.1+cu116
Open Computer Vision Version: 4.9.0
GPU Available: True
GPU Device Name: NVIDIA GeForce RTX 3080 Ti Laptop GPU
Number of GPU: 1
Tensor Core TF32 Enable: False 

[1/3] Device Initialized 1.02sec
[2/3] Yolov5 Detector Model Loaded 0.11sec


In [175]:
# Open the output video. The frame size is taken from the first image, so
# every frame written later must have the same width and height.
if not img_path_list:
    raise RuntimeError('img_path_list is empty: no frame available to size the output video')

img = cv2.imread(img_path_list[0])
if img is None:
    # cv2.imread signals failure by returning None rather than raising.
    raise RuntimeError(f'Cannot read first image: {img_path_list[0]}')

fps = 5
# FourCC tags are case-sensitive; 'mp4v' is the MPEG-4 tag for .mp4 output.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter('result_video.mp4', fourcc, fps, (img.shape[1], img.shape[0]))
if not writer.isOpened():
    # Without this check a writer that failed to open silently drops frames.
    raise RuntimeError('Could not open result_video.mp4 for writing')

In [176]:
# Per-frame pipeline over the first 200 images:
#   1. YOLOv5 face detection on the letterboxed frame,
#   2. landmark regression (forward_pip) on each enlarged face box,
#   3. eye-closure test from the vertical/horizontal eye-landmark distances,
#   4. annotated frame appended to `writer`.
#
# `writer` is released in `finally` so the video file is closed even when a
# frame raises; re-create `writer` before running this cell again.
try:
    # Inference only: keep the whole loop explicitly gradient-free.
    with torch.no_grad():
        for img_name in tqdm(img_path_list[0:200]):
            img0 = cv2.imread(img_name)
            if img0 is None:
                # cv2.imread returns None (no exception) for unreadable files.
                tqdm.write(f'[skip] cannot read image: {img_name}')
                continue

            img = letterbox(img0, img_size_chk, stride=stride)[0]  # Padded resize

            # Convert
            img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
            img = np.ascontiguousarray(img)

            img = torch.from_numpy(img)
            img = torch.divide(img.to(device), face_det_normalize_tensor)
            img = img.unsqueeze(0)

            # Inference
            pred = model(img, augment=False)[0]

            # Apply NMS
            detected_face = non_max_suppression(pred, CONF_THRES, IOU_THRES, classes=None, agnostic=False)[0]

            # Process detections
            if len(detected_face):
                # Rescale boxes from img_size to img0 size
                detected_face[:, :4] = scale_coords(img.shape[2:], detected_face[:, :4], img0.shape).round()

                # Crops are taken from an unannotated copy so the box and dots
                # drawn for one face never leak into another face's crop.
                frame_clean = img0.copy()

                # Write results
                for *xyxy, conf, cls in reversed(detected_face):
                    det_xmin, det_ymin, det_xmax, det_ymax = (int(v) for v in xyxy)
                    det_width = det_xmax - det_xmin
                    det_height = det_ymax - det_ymin

                    # Enlarge the box symmetrically by det_box_scale.
                    pad_x = int(det_width * (det_box_scale - 1) / 2)
                    pad_y = int(det_height * (det_box_scale - 1) / 2)
                    det_xmin -= pad_x
                    det_ymin -= pad_y
                    det_xmax += pad_x
                    det_ymax += pad_y

                    # Clamp to the image; the max corner stays a valid pixel index.
                    det_xmin = max(det_xmin, 0)
                    det_ymin = max(det_ymin, 0)
                    det_xmax = min(det_xmax, img0.shape[1] - 1)
                    det_ymax = min(det_ymax, img0.shape[0] - 1)

                    det_width = det_xmax - det_xmin + 1
                    det_height = det_ymax - det_ymin + 1

                    # Inclusive crop (+1) so its size equals det_width x det_height,
                    # the factors used below to map landmarks back to the frame.
                    det_crop = frame_clean[det_ymin:det_ymax + 1, det_xmin:det_xmax + 1, :]
                    if det_crop.size == 0:
                        # Degenerate box; cv2.resize would raise on an empty array.
                        continue
                    det_crop = cv2.resize(src=det_crop, dsize=(face_landmark_input_size, face_landmark_input_size),
                                          interpolation=cv2.INTER_AREA)
                    # OpenCV frames are BGR, while ToPILImage (first step of
                    # `preprocess`) treats a 3-channel array as RGB.
                    # NOTE(review): assumes the landmark weights expect RGB, which
                    # matches the RGB-ordered normalisation constants — confirm.
                    det_crop = cv2.cvtColor(det_crop, cv2.COLOR_BGR2RGB)
                    cv2.rectangle(img0, (det_xmin, det_ymin), (det_xmax, det_ymax), (0, 0, 255), 1)

                    inputs = preprocess(det_crop).to(device)
                    inputs = inputs.unsqueeze(0)
                    lms_pred_x, lms_pred_y, lms_pred_nb_x, lms_pred_nb_y, outputs_cls, max_cls = forward_pip(
                        face_landmark_net, inputs, preprocess, face_landmark_input_size, net_stride, num_nb)

                    # Average each landmark's own prediction with the predictions
                    # its neighbours made for it (regrouped via the reverse indices).
                    tmp_nb_x = lms_pred_nb_x[reverse_index1, reverse_index2].view(num_lms, max_len)
                    tmp_nb_y = lms_pred_nb_y[reverse_index1, reverse_index2].view(num_lms, max_len)
                    tmp_x = torch.mean(torch.cat((lms_pred_x, tmp_nb_x), dim=1), dim=1).view(-1, 1)
                    tmp_y = torch.mean(torch.cat((lms_pred_y, tmp_nb_y), dim=1), dim=1).view(-1, 1)

                    # Flattened as x0, y0, x1, y1, ... (x at even, y at odd offsets).
                    lms_pred_merge = torch.cat((tmp_x, tmp_y), dim=1).flatten()
                    lms_pred_merge = lms_pred_merge.cpu().numpy()

                    # Landmarks 36..47 (12 points) scaled from crop-relative values to
                    # frame pixels; the first six are treated as the left eye and the
                    # last six as the right eye by the code below.
                    eye_x = (lms_pred_merge[36 * 2:48 * 2:2] * det_width).astype(np.int32) + det_xmin
                    eye_y = (lms_pred_merge[(36 * 2) + 1:(48 * 2) + 1:2] * det_height).astype(np.int32) + det_ymin

                    left_eye_horizontal_dist = distance.euclidean((eye_x[0], eye_y[0]), (eye_x[3], eye_y[3]))
                    left_eye_vertical_dist = min(distance.euclidean((eye_x[1], eye_y[1]), (eye_x[5], eye_y[5])),
                                                 distance.euclidean((eye_x[2], eye_y[2]), (eye_x[4], eye_y[4])))

                    right_eye_horizontal_dist = distance.euclidean((eye_x[6], eye_y[6]), (eye_x[9], eye_y[9]))
                    right_eye_vertical_dist = min(distance.euclidean((eye_x[11], eye_y[11]), (eye_x[7], eye_y[7])),
                                                  distance.euclidean((eye_x[10], eye_y[10]), (eye_x[8], eye_y[8])))

                    left_eye_ratio = left_eye_vertical_dist / left_eye_horizontal_dist
                    right_eye_ratio = right_eye_vertical_dist / right_eye_horizontal_dist

                    # An eye is flagged when its ratio lies strictly between 0.01 and eye_det.
                    left_eye_det_result = 0.01 < left_eye_ratio < eye_det
                    right_eye_det_result = 0.01 < right_eye_ratio < eye_det

                    # Red (BGR) for a flagged eye, green otherwise.
                    left_eye_color = (0, 0, 255) if left_eye_det_result else (0, 255, 0)
                    right_eye_color = (0, 0, 255) if right_eye_det_result else (0, 255, 0)

                    # Both eyes must be flagged for the frame to count.
                    drowsy_det = left_eye_det_result and right_eye_det_result

                    if drowsy_det:
                        det_frame += 1

                    for i in range(len(eye_x)):
                        point = (int(eye_x[i]), int(eye_y[i]))  # plain ints for OpenCV
                        if i <= 5:
                            cv2.circle(img0, point, 1, left_eye_color, 1)
                        else:
                            cv2.circle(img0, point, 1, right_eye_color, 1)

            writer.write(img0)
finally:
    writer.release()

100%|██████████| 200/200 [00:06<00:00, 32.03it/s]
