In [1]:
import time, cv2, torch, os
import torch.backends.cudnn as cudnn
import numpy as np 
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms

from models.experimental import attempt_load
from utils.augmentations import letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords 
from utils.accessory_lib import system_info
from PIPNet.networks import Pip_resnet101
from PIPNet.functions import forward_pip, get_meanface
from scipy.spatial import distance
from tqdm import tqdm

In [2]:
# Root folders of the drowsy-driver image dataset, one per vehicle type.
# Every value ends with a path separator because the cells below build file paths
# by plain string concatenation (`<dir> + sub_dir_name + os.sep + img_name`).
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# only runs as-is on a machine that has the dataset at this location.
bus_img_dir = "D:\\drowsy_dataset\\image\\bus\\"
passenger_img_dir = "D:\\drowsy_dataset\\image\\passenger\\"
taxi_img_dir = "D:\\drowsy_dataset\\image\\taxi\\"
truck_img_dir = "D:\\drowsy_dataset\\image\\truck\\"

In [3]:
# Collect every image under the bus folder (one level of sub-folders deep),
# sorted so the processing order is reproducible between runs.
bus_img_path_list = sorted(
    bus_img_dir + sub_dir_name + os.sep + img_name
    for sub_dir_name in os.listdir(bus_img_dir)
    for img_name in os.listdir(bus_img_dir + sub_dir_name)
)
len(bus_img_path_list)

8067

In [4]:
# Collect every image under the passenger folder (one level of sub-folders deep),
# sorted so the processing order is reproducible between runs.
passenger_img_path_list = sorted(
    passenger_img_dir + sub_dir_name + os.sep + img_name
    for sub_dir_name in os.listdir(passenger_img_dir)
    for img_name in os.listdir(passenger_img_dir + sub_dir_name)
)
len(passenger_img_path_list)

8903

In [5]:
# Collect every image under the taxi folder (one level of sub-folders deep),
# sorted so the processing order is reproducible between runs.
taxi_img_path_list = sorted(
    taxi_img_dir + sub_dir_name + os.sep + img_name
    for sub_dir_name in os.listdir(taxi_img_dir)
    for img_name in os.listdir(taxi_img_dir + sub_dir_name)
)
len(taxi_img_path_list)

3200

In [6]:
# Collect every image under the truck folder (one level of sub-folders deep),
# sorted so the processing order is reproducible between runs.
truck_img_path_list = sorted(
    truck_img_dir + sub_dir_name + os.sep + img_name
    for sub_dir_name in os.listdir(truck_img_dir)
    for img_name in os.listdir(truck_img_dir + sub_dir_name)
)
len(truck_img_path_list)

3344

In [7]:
# ---- Facial-landmark (PIPNet) settings ----
# net_stride / num_nb / num_lms / face_landmark_input_size are handed to
# Pip_resnet101 and forward_pip below; data_name and experiment_name select the
# meanface file and the weight snapshot on disk.
net_stride = 32
num_nb = 10
data_name = 'data_300W'
experiment_name = 'pip_32_16_60_r101_l2_l1_10_1_nb10'
num_lms = 68
face_landmark_input_size = 240
det_box_scale = 1.2  # the detected face box is enlarged by this factor before cropping
eye_det = 0.12  # an eye counts as closed when its vertical/horizontal distance ratio is below this

# ---- Face-detector (YOLOv5) settings ----
img_size = 640  # requested detector input size; validated against the model stride when the detector is loaded
CONF_THRES = 0.2  # confidence threshold passed to non_max_suppression
IOU_THRES = 0.3  # IoU threshold passed to non_max_suppression
half = True  # run the detector in FP16
cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = False

# prev_time is used for the per-stage timing prints; start_time is not read in the cells visible here.
prev_time = time.time()
start_time = time.time()

# NOTE(review): CUDA is hard-coded with no CPU fallback, so this cell fails on a machine without a GPU.
device = torch.device("cuda")

# Neighbour-index tables from the dataset's mean face; reverse_index1/2 and max_len
# are used later to regroup the neighbour predictions per landmark.
meanface_indices, reverse_index1, reverse_index2, max_len = get_meanface(os.path.join('PIPNet', 'data', data_name,
                                                                                      'meanface.txt'), num_nb)

# Build the landmark network on a ResNet-101 backbone and move it to the GPU.
resnet101 = models.resnet101(weights='ResNet101_Weights.DEFAULT')
face_landmark_net = Pip_resnet101(resnet101, num_nb=num_nb, num_lms=num_lms, input_size=face_landmark_input_size,
                                  net_stride=net_stride)
face_landmark_net = face_landmark_net.to(device)

# f'epoch{60-1}.pth' evaluates to 'epoch59.pth'.
face_landmark_weight_file_path = os.path.join('PIPNet', 'snapshots', data_name, experiment_name, f'epoch{60-1}.pth')
face_landmark_net.load_state_dict(torch.load(f=face_landmark_weight_file_path, map_location=device))
face_landmark_net.eval()

# Preprocessing applied to each face crop: to PIL image, resize to the landmark
# input size, convert to tensor, then normalise with the mean/std given here.
face_landmark_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
preprocess = transforms.Compose([transforms.ToPILImage(),
                                 transforms.Resize((face_landmark_input_size, face_landmark_input_size)),
                                 transforms.ToTensor(), face_landmark_normalize])

face_detector_weight_file_path = os.path.join('.', 'weights', 'face_detection_yolov5s.pt')
# Divisor used in the inference loop to scale detector input pixels by 1/255.
face_det_normalize_tensor = torch.tensor(255.0).to(device)

# Project helper; prints the "Environment Status" report shown in this cell's output.
system_info()

# Report how long the setup above took
print(f'[1/3] Device Initialized {time.time() - prev_time:.2f}sec')


 ----- Environment Status -----
Operating System Type: ('64bit', 'WindowsPE')
Python Version: 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]
Pytorch Version: 1.13.1+cu116
Open Computer Vision Version: 4.8.0
GPU Available: True
GPU Device Name: NVIDIA GeForce RTX 4090
Number of GPU: 1
Tensor Core TF32 Enable: False 

[1/3] Device Initialized 4.25sec


In [8]:
# Load the face detector, switch it to inference mode and warm it up once
prev_time = time.time()

model = attempt_load(face_detector_weight_file_path, device)  # weights are loaded as FP32
model.eval()

# The detector input size has to be compatible with the model's largest stride
stride = int(model.stride.max())
img_size_chk = check_img_size(img_size, s=stride)

if half:
    model.half()  # convert to FP16

# Class names: taken from `model.module` when that attribute exists, otherwise from the model itself
names = getattr(model, 'module', model).names

# Warm-up: one forward pass on an all-zero batch with the same device/dtype as the weights
model(
    torch.zeros(1, 3, img_size_chk, img_size_chk)
    .to(device)
    .type_as(next(model.parameters()))
)
print(f'[2/3] Yolov5 Detector Model Loaded {time.time() - prev_time:.2f}sec')

Fusing layers... 
newYOLOv5s summary: 224 layers, 7053910 parameters, 0 gradients


[2/3] Yolov5 Detector Model Loaded 3.47sec


In [9]:
# Column layout of the prediction log. The order must match the row assembled in
# the inference loop: file name, face box (4), left-eye box (4), right-eye box (4),
# eye distances (4), then the five status values.
predict_data_columns = ['file_name', 'face_pt1_x_pos', 'face_pt1_y_pos', 'face_pt2_x_pos', 'face_pt2_y_pos',
                        'left_eye_pt1_x_pos', 'left_eye_pt1_y_pos', 'left_eye_pt2_x_pos', 'left_eye_pt2_y_pos',
                        'right_eye_pt1_x_pos', 'right_eye_pt1_y_pos', 'right_eye_pt2_x_pos', 'right_eye_pt2_y_pos',
                        'left_eye_horizontal', 'left_eye_vertical', 'right_eye_horizontal', 'right_eye_vertical',
                        'face_detection', 'left_eye_close', 'right_eye_close', 'result_close', 'face_confidence']

logging_data = pd.DataFrame()  # placeholder; replaced by a one-row frame for every processed image
logging_header = pd.DataFrame(columns=predict_data_columns)

# One log file per run, named after the start time so earlier runs are never overwritten.
start_time_str = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
logging_file_name = 'predict' + '_' + start_time_str
logging_file_path = 'analysis_data' + os.sep + logging_file_name + '.csv'

# to_csv does not create missing parent folders, so make sure the output folder exists
# before writing the header line.
os.makedirs('analysis_data', exist_ok=True)
logging_header.to_csv(logging_file_path, mode='a', header=True)

In [10]:
# Per-image result buffers, allocated once and reset for every image.
# The three *_bbox arrays have four slots each, matching the four pt1/pt2 x/y columns of the log.
# eye_shape_arr: left eye horizontal, left eye vertical, right eye horizontal, right eye vertical.
face_bbox, left_eye_bbox, right_eye_bbox, eye_shape_arr = (
    np.zeros(4, dtype=np.float64) for _ in range(4)
)

# detection_status: face detection, left eye detection, right eye detection,
# result detection, face confidence.
detection_status = np.zeros(5, dtype=np.int32)

In [11]:
# For every truck image: detect faces, predict facial landmarks on the enlarged
# face crop, derive eye-openness measurements, and append one row to the CSV log.
# Images without a (usable) face are logged with all-zero measurements.
with torch.no_grad():
    for img_name in tqdm(truck_img_path_list, desc='predicting for eye detection'):
        # Reset the shared buffers so nothing leaks over from the previous image.
        face_bbox.fill(0)
        left_eye_bbox.fill(0)
        right_eye_bbox.fill(0)
        eye_shape_arr.fill(0)
        detection_status.fill(0)

        _, sample_file_name = os.path.split(img_name)
        img0 = cv2.imread(img_name)

        if img0 is None:
            # cv2.imread reports an unreadable file by returning None instead of raising.
            # Log the image as "no face" and keep going rather than aborting the whole run.
            tqdm.write(f'[warning] could not read image, logging an empty row: {img_name}')
            detected_face = []
        else:
            # OpenCV images are indexed (row, column): shape[0] is the height, shape[1] the width.
            img_height, img_width = img0.shape[:2]

            img = letterbox(img0, img_size_chk, stride=stride)[0]  # Padded resize

            # Convert
            img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
            img = np.ascontiguousarray(img)

            img = torch.from_numpy(img)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img = torch.divide(img.to(device), face_det_normalize_tensor)  # scale pixels by 1/255
            img = img.unsqueeze(0)  # add the batch dimension

            pred = model(img, augment=False)[0]  # Inference

            detected_face = non_max_suppression(pred, CONF_THRES, IOU_THRES, classes=None, agnostic=False)[0]  # Apply NMS

        # Process detections
        if len(detected_face):
            # Rescale boxes from the letterboxed input size back to the original image size
            detected_face[:, :4] = scale_coords(img.shape[2:], detected_face[:, :4], img0.shape).round()
            detection_status[0] = 1

            # Every detection overwrites the shared buffers, so the logged row holds the
            # values of the last detection processed here.
            for *xyxy, conf, cls in reversed(detected_face):
                face_det_conf = float(conf)
                detection_status[4] = int(face_det_conf*1000)  # confidence stored as an integer (x1000)
                det_xmin = int(xyxy[0])
                det_ymin = int(xyxy[1])
                det_xmax = int(xyxy[2])
                det_ymax = int(xyxy[3])

                det_width = det_xmax - det_xmin
                det_height = det_ymax - det_ymin

                # The logged face box is the raw detection, before enlargement.
                for i, pos in enumerate(xyxy):
                    face_bbox[i] = float(pos)

                # Enlarge the box symmetrically by det_box_scale ...
                det_xmin -= int(det_width * (det_box_scale - 1) / 2)
                det_ymin -= int(det_height * (det_box_scale - 1) / 2)
                det_xmax += int(det_width * (det_box_scale - 1) / 2)
                det_ymax += int(det_height * (det_box_scale - 1) / 2)

                # ... and clip it to the image: x against the width, y against the height.
                det_xmin = max(det_xmin, 0)
                det_ymin = max(det_ymin, 0)
                det_xmax = min(det_xmax, img_width - 1)
                det_ymax = min(det_ymax, img_height - 1)

                det_width = det_xmax - det_xmin + 1
                det_height = det_ymax - det_ymin + 1
                det_crop = img0[det_ymin:det_ymax, det_xmin:det_xmax, :]

                if det_crop.shape[0] > 0 and det_crop.shape[1] > 0:
                    det_crop = cv2.resize(src=det_crop, dsize=(face_landmark_input_size, face_landmark_input_size),
                                          interpolation=cv2.INTER_AREA)
                    # NOTE(review): this draws on img0 itself, so the crop of a later detection
                    # in the same image can contain this rectangle - confirm this is wanted.
                    cv2.rectangle(img0, (det_xmin, det_ymin), (det_xmax, det_ymax), (0, 0, 255), 1)

                    # NOTE(review): det_crop is still in OpenCV's BGR channel order here, while the
                    # detector input above is explicitly converted to RGB - confirm which channel
                    # order the landmark network expects.
                    inputs = preprocess(det_crop).to(device)
                    inputs = inputs.unsqueeze(0)
                    lms_pred_x, lms_pred_y, lms_pred_nb_x, lms_pred_nb_y, outputs_cls, max_cls = forward_pip(
                        face_landmark_net, inputs, preprocess, face_landmark_input_size, net_stride, num_nb)

                    lms_pred = torch.cat((lms_pred_x, lms_pred_y), dim=1).flatten()
                    # Regroup the neighbour predictions per landmark and average them with the
                    # landmark's own prediction.
                    tmp_nb_x = lms_pred_nb_x[reverse_index1, reverse_index2].view(num_lms, max_len)
                    tmp_nb_y = lms_pred_nb_y[reverse_index1, reverse_index2].view(num_lms, max_len)
                    tmp_x = torch.mean(torch.cat((lms_pred_x, tmp_nb_x), dim=1), dim=1).view(-1, 1)
                    tmp_y = torch.mean(torch.cat((lms_pred_y, tmp_nb_y), dim=1), dim=1).view(-1, 1)

                    lms_pred_merge = torch.cat((tmp_x, tmp_y), dim=1).flatten()  # interleaved x0, y0, x1, y1, ...
                    lms_pred = lms_pred.cpu().numpy()
                    lms_pred_merge = lms_pred_merge.cpu().numpy()

                    # Landmarks 36..47 (12 points), scaled by the crop size and shifted to image
                    # coordinates. Local indices 0-5 are used as the left eye, 6-11 as the right eye.
                    eye_x = (lms_pred_merge[36 * 2:48 * 2:2] * det_width).astype(np.int32) + det_xmin
                    eye_y = (lms_pred_merge[(36 * 2) + 1:(48 * 2) + 1:2] * det_height).astype(np.int32) + det_ymin

                    left_eye_horizontal_dist = distance.euclidean((eye_x[0], eye_y[0]), (eye_x[3], eye_y[3]))
                    left_eye_vertical_dist = min(distance.euclidean((eye_x[1], eye_y[1]), (eye_x[5], eye_y[5])),
                                                 distance.euclidean((eye_x[2], eye_y[2]), (eye_x[4], eye_y[4])))

                    right_eye_horizontal_dist = distance.euclidean((eye_x[6], eye_y[6]), (eye_x[9], eye_y[9]))
                    right_eye_vertical_dist = min(distance.euclidean((eye_x[11], eye_y[11]), (eye_x[7], eye_y[7])),
                                                  distance.euclidean((eye_x[10], eye_y[10]), (eye_x[8], eye_y[8])))

                    # A zero-width eye makes the ratio undefined; use +inf so it is never reported as closed.
                    left_eye_ratio = (left_eye_vertical_dist / left_eye_horizontal_dist
                                      if left_eye_horizontal_dist > 0 else float('inf'))
                    right_eye_ratio = (right_eye_vertical_dist / right_eye_horizontal_dist
                                       if right_eye_horizontal_dist > 0 else float('inf'))

                    eye_shape_arr[0] = left_eye_horizontal_dist
                    eye_shape_arr[1] = left_eye_vertical_dist
                    eye_shape_arr[2] = right_eye_horizontal_dist
                    eye_shape_arr[3] = right_eye_vertical_dist

                    left_eye_det_result = left_eye_ratio < eye_det
                    right_eye_det_result = right_eye_ratio < eye_det

                    # The overall "closed" result requires both eyes to be below the threshold.
                    drowsy_det = left_eye_det_result and right_eye_det_result

                    detection_status[1] = int(left_eye_det_result)
                    detection_status[2] = int(right_eye_det_result)
                    detection_status[3] = int(drowsy_det)

                    # Bounding boxes over all six landmarks of each eye.
                    left_eye_bbox[0] = np.min(eye_x[0:6])
                    left_eye_bbox[1] = np.min(eye_y[0:6])
                    left_eye_bbox[2] = np.max(eye_x[0:6])
                    left_eye_bbox[3] = np.max(eye_y[0:6])

                    right_eye_bbox[0] = np.min(eye_x[6:12])
                    right_eye_bbox[1] = np.min(eye_y[6:12])
                    right_eye_bbox[2] = np.max(eye_x[6:12])
                    right_eye_bbox[3] = np.max(eye_y[6:12])

        # Append this image's row immediately so partial results survive an interrupted run.
        result_data = np.concatenate([np.array([sample_file_name]), face_bbox, left_eye_bbox, right_eye_bbox, eye_shape_arr, detection_status])
        logging_data = pd.DataFrame(data=[result_data], columns=predict_data_columns)
        logging_data.to_csv(logging_file_path, mode='a', header=False)

predicting for eye detection: 100%|██████████| 3344/3344 [02:51<00:00, 19.47it/s]
