In [1]:
import mediapipe as mp
import cv2
import numpy as np
import math
from sklearn.preprocessing import Normalizer

In [2]:
#solve for the pose with cv2.solvePnP, using camera intrinsics approximated from the frame size
def calibrate(frame, model_points, image_points):
    """Build an approximate pinhole camera from the frame size and run solvePnP.

    No measured calibration is involved: the focal length is set to the frame
    width, the principal point to the frame centre, and the distortion
    coefficients to zero.

    Args:
        frame: image whose ``shape`` starts with (height, width).
        model_points: object points, forwarded unchanged to ``cv2.solvePnP``.
        image_points: image points, forwarded unchanged to ``cv2.solvePnP``.

    Returns:
        ``(camera_matrix, dist_coeffs, success, rotation_vector,
        translation_vector)`` where the last three values are whatever
        ``cv2.solvePnP`` returned.
    """
    height, width = frame.shape[:2]
    cx, cy = width / 2, height / 2

    #focal length is approximated by the frame width
    camera_matrix = np.array(
        [[width, 0, cx],
         [0, width, cy],
         [0, 0, 1]],
        dtype=np.float64,
    )
    dist_coeffs = np.zeros((4, 1))  #all-zero distortion coefficients

    success, rotation_vector, translation_vector = cv2.solvePnP(
        model_points, image_points, camera_matrix, dist_coeffs,
        flags=cv2.SOLVEPNP_ITERATIVE,
    )
    return camera_matrix, dist_coeffs, success, rotation_vector, translation_vector

In [3]:
def relative(landmark, shape):
    """Scale a landmark's ``x``/``y`` by the image size and truncate to ints.

    Args:
        landmark: object exposing ``x`` and ``y`` attributes.
        shape: sequence indexed as (height, width, ...); ``x`` is multiplied
            by ``shape[1]`` and ``y`` by ``shape[0]``.

    Returns:
        ``(x, y)`` as a tuple of Python ints.
    """
    return (int(landmark.x * shape[1]), int(landmark.y * shape[0]))


def relativeT(landmark, shape):
    """Same scaling as ``relative`` with a constant 0 appended: ``(x, y, 0)``."""
    return (int(landmark.x * shape[1]), int(landmark.y * shape[0]), 0)
#alias for MediaPipe's face_mesh solution module; the capture loop creates its FaceMesh from it
mp_face_mesh = mp.solutions.face_mesh

In [4]:
#face-mesh landmark indices for each eye; blinkRatio reads positions 0 and 8 as the
#horizontal end points and positions 12 and 4 as the top/bottom points
LEFT_EYE = [
    362, 382, 381, 380, 374, 373, 390, 249,
    263, 466, 388, 387, 386, 385, 384, 398,
]
RIGHT_EYE = [
    33, 7, 163, 144, 145, 153, 154, 155,
    133, 173, 157, 158, 159, 160, 161, 246,
]

In [5]:
def _truncate_to_ints(column):
    """Return the entries of a small array as a tuple of truncated Python ints."""
    #ravel() first so int() only ever receives scalars; calling int() on a
    #one-element array is a deprecated conversion in recent NumPy releases
    return tuple(int(value) for value in np.asarray(column).ravel())


def _gaze_end_point(pupil, eye_ball_center, transformation, rotation_vector,
                    translation_vector, camera_matrix, dist_coeffs):
    """Compute the 2D end point of the gaze line for one eye, corrected for head rotation."""
    #map the pupil pixel (z = 0) into model space with the estimated affine transform
    pupil_world = transformation @ np.array([[pupil[0], pupil[1], 0, 1]]).T

    #3D gaze point (25 is arbitrary for gaze distance)
    gaze_world = eye_ball_center + (pupil_world - eye_ball_center) * 25

    #project the 3D gaze point onto the image plane
    gaze_2d, _ = cv2.projectPoints(_truncate_to_ints(gaze_world), rotation_vector,
                                   translation_vector, camera_matrix, dist_coeffs)

    #project the pupil's model-space x/y at a fixed depth of 65 to capture the head pose
    world_x, world_y, _ = _truncate_to_ints(pupil_world)
    head_2d, _ = cv2.projectPoints((world_x, world_y, 65), rotation_vector,
                                   translation_vector, camera_matrix, dist_coeffs)

    #correct gaze for head rotation
    pupil_px = np.array(pupil, dtype='double')
    return pupil_px + (gaze_2d[0][0] - pupil_px) - (head_2d[0][0] - pupil_px)


def gaze(frame, points):
    """Estimate the gaze direction of both eyes and draw it onto ``frame``.

    Six reference landmarks are matched against a fixed 3D face model to get
    the head pose (via ``calibrate``) and an image-to-model affine transform
    (via ``cv2.estimateAffine3D``). For each eye a point along the
    eyeball-centre -> pupil ray is projected back into the image, corrected
    for head rotation, and a white line is drawn from the pupil to that point.

    Args:
        frame: image that is drawn on in place; its ``shape`` is used to turn
            landmarks into pixel coordinates.
        points: object whose ``landmark`` sequence is indexed up to 473
            (presumably requires FaceMesh to run with refine_landmarks=True,
            as the capture loop does - confirm).

    Returns:
        None. Nothing is drawn when the pose or the affine transform cannot
        be estimated.
    """
    shape = frame.shape
    landmark_ids = (
        4,     #nose tip
        152,   #chin
        263,   #left eye left corner
        33,    #right eye right corner
        287,   #left mouth corner
        57,    #right mouth corner
    )

    #pixel coordinates of the reference landmarks, format (x, y)
    image_points = np.array(
        [relative(points.landmark[idx], shape) for idx in landmark_ids],
        dtype='double')

    #same pixel coordinates with a zero z component, format (x, y, 0)
    image_points1 = np.array(
        [relativeT(points.landmark[idx], shape) for idx in landmark_ids],
        dtype='double')

    #3D model points
    model_points = np.array([
        (0.0, 0.0, 0.0),         #nose tip
        (0, -63.6, -12.5),       #chin
        (-43.3, 32.7, -26),      #left eye left corner
        (43.3, 32.7, -26),       #right eye right corner
        (-28.9, -28.9, -24.1),   #left mouth corner
        (28.9, -28.9, -24.1)])   #right mouth corner

    #3d model eye points- center of the eye ball
    Eye_ball_center_right = np.array([[-29.05], [32.7], [-39.5]])
    Eye_ball_center_left = np.array([[29.05], [32.7], [-39.5]])

    camera_matrix, dist_coeffs, success, rotation_vector, translation_vector = calibrate(
        frame, model_points, image_points)

    #NOTE(review): the left/right labels follow this notebook's own naming - confirm
    #them against MediaPipe's iris landmark convention before relying on them
    left_pupil = relative(points.landmark[468], shape)
    right_pupil = relative(points.landmark[473], shape)

    _, transformation, _ = cv2.estimateAffine3D(image_points1, model_points)

    #without a pose and an image->model transform there is nothing meaningful to draw
    if not success or transformation is None:
        return

    gaze_left = _gaze_end_point(left_pupil, Eye_ball_center_left, transformation,
                                rotation_vector, translation_vector, camera_matrix, dist_coeffs)
    gaze_right = _gaze_end_point(right_pupil, Eye_ball_center_right, transformation,
                                 rotation_vector, translation_vector, camera_matrix, dist_coeffs)

    #draw gaze line into screen
    PUPIL = [[int(left_pupil[0]), int(left_pupil[1])], [int(right_pupil[0]), int(right_pupil[1])]]
    GAZE = [[int(gaze_left[0]), int(gaze_left[1])], [int(gaze_right[0]), int(gaze_right[1])]]
    cv2.line(frame, PUPIL[0], GAZE[0], (255, 255, 255), 4)
    cv2.line(frame, PUPIL[1], GAZE[1], (255, 255, 255), 4)

In [6]:
def euclideanDist(point1, point2):
    """Return the straight-line distance between two ``(x, y)`` points."""
    (ax, ay), (bx, by) = point1, point2
    dx = bx - ax
    dy = by - ay
    return math.sqrt(dx ** 2 + dy ** 2)

def blinkRatio(image, landmarks, right_indices, left_indices, ratio_threshold=5.5):
    """Decide whether the eyes look closed from their width/height ratio.

    For each eye the horizontal extent (positions 0 and 8 of its index list)
    and the vertical extent (positions 12 and 4) are measured, and both axes
    are drawn onto ``image`` as black lines. The two width/height ratios are
    averaged and compared against ``ratio_threshold``.

    Args:
        image: image the measurement axes are drawn on, in place.
        landmarks: sequence of ``(x, y)`` points indexed by landmark id.
        right_indices: landmark ids of the right eye.
        left_indices: landmark ids of the left eye.
        ratio_threshold: average width/height ratio above which the eyes
            count as closed; defaults to the previously hard-coded 5.5.

    Returns:
        True when the averaged ratio exceeds ``ratio_threshold`` or when
        either eye has zero vertical extent, otherwise False.
    """
    def measure_eye(indices):
        #horizontal end points, then top and bottom points of the eye
        corner_a = landmarks[indices[0]]
        corner_b = landmarks[indices[8]]
        top = landmarks[indices[12]]
        bottom = landmarks[indices[4]]
        cv2.line(image, corner_a, corner_b, (0, 0, 0), 2)
        cv2.line(image, top, bottom, (0, 0, 0), 2)
        return euclideanDist(corner_a, corner_b), euclideanDist(top, bottom)

    right_width, right_height = measure_eye(right_indices)
    left_width, left_height = measure_eye(left_indices)

    #a zero vertical extent would divide by zero; report the eyes as closed
    if not right_height or not left_height:
        return True

    ratio = (right_width / right_height + left_width / left_height) / 2
    return ratio > ratio_threshold

In [7]:
def landmarksDetection(image, results):
    """Return the first face's landmarks as a list of integer ``(x, y)`` pixel tuples.

    Each landmark's ``x`` is multiplied by the image width and its ``y`` by the
    image height (taken from ``image.shape[:2]``), then truncated with ``int``.
    Only ``results.multi_face_landmarks[0]`` is read.
    """
    height, width = image.shape[:2]
    first_face = results.multi_face_landmarks[0]
    mesh_coord = []
    for point in first_face.landmark:
        mesh_coord.append((int(point.x * width), int(point.y * height)))
    return mesh_coord

In [8]:
def head_track(image, results):
    """Estimate the head orientation of every detected face and annotate ``image``.

    Six landmarks per face (ids 1, 33, 61, 199, 263, 291) are passed to
    ``calibrate`` as both 2D image points and pseudo-3D object points
    (pixel x/y plus the landmark's ``z``). The resulting rotation is
    decomposed into angles, a direction label is chosen from them, and a line
    from the nose (landmark 1) plus the label are drawn onto ``image`` in place.

    Args:
        image: image to draw on; ``shape[:2]`` supplies (height, width).
        results: object whose ``multi_face_landmarks`` is an iterable of faces,
            each exposing a ``landmark`` sequence with ``x``, ``y``, ``z``.

    Returns:
        None. Faces for which the pose solve fails are skipped.
    """
    img_h, img_w = image.shape[:2]
    #ascending order keeps the point order identical to a scan over all landmarks
    pose_landmark_ids = (1, 33, 61, 199, 263, 291)

    for face_landmarks in results.multi_face_landmarks:
        #fresh lists per face: they are turned into arrays below, so sharing them
        #across iterations would break on (and mix points with) a second face
        face_2d = []
        face_3d = []
        for idx in pose_landmark_ids:
            lm = face_landmarks.landmark[idx]
            px, py = int(lm.x * img_w), int(lm.y * img_h)
            face_2d.append([px, py])
            face_3d.append([px, py, lm.z])

        nose = face_landmarks.landmark[1]
        nose_2d = (nose.x * img_w, nose.y * img_h)

        face_2d = np.array(face_2d, dtype=np.float64)
        face_3d = np.array(face_3d, dtype=np.float64)
        _, _, success, rot_vec, _ = calibrate(image, face_3d, face_2d)
        if not success:
            #no usable rotation for this face; leave the frame unannotated
            continue

        rmat, _ = cv2.Rodrigues(rot_vec)
        angles = cv2.RQDecomp3x3(rmat)[0]
        #NOTE(review): the 360 scale factor is kept unchanged so the +/-10 thresholds
        #and the drawn line length behave as before - confirm the intended units
        x = angles[0] * 360
        y = angles[1] * 360

        if y < -10:
            text = "Looking Left"
        elif y > 10:
            text = "Looking Right"
        elif x < -10:
            text = "Looking Down"
        elif x > 10:
            text = "Looking Up"
        else:
            text = "Forward"

        #line from the nose, offset by the scaled angles
        point1 = (int(nose_2d[0]), int(nose_2d[1]))
        point2 = (int(nose_2d[0] + y * 10), int(nose_2d[1] - x * 10))
        cv2.line(image, point1, point2, (255, 255, 255), 3)
        cv2.putText(image, text, [100, 140], cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

In [None]:
#index of the capture device to open
CAMERA_INDEX = 1

cap = cv2.VideoCapture(CAMERA_INDEX)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
CCFRAME= 0       #frames read from the camera
frame_count= 0   #frames in which a face was detected

if not cap.isOpened():
    print('Could not open camera index ' + str(CAMERA_INDEX))

#release the camera and close the window even if processing a frame raises
try:
    with mp_face_mesh.FaceMesh(max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5, min_tracking_confidence=0.5) as face:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            CCFRAME+= 1
            #FaceMesh is fed an RGB copy; it is marked read-only while being processed
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = face.process(image)
            image.flags.writeable = True
            #back to BGR for drawing and display
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            if results.multi_face_landmarks:
                head_track(image, results)
                frame_count+= 1
                coord= landmarksDetection(image, results)
                #gaze is only estimated while the eyes are judged open
                if blinkRatio(image, coord, RIGHT_EYE, LEFT_EYE):
                    cv2.putText(image, 'Closed Eyes', [100, 100], cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
                else:
                    gaze(image, results.multi_face_landmarks[0])

            cv2.imshow('Output Window', image)
            #press q to quit
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
