In [None]:
!git clone https://github.com/xucong-zhang/data-preprocessing-gaze.git

fatal: destination path 'data-preprocessing-gaze' already exists and is not an empty directory.


In [None]:
from sys import setdlopenflags
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.models import efficientnet

class model(nn.Module):
    """Gaze regressor built on the EfficientNet-B0 feature extractor.

    ``forward`` takes a dict with two entries:

    * ``'eye'``: image batch fed to the convolutional trunk. The first trunk
      layer is a 1-channel ``Conv2d``, so the batch must be laid out as
      ``(N, 1, H, W)``. The 5120-wide input of ``FC`` presumably corresponds to
      the 36x60 patches produced by ``normalizeData`` -- TODO confirm.
    * ``'head_pose'``: ``(N, 2)`` tensor concatenated to the 1024-d image
      feature before the output head.

    The output is an ``(N, 2)`` tensor.
    """

    def __init__(self):
        super().__init__()

        # Named `backbone` so the local does not shadow the imported
        # `torchvision.models.efficientnet` module.
        backbone = torchvision.models.efficientnet_b0(weights='DEFAULT')
        self.convNet = backbone.features
        # Replace the first feature block with a plain 1-channel convolution so
        # the trunk accepts single-channel (grayscale) input.
        self.convNet[0] = nn.Conv2d(1, 32, 3, stride=2)
        self.FC = nn.Sequential(
            nn.Linear(5120, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5)
        )

        # The "+2" is the head-pose pair appended in forward().
        self.output = nn.Sequential(
            nn.Linear(1024 + 2, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(1024, 2),
        )

    def forward(self, x_in):
        """Run the network on ``x_in['eye']`` and ``x_in['head_pose']``."""
        feature = self.convNet(x_in['eye'])
        feature = torch.flatten(feature, start_dim=1)
        feature = self.FC(feature)
        feature = torch.cat((feature, x_in['head_pose']), 1)
        gaze = self.output(feature)
        return gaze

    def _init_weights(self):
        """Kaiming-uniform init for every ``nn.Linear`` weight, zeros for its bias.

        Not called by ``__init__``; callers must invoke it explicitly.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")
                nn.init.zeros_(m.bias)

    def load_model(self, path, device):
        """Move the module to ``device``, load a state dict from ``path``, switch to eval mode.

        The checkpoint is read with ``weights_only=True``: the file is passed
        straight to ``load_state_dict``, so it only needs tensors, and the
        restricted unpickler avoids executing arbitrary pickled code.
        """
        self.to(device)
        state_dict = torch.load(path, map_location=device, weights_only=True)
        self.load_state_dict(state_dict)
        self.eval()

In [None]:
import os
import cv2
import numpy as np
import scipy.io as sio
import dlib
# Location of dlib's 68-point landmark model file; it must exist at this relative path.
predictor_path = "shape_predictor_68_face_landmarks.dat/shape_predictor_68_face_landmarks.dat"
# Module-level detector/predictor instances; fetch_eyes() reads these globals.
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)  # Replace predictor_path above if the model lives elsewhere
def get_facial_landmarks(image, detector, predictor):
    """Detect the largest face in a BGR image and return its landmarks.

    Returns:
        (keypoints, points): ``points`` is a (68, 2) array of landmark x/y
        coordinates; ``keypoints`` is the (6, 2) subset at indices
        36, 39, 42, 45, 48, 54. ``(None, None)`` when no face is detected.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    detections = detector(gray)

    if len(detections) == 0:
        return None, None

    def _area(rect):
        return rect.width() * rect.height()

    # With several detections, keep the one with the largest bounding box.
    largest_face = max(detections, key=_area)

    shape = predictor(gray, largest_face)
    parts = [shape.part(idx) for idx in range(68)]
    points = np.array([[part.x, part.y] for part in parts])

    # Six landmarks matched against the six-point generic face model
    # (presumably the four eye corners followed by the two mouth corners
    # in the 68-point scheme -- TODO confirm ordering).
    keypoint_indices = [36, 39, 42, 45, 48, 54]
    return points[keypoint_indices], points

def draw_gaze(image_in, pitchyaw, thickness=2, color=(0, 0, 255)):
    """Draw a gaze arrow starting at the image centre.

    ``pitchyaw`` is indexed as ``[pitch, yaw]`` and passed to ``np.sin`` /
    ``np.cos``, i.e. radians. A single-channel input is converted to BGR first
    (so the arrow is drawn on a new array); a colour input is drawn on in place.
    The image carrying the arrow is returned.
    """
    height, width = image_in.shape[:2]
    canvas = image_in
    is_gray = len(canvas.shape) == 2 or canvas.shape[2] == 1
    if is_gray:
        canvas = cv2.cvtColor(canvas, cv2.COLOR_GRAY2BGR)

    # Arrow length is half of the shorter image side.
    arrow_len = np.min([height, width]) / 2.0
    centre = (int(width / 2.0), int(height / 2.0))

    pitch, yaw = pitchyaw[0], pitchyaw[1]
    dx = -arrow_len * np.sin(yaw) * np.cos(pitch)
    dy = -arrow_len * np.sin(pitch)

    # Draw the gaze line.
    start = tuple(np.round(centre).astype(int))
    tip = tuple(np.round([centre[0] + dx, centre[1] + dy]).astype(int))
    cv2.arrowedLine(canvas, start, tip, color, thickness, cv2.LINE_AA, tipLength=0.2)

    return canvas

def estimateHeadPose(landmarks, face_model, camera, distortion, iterate=True):
    """Estimate head rotation/translation vectors with ``cv2.solvePnP``.

    ``face_model`` supplies the object points and ``landmarks`` the image
    points. An EPnP solve runs first; when ``iterate`` is true its result seeds
    a second solve that uses solvePnP's default method with the extrinsic guess.

    Returns:
        (rvec, tvec) from the last solve that ran.
    """
    _, rvec, tvec = cv2.solvePnP(
        face_model, landmarks, camera, distortion, flags=cv2.SOLVEPNP_EPNP
    )

    if not iterate:
        return rvec, tvec

    # Refine, starting from the EPnP estimate.
    _, rvec, tvec = cv2.solvePnP(
        face_model, landmarks, camera, distortion,
        rvec=rvec, tvec=tvec, useExtrinsicGuess=True,
    )
    return rvec, tvec

def normalizeData(img, face, hr, ht, gc, cam):
    """Produce a normalized eye patch, head rotation and gaze direction per eye.

    Args:
        img: BGR image (converted to grayscale here).
        face: 3xN face model; columns 0/1 and 2/3 are averaged to get the two
            eye centres after the model is moved by the head pose.
        hr: head rotation vector (converted with ``cv2.Rodrigues``).
        ht: head translation, reshaped to (3, 1).
        gc: gaze target, reshaped to (3, 1).
        cam: 3x3 camera matrix (inverted when building the warp).

    Returns:
        A two-element list (columns 0/1 eye first, then columns 2/3 eye), each
        entry being ``[warped_image, normalized_head_rvec, unit_gaze_vector]``.
    """
    focal_norm = 960
    distance_norm = 600
    roiSize = (60, 36)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    head_trans = ht.reshape((3, 1))
    target = gc.reshape((3, 1))
    head_rot = cv2.Rodrigues(hr)[0]

    # Face model expressed in camera coordinates.
    face_cam = np.dot(head_rot, face) + head_trans
    eye_centres = (
        0.5 * (face_cam[:, 0] + face_cam[:, 1]).reshape((3, 1)),
        0.5 * (face_cam[:, 2] + face_cam[:, 3]).reshape((3, 1)),
    )

    def _normalize_eye(centre):
        """Build the warp for one eye centre and apply it."""
        distance = np.linalg.norm(centre)

        cam_norm = np.array([
            [focal_norm, 0, roiSize[0] / 2],
            [0, focal_norm, roiSize[1] / 2],
            [0, 0, 1.0],
        ])
        # Scales depth so the eye ends up at distance_norm.
        scaling = np.array([
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
            [0.0, 0.0, distance_norm / distance],
        ])

        # Rotation whose z-axis points at the eye and whose x-axis is derived
        # from the head's x-axis.
        forward = (centre / distance).reshape(3)
        down = np.cross(forward, head_rot[:, 0])
        down /= np.linalg.norm(down)
        right = np.cross(down, forward)
        right /= np.linalg.norm(right)
        rot = np.c_[right, down, forward].T

        warp = np.dot(np.dot(cam_norm, scaling), np.dot(rot, np.linalg.inv(cam)))

        patch = cv2.warpPerspective(gray, warp, roiSize)
        patch = cv2.equalizeHist(patch)

        head_rvec_norm = cv2.Rodrigues(np.dot(rot, head_rot))[0]

        gaze_dir = np.dot(rot, target - centre)
        gaze_dir = gaze_dir / np.linalg.norm(gaze_dir)

        return [patch, head_rvec_norm, gaze_dir]

    return [_normalize_eye(centre) for centre in eye_centres]

def fetch_eyes(path):
    """Load an image, estimate head pose and write normalized eye patches.

    Steps: read the camera calibration shipped with ``data-preprocessing-gaze``,
    undistort the image, detect six facial landmarks with the module-level dlib
    ``detector``/``predictor``, solve the head pose against the generic face
    model, run ``normalizeData`` and save one normalized patch per eye as
    ``img_normalized_<right|left>(<image name>).jpg`` in the working directory.

    Args:
        path: path of the image to process.

    Returns:
        ``(data, hr, ht, general_landmarks, camera_matrix, camera_distortion)``
        where ``data`` is the list returned by ``normalizeData`` (index 0 is
        written as the "right" eye, index 1 as the "left" eye), ``hr``/``ht``
        are the head rotation/translation vectors and ``general_landmarks`` is
        the full 68-point landmark array.

    Raises:
        RuntimeError: the calibration file could not be read.
        FileNotFoundError: the image could not be read.
        ValueError: no face was detected in the image.
    """
    calib_path = 'data-preprocessing-gaze/data/calibration/cameraCalib.xml'
    fid = cv2.FileStorage(calib_path, cv2.FileStorage_READ)
    try:
        camera_matrix = fid.getNode("camera_matrix").mat()
        camera_distortion = fid.getNode("cam_distortion").mat()
    finally:
        # Release the file handle even if reading a node fails.
        fid.release()
    if camera_matrix is None or camera_distortion is None:
        raise RuntimeError(f"could not read camera calibration from {calib_path}")
    print(camera_matrix,camera_distortion)

    filepath = os.fspath(path)
    img_original = cv2.imread(filepath)
    if img_original is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {filepath}")
    img = cv2.undistort(img_original, camera_matrix, camera_distortion)

    # detector and predictor are the module-level dlib objects.
    landmarks, general_landmarks = get_facial_landmarks(img, detector, predictor)
    if landmarks is None:
        raise ValueError(f"no face detected in image: {filepath}")

    face = np.loadtxt('data-preprocessing-gaze/data/faceModelGeneric.txt')
    num_pts = face.shape[1]
    facePts = face.T.reshape(num_pts, 1, 3)
    print(facePts)
    landmarks = landmarks.astype(np.float32)
    landmarks = landmarks.reshape(num_pts, 1, 2)
    hr, ht = estimateHeadPose(landmarks, facePts, camera_matrix, camera_distortion)

    # Fixed placeholder gaze target; it only affects the third element of each
    # entry of `data`, not the warped images or the head pose.
    gc = np.array([-127.790719, 4.621111, -12.025310])

    data = normalizeData(img, face, hr, ht, gc, camera_matrix)

    # Save the normalized patch of each eye. (A gaze overlay used to be drawn
    # after saving and then discarded, so it never reached the file; that dead
    # step is dropped.)
    for eye_name, eye_entry in zip(("right", "left"), data):
        cv2.imwrite(f'img_normalized_{eye_name}({os.path.basename(filepath)}).jpg', eye_entry[0])

    # Eye centres in face-model coordinates: midpoint of model points 0/1 for
    # the right eye and 2/3 for the left eye, matching normalizeData.
    right_eye_center = face[:, 0:2].mean(axis=1)
    left_eye_center = face[:, 2:4].mean(axis=1)
    print("Right eye center:", right_eye_center)
    print("Left eye center:", left_eye_center)

    return data, hr, ht, general_landmarks, camera_matrix,camera_distortion

if __name__=="__main__":
    # Change this to the folder that holds your own face images.
    sample_dir = "/content/drive/MyDrive/Face_sample"
    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    # The loop variable is no longer called `dir`, which shadowed the builtin.
    for file_name in sorted(os.listdir(sample_dir)):
        if file_name.endswith(".jpg"):
            fetch_eyes(os.path.join(sample_dir, file_name))

[[994.73532636   0.         624.66344095]
 [  0.         998.16646784 364.08742557]
 [  0.           0.           1.        ]] [[-0.16321888  0.66783406 -0.00121854 -0.00303158 -1.02159927]]
[[[-4.50967681e+01 -4.83773045e-01  2.39702984e+00]]

 [[-2.13128582e+01  4.83773045e-01 -2.39702984e+00]]

 [[ 2.13128582e+01  4.83773045e-01 -2.39702984e+00]]

 [[ 4.50967681e+01 -4.83773045e-01  2.39702984e+00]]

 [[-2.62995769e+01  6.85950353e+01 -9.86076132e-32]]

 [[ 2.62995769e+01  6.85950353e+01 -9.86076132e-32]]]
Right eye center: [-39.15079064  -0.24188652   1.19851492]
Left eye center: [-27.25883569   0.24188652  -1.19851492]
Right eye 3D position: [309.17509937  44.75130893 368.64569498]
Left eye 3D position: [316.38093984  44.52215028 358.87729028]
[[994.73532636   0.         624.66344095]
 [  0.         998.16646784 364.08742557]
 [  0.           0.           1.        ]] [[-0.16321888  0.66783406 -0.00121854 -0.00303158 -1.02159927]]
[[[-4.50967681e+01 -4.83773045e-01  2.39702984e+00

In [None]:
!pip install loguru



In [None]:
import torch
import numpy as np
import cv2
from cv2 import Rodrigues
from loguru import logger

def to2d_gaze(gaze):
    """Convert a 3D gaze vector into ``[yaw, pitch]`` in degrees.

    yaw = atan2(-x, -z) and pitch = asin(-y), so a vector pointing along -z
    maps to zero yaw and zero pitch.
    """
    gaze_x, gaze_y, gaze_z = gaze[0], gaze[1], gaze[2]
    yaw_rad = np.arctan2(-gaze_x, -gaze_z)
    pitch_rad = np.arcsin(-gaze_y)
    return [np.degrees(yaw_rad), np.degrees(pitch_rad)]

def to2d_head(head_pose):
    """Convert a head rotation vector into ``[yaw, pitch]`` in degrees.

    The rotation vector is expanded to a matrix with ``Rodrigues`` and the
    angles are taken from the matrix's last row:
    yaw = atan2(row[0], row[2]), pitch = -asin(row[1]).
    NOTE(review): this reads the last *row*, not the last column -- confirm this
    is the intended axis.
    """
    rotation_matrix = Rodrigues(head_pose)[0]
    last_row = rotation_matrix[-1]
    yaw_rad = np.arctan2(last_row[0], last_row[2])
    pitch_rad = np.arcsin(last_row[1])
    return [np.degrees(yaw_rad), -np.degrees(pitch_rad)]

def angles_to_vector(yaw, pitch):
    """
    Convert yaw and pitch angles to a unit gaze vector.

    This is the inverse of ``to2d_gaze`` (yaw = atan2(-x, -z),
    pitch = asin(-y)): zero yaw and pitch give (0, 0, -1). The previous
    formula returned (1, 0, 0) for zero angles, which did not match that
    convention.

    Parameters:
    yaw (float): Yaw angle (in degrees)
    pitch (float): Pitch angle (in degrees)

    Returns:
    torch.tensor: Gaze vector (x, y, z)
    """
    # Convert degrees to radians for trigonometric functions
    yaw = np.radians(yaw)
    pitch = np.radians(pitch)

    # Components chosen so that to2d_gaze(angles_to_vector(yaw, pitch))
    # recovers (yaw, pitch).
    x = -np.cos(pitch) * np.sin(yaw)
    y = -np.sin(pitch)
    z = -np.cos(pitch) * np.cos(yaw)

    return torch.tensor([x, y, z])

def calculate_gaze_point(eye_position, gaze_vector):
    """
    Intersect the gaze ray with the plane z = 0.

    Parameters:
    eye_position: Ray origin, indexable as (x, y, z)
    gaze_vector: Ray direction, indexable as (x, y, z)

    Returns:
    torch.tensor: (x, y) of the intersection, or None when the z component of
    the direction is exactly zero (the ray is parallel to the plane).
    """
    direction_z = gaze_vector[2]
    if direction_z == 0:
        return None  # No intersection with z = 0

    # Ray parameter at which origin_z + scale * direction_z == 0.
    scale = -eye_position[2] / direction_z
    hit_xy = [eye_position[axis] + scale * gaze_vector[axis] for axis in (0, 1)]

    return torch.tensor(hit_xy)

def get_spot(image_path):
    """Estimate where each eye's gaze ray crosses the plane z = 0.

    The image is processed by ``fetch_eyes`` (calibration, undistortion,
    landmarks, head pose, normalized eye patches); each patch is fed to the
    gaze network together with the 2D head pose, the predicted angles are
    turned into a vector with ``angles_to_vector`` and intersected with z = 0
    starting from the eye's 3D position in camera coordinates.

    Args:
        image_path: path of the image to analyse.

    Returns:
        ``(left_point, right_point)``; each is what ``calculate_gaze_point``
        returns (an (x, y) tensor, or None when the ray is parallel to the
        plane).

    NOTE(review): the network sees normalized eye patches, so its output is
    presumably expressed in the normalized camera frame; it is used here without
    rotating back to the real camera frame -- confirm whether that is intended.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model and process eyes
    net = model()
    net.load_model("GazeNet/Iter_1_GazeNet.pt", device)

    # fetch_eyes returns the camera matrix and distortion in the last two
    # slots (not eye positions); they are not needed again here.
    data, hr, ht, _landmarks, _camera_matrix, _camera_distortion = fetch_eyes(image_path)

    # Eye centres in camera coordinates: move the generic face model by the
    # head pose and average the two corner points of each eye, as
    # normalizeData does.
    face = np.loadtxt('data-preprocessing-gaze/data/faceModelGeneric.txt')
    face_cam = np.dot(cv2.Rodrigues(hr)[0], face) + ht.reshape((3, 1))
    right_eye_3D = torch.from_numpy(face_cam[:, 0:2].mean(axis=1))
    left_eye_3D = torch.from_numpy(face_cam[:, 2:4].mean(axis=1))
    print("Right eye 3D position:", right_eye_3D)
    print("Left eye 3D position:", left_eye_3D)

    def _eye_batch(eye_img):
        # (H, W) patch -> (1, 1, H, W): batch and channel axes go in front,
        # which is the layout Conv2d expects. Cast before moving to `device`
        # so the tensor stays on that device.
        return torch.from_numpy(eye_img).float()[None, None].to(device)

    img_right = _eye_batch(data[0][0])
    img_left = _eye_batch(data[1][0])
    print("img_right", tuple(img_right.shape))

    # Prepare eye and head pose data for inference
    head_pose = torch.tensor([to2d_head(hr)], dtype=torch.float32, device=device)
    eye_left = {"eye": img_left, "head_pose": head_pose}
    eye_right = {"eye": img_right, "head_pose": head_pose}

    # Predict gaze for left and right eye; no gradients are needed.
    with torch.no_grad():
        gaze_left_output = net(eye_left)   # shape (1, 2)
        gaze_right_output = net(eye_right)

    # Split the output into two scalars, read as (yaw, pitch).
    # Assumes the network outputs degrees, as angles_to_vector expects -- TODO confirm.
    yaw_left, pitch_left = gaze_left_output[0][0].item(), gaze_left_output[0][1].item()
    yaw_right, pitch_right = gaze_right_output[0][0].item(), gaze_right_output[0][1].item()
    print("gaze_left:",yaw_left,pitch_left)
    print("gaze_right:",yaw_right,pitch_right)

    # Convert yaw and pitch to gaze vectors
    gaze_left = angles_to_vector(yaw_left, pitch_left)
    gaze_right = angles_to_vector(yaw_right, pitch_right)
    print(gaze_left,gaze_right)

    # Calculate gaze points
    left_point = calculate_gaze_point(left_eye_3D, gaze_left)
    right_point = calculate_gaze_point(right_eye_3D, gaze_right)

    return left_point, right_point

# Run get_spot on each sample image and log the result at WARNING level.
# The trailing comments keep the outputs recorded by an earlier run.
for sample_image in (
    "/content/drive/MyDrive/Face_sample/down_left.jpg",   # (tensor([5604.6494,  -29.9152], dtype=torch.float64), tensor([5919.3479, -601.9444], dtype=torch.float64))
    "/content/drive/MyDrive/Face_sample/down_right.jpg",  # (tensor([2993.7550,  767.3470], dtype=torch.float64), tensor([17490.5185,  -998.2346], dtype=torch.float64))
    "/content/drive/MyDrive/Face_sample/top_left.jpg",    # (tensor([10789.9999,   627.1514], dtype=torch.float64), tensor([15016.3676, -1983.7483], dtype=torch.float64))
    "/content/drive/MyDrive/Face_sample/top_right.jpg",   # (tensor([11557.7793,  3258.8042], dtype=torch.float64), tensor([19544.6680,  2486.4446], dtype=torch.float64))
    "/content/WIN_20241001_16_54_37_Pro.jpg",             # (tensor([2715.8031,  224.1377], dtype=torch.float64), tensor([2597.8703, -316.0890], dtype=torch.float64))
    "/content/WIN_20241001_18_18_13_Pro.jpg",
    "/content/WIN_20241001_18_18_23_Pro.jpg",
    "/content/WIN_20241001_18_45_36_Pro.jpg",
    "/content/WIN_20241001_19_01_43_Pro.jpg",
):
    logger.warning(get_spot(sample_image))

  self.load_state_dict(torch.load(path,map_location=device))


[[994.73532636   0.         624.66344095]
 [  0.         998.16646784 364.08742557]
 [  0.           0.           1.        ]] [[-0.16321888  0.66783406 -0.00121854 -0.00303158 -1.02159927]]
[[[-4.50967681e+01 -4.83773045e-01  2.39702984e+00]]

 [[-2.13128582e+01  4.83773045e-01 -2.39702984e+00]]

 [[ 2.13128582e+01  4.83773045e-01 -2.39702984e+00]]

 [[ 4.50967681e+01 -4.83773045e-01  2.39702984e+00]]

 [[-2.62995769e+01  6.85950353e+01 -9.86076132e-32]]

 [[ 2.62995769e+01  6.85950353e+01 -9.86076132e-32]]]
Right eye center: [-39.15079064  -0.24188652   1.19851492]
Left eye center: [-27.25883569   0.24188652  -1.19851492]
Right eye 3D position: [-73.710247   -49.78488603 660.86892239]
Left eye 3D position: [-61.58693488 -49.16659421 661.07239115]
img_right tensor([[[[ 90.,  94.,  94.,  ..., 154., 148., 148.]],

         [[ 90.,  90.,  90.,  ..., 161., 154., 154.]],

         [[ 90.,  90.,  90.,  ..., 170., 170., 170.]],

         ...,

         [[224., 224., 216.,  ..., 242., 224., 2

RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 36, 1, 60] to have 1 channels, but got 36 channels instead