<a href="https://colab.research.google.com/github/linainsaf/Dual-Attention-Guided-Gaze-Target-Detection-in-the-Wild/blob/main/3D_gaze_estimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Depth Estimation Network

In [1]:
import cv2
import torch
import urllib.request
import pickle
import matplotlib.pyplot as plt
import torch
import numpy as np

## Gazefollow 3D Gaze estimation

In [2]:
import torchvision
from torchvision import models 
from torchvision import transforms
import torch.nn as nn
from PIL import Image
from matplotlib import cm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torchvision import datasets
import math
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load data and models

In [3]:
# Load the pickled head crops. The file is opened in a `with` block so the
# handle is closed as soon as the data has been read.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open("drive/MyDrive/MLA/imgs_heads_gaze_follow.pkl", "rb") as heads_file:
    heads = pickle.load(heads_file)

# Resize every usable crop to 224x224; remember the index of each crop for
# which `.any()` is False (empty or all-zero) so it can be dropped later.
removed = []
for i, head in enumerate(heads):
    if head.any():
        heads[i] = cv2.resize(head, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
    else:
        removed.append(i)
print(removed)

[22, 2026]


In [4]:
# Load the pickled (left, right) eye crops; `with` guarantees the file handle
# is closed after reading.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open("drive/MyDrive/MLA/eyes.pkl", "rb") as eyes_file:
    eyes_l, eyes_r = pickle.load(eyes_file)

In [5]:
# Drop every head crop that was flagged as unusable. Deleting from the highest
# index downwards keeps the remaining indices valid, and works for any number
# of flagged entries (not only exactly two).
for removed_idx in sorted(removed, reverse=True):
    del heads[removed_idx]

# The first remaining sample of each list is discarded as well.
# NOTE(review): the reason is not visible in this notebook -- presumably it
# keeps heads and eye crops aligned; confirm against how eyes.pkl was built.
# NOTE: this cell mutates the lists in place, so it must be run exactly once
# after the loading cells.
del heads[0]

del eyes_l[0]
del eyes_r[0]

len(eyes_r),len(eyes_l), len(heads)

(3997, 3997, 3997)

In [6]:
# Load the saved head-pose checkpoint into a notebook-level variable.
# NOTE(review): head_pose_extraction() loads this same file again into its own
# local `model_hp`, so that function does not use this global -- confirm
# whether this cell is still needed.
model_hp = torch.load("drive/MyDrive/MLA/head_pose_extractor.pth")#, map_location='cpu')

In [7]:
def createModel_resnet34(out_1, out_2):
    """Build a ResNet-34 whose classifier is replaced by a 3-layer head.

    The pretrained backbone is frozen; only the new head
    (Linear -> LeakyReLU -> Linear -> LeakyReLU -> Linear) stays trainable.

    Args:
        out_1: width of the first hidden layer of the head.
        out_2: width of the second hidden layer of the head.

    Returns:
        The model, converted to double precision and moved to `device`.
        Its final layer has 2 outputs.
    """
    backbone = torchvision.models.resnet34(pretrained = True)

    # Freeze everything that came with the pretrained network.
    for weight in backbone.parameters():
        weight.requires_grad = False

    # New layers are created after freezing, so they remain trainable.
    in_dim = backbone.fc.in_features
    head_layers = [
        nn.Linear(in_dim, out_1),
        nn.Linear(out_1, out_2),
        nn.Linear(out_2, 2),
    ]

    # He (Kaiming) initialisation matched to the LeakyReLU activations.
    for layer in head_layers:
        nn.init.kaiming_uniform_(layer.weight, nonlinearity='leaky_relu')

    first, second, last = head_layers
    backbone.fc = nn.Sequential(first, nn.LeakyReLU(), second, nn.LeakyReLU(), last)

    # Double precision, then move the model to the selected device.
    backbone.double()
    backbone.to(device)
    return backbone

In [8]:
def createModel_MLP(hidden,tensor_size):
    """Build the small MLP used for gaze regression.

    Args:
        hidden: number of units in the single hidden layer.
        tensor_size: number of input features per sample (after flattening).

    Returns:
        An ``nn.Sequential`` (Flatten -> Linear -> ReLU -> Linear) with 3
        outputs, in double precision and moved to `device`.
    """
    # Layer positions matter: checkpoint keys are derived from these indices.
    stack = [
        nn.Flatten(),
        nn.Linear(tensor_size, hidden),
        nn.ReLU(),
        nn.Linear(hidden, 3),
    ]
    mlp = nn.Sequential(*stack)

    # Double precision first, then placement on the selected device.
    return mlp.double().to(device)

In [9]:
class eyeDataset(Dataset):
    """In-memory dataset of eye-crop images.

    Each item is a dict ``{'image': ...}`` holding the crop converted to a PIL
    image and, when a transform is given, passed through that transform.
    """

    def __init__(self, imgs, transform=None):
        self.transform = transform
        self.imgs = imgs

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        # Samplers may hand over tensor indices; convert them to plain Python.
        index = idx.tolist() if torch.is_tensor(idx) else idx

        picture = Image.fromarray(self.imgs[index])
        if self.transform:
            picture = self.transform(picture)

        return {'image': picture}

In [10]:
def _extract_eye_features(eye_imgs, backbone, preprocess, feature_dim, batch_size):
    """Run every image of `eye_imgs` through `backbone` and stack the flattened outputs."""
    loader = DataLoader(
        eyeDataset(imgs=eye_imgs, transform=preprocess),
        batch_size=batch_size, shuffle=False, num_workers=2,
    )

    chunks = []
    # Inference only: no autograd graph is needed, which also keeps memory flat.
    with torch.no_grad():
        # Iterate the loader itself so that each batch is visited exactly once
        # (calling next(iter(loader)) repeatedly would always yield batch 0).
        for batch in loader:
            images = batch['image'].to(device)
            chunks.append(backbone(images).flatten(start_dim=1))

    if not chunks:
        # No images: return an empty feature matrix of the right width.
        return torch.zeros((0, feature_dim), device=device)
    return torch.cat(chunks, 0)


def get_eye_features(left_eye_imgs, right_eye_imgs, batch_size=571):
    """Extract ResNet-18 features for the left and right eye crops.

    Args:
        left_eye_imgs: sequence of left-eye images (arrays accepted by
            ``PIL.Image.fromarray``).
        right_eye_imgs: sequence of right-eye images, same format.
        batch_size: number of images per forward pass (default 571, the value
            previously hard-coded).

    Returns:
        ``(features_left, features_right)``: two tensors on `device`, one row
        per image, in the same order as the inputs. The row width is the input
        width of ResNet-18's final fully connected layer.
    """
    # Pretrained ResNet-18 with its final fully connected layer removed.
    resnet = models.resnet18(pretrained=True)
    feature_dim = resnet.fc.in_features
    resnet18 = nn.Sequential(*(list(resnet.children())[:-1]))
    resnet18.to(device)
    # Evaluation mode must be set before BOTH passes so BatchNorm uses its
    # stored statistics for the left eye as well as the right eye.
    resnet18.eval()

    # ImageNet normalisation expected by the pretrained weights.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

    features_left = _extract_eye_features(left_eye_imgs, resnet18, preprocess, feature_dim, batch_size)
    features_right = _extract_eye_features(right_eye_imgs, resnet18, preprocess, feature_dim, batch_size)

    return features_left, features_right


In [11]:
class headposeDataset(Dataset):
    """In-memory dataset of head-crop images for the head-pose network.

    Each item is a dict ``{'image': ...}`` holding the crop converted to a PIL
    image and, when a transform is given, passed through that transform.
    """

    def __init__(self, imgs, transform=None):
        self.transform = transform
        self.imgs = imgs

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        # Samplers may hand over tensor indices; convert them to plain Python.
        index = idx.tolist() if torch.is_tensor(idx) else idx

        picture = Image.fromarray(self.imgs[index])
        if self.transform:
            picture = self.transform(picture)

        return {'image': picture}

In [12]:
class MLPDataset(Dataset):
    """Thin dataset wrapper around an indexable collection of MLP inputs.

    Items are returned as stored, or passed through `transform` when one is
    given.
    """

    def __init__(self, data_in, transform=None):
        self.transform = transform
        self.data_in = data_in

    def __len__(self):
        return len(self.data_in)

    def __getitem__(self, idx):
        # Samplers may hand over tensor indices; convert them to plain Python.
        index = idx.tolist() if torch.is_tensor(idx) else idx
        item = self.data_in[index]
        return self.transform(item) if self.transform else item

In [13]:
def head_pose_extraction(head_imgs, batch_size=128):
    """Predict the head pose for every head crop.

    Args:
        head_imgs: sequence of head images (arrays accepted by
            ``PIL.Image.fromarray``).
        batch_size: number of images per forward pass (default 128, the value
            previously hard-coded).

    Returns:
        A double-precision tensor on `device` with one row per image and
        2 columns (the output width of the head built by createModel_resnet34).
    """
    # The model is in double precision, so inputs are cast to double before
    # the ImageNet normalisation.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.double()),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    head_dataset = headposeDataset(imgs=head_imgs, transform=preprocess)
    head_loader = torch.utils.data.DataLoader(head_dataset, batch_size=batch_size, num_workers=2)

    # Rebuild the architecture and load the trained weights. map_location lets
    # a checkpoint saved on GPU be loaded on whatever `device` is selected.
    model_hp = createModel_resnet34(512, 128)
    model_hp.load_state_dict(
        torch.load("drive/MyDrive/MLA/head_pose_extractor.pth", map_location=device)
    )
    model_hp.eval()

    predictions = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in head_loader:
            predictions.append(model_hp(batch["image"].to(device)))

    if not predictions:
        # No images: return an empty prediction matrix of the right width.
        return torch.zeros((0, 2), dtype=torch.float64, device=device)
    return torch.cat(predictions, 0)


In [14]:
#we multiply this operator with the features extracted from the resnet
def l_op(l_imgs, r_imgs, target_device=None):
    """Compute a per-sample 0/1 indicator of whether any eye crop has content.

    For each sample the pixel values of the left and right eye image are
    summed; the indicator is 1 when that total is non-zero and 0 otherwise.

    Args:
        l_imgs: sequence of left-eye images, all of the same 3-D shape.
        r_imgs: sequence of right-eye images, same length and format.
        target_device: device on which to build the result. Defaults to CUDA
            when available and CPU otherwise, instead of unconditionally
            requiring a GPU.

    Returns:
        A 1-D int32 tensor with one 0/1 entry per sample.
    """
    if target_device is None:
        target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    l_imgs = torch.as_tensor(np.array(l_imgs)).to(target_device)
    r_imgs = torch.as_tensor(np.array(r_imgs)).to(target_device)

    # Sum over every axis except the sample axis.
    eyes_b = l_imgs.sum(dim=(3, 2, 1)) + r_imgs.sum(dim=(3, 2, 1))

    # Non-zero total -> 1, zero total -> 0.
    return eyes_b.bool().int()

def lop_eyes_features( lop, eyes_features):
    """Scale each row of `eyes_features` by the matching entry of `lop`.

    Args:
        lop: 1-D tensor with one multiplier per sample (0/1 as produced by
            l_op).
        eyes_features: tensor whose first dimension indexes samples.

    Returns:
        A new tensor of the same shape and dtype as `eyes_features`; the input
        tensor is left unmodified.

    Raises:
        ValueError: if `lop` and `eyes_features` have different lengths.
    """
    if len(lop) != len(eyes_features):
        raise ValueError(
            "lop and eyes_features must have the same number of samples "
            "(got {} and {})".format(len(lop), len(eyes_features))
        )

    # Match device/dtype so the product keeps the feature dtype, then add
    # trailing singleton axes so the multiplier broadcasts across each row.
    mask = torch.as_tensor(lop).to(device=eyes_features.device, dtype=eyes_features.dtype)
    mask = mask.reshape(-1, *([1] * (eyes_features.dim() - 1)))
    return eyes_features * mask


def mlp_gaze_estimation(prediction_hp, features_left, features_right, left_eye_imgs, right_eye_imgs, batch_size=128):
    """Predict gaze from head pose and eye features with the trained MLP.

    Args:
        prediction_hp: head-pose predictions, one row per sample.
        features_left: left-eye feature tensor, one row per sample.
        features_right: right-eye feature tensor, one row per sample.
        left_eye_imgs: left-eye images, used only to build the L operator.
        right_eye_imgs: right-eye images, used only to build the L operator.
        batch_size: number of samples per forward pass (default 128, the value
            previously hard-coded).

    Returns:
        A double-precision tensor on `device` with one row per sample and
        3 columns (the output width of the network built by createModel_MLP).
    """
    # Head-pose input.
    head_pose = torch.as_tensor(prediction_hp).to(device)

    # Eye features: concatenate left/right, then multiply by the L operator.
    eyes_features = torch.cat((features_left, features_right), 1)
    lop = l_op(left_eye_imgs, right_eye_imgs)
    masked_eyes = lop_eyes_features(lop, eyes_features)

    # Concatenate with the head pose and standardise each sample (row) to zero
    # mean and unit standard deviation.
    mlp_input = torch.cat((head_pose, masked_eyes), 1).clone().detach()
    mlp_input = ((mlp_input.T - mlp_input.mean(axis = 1))/mlp_input.std(axis = 1)).T
    # The MLP is in double precision.
    mlp_input = mlp_input.double().to(device)

    # Rebuild the architecture and load the trained weights. map_location lets
    # a checkpoint saved on GPU be loaded on whatever `device` is selected.
    tensor_size = mlp_input.shape[1]
    model_gaze = createModel_MLP(256,tensor_size)
    model_gaze.load_state_dict(torch.load("drive/MyDrive/MLA/MLP.pth", map_location=device))
    model_gaze.eval()

    # The input already lives on `device`, so it is sliced directly instead of
    # going through DataLoader worker processes. Outputs are collected in a
    # list so no placeholder row of a guessed width is needed.
    outputs = []
    with torch.no_grad():
        for batch in torch.split(mlp_input, batch_size):
            outputs.append(model_gaze(batch))

    gaze_prediction = torch.cat(outputs, 0)
    return gaze_prediction

In [15]:
def gaze_estimation(head_imgs, left_eye_imgs, right_eye_imgs):
    """Run the full gaze pipeline: head pose, eye features, then the MLP.

    Args:
        head_imgs: head crops passed to head_pose_extraction.
        left_eye_imgs: left-eye crops.
        right_eye_imgs: right-eye crops.

    Returns:
        Whatever mlp_gaze_estimation returns for these inputs.
    """
    # Stage 1: head pose.
    head_pose = head_pose_extraction(head_imgs)
    print("head pose estimated")

    # Stage 2: per-eye features.
    left_features, right_features = get_eye_features(left_eye_imgs, right_eye_imgs)
    print("eyes features extracted")

    # Stage 3: gaze regression from pose + eye features.
    gaze = mlp_gaze_estimation(head_pose, left_features, right_features, left_eye_imgs, right_eye_imgs)
    print("3D gaze predicted")

    return gaze


In [None]:
# Run the full pipeline (head pose -> eye features -> MLP) on the head and eye
# crops prepared in the cells above.
prediction = gaze_estimation(heads, eyes_l, eyes_r)

head pose estimated
