# MobileNetV2 with Synthetic Face Data

Dataset from [Kaggle](https://www.kaggle.com/datasets/allexmendes/synthetic-gaze-and-face-segmentation/data)

This notebook trains a gaze prediction model using a dual-input CNN:
- **Input 1**: 224×224 face image (see the other notebook for preprocessing)
- **Input 2**: 4D pupil coordinate vector (x and y of `L_Pupil` and `R_Pupil`, normalized by the image size)
- **Output**: 2D normalized gaze direction vector (from `ImageEyesGazeDirection` annotation)

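We use MobileNetV2 as the backbone for the visual stream. Its globally average-pooled features are concatenated with a small dense embedding of the pupil coordinates, and the combined vector is passed to the regression head.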

In [None]:
import os
import json
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Input, Concatenate, GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Load Dataset with Normalized Pupil Coordinates and Gaze Vectors

In [None]:
# Show the kernel's current working directory; relative paths used later
# (e.g. the 'best_model.keras' checkpoint) are resolved against it.
%pwd

In [None]:
# Root folder that contains the preprocessed dataset. Export GAZE_DATA_DIR to
# point the notebook at another location; the fallback is the path this
# notebook was originally written against.
DATA_DIR = os.environ.get(
    'GAZE_DATA_DIR', '/home/ste/Documents/gesture-gaze-extension/datasets')
# Face images (.png) and their per-image annotation files (.json).
IMG_DIR = os.path.join(DATA_DIR, 'SynthGazeProcessed/images')
JSON_DIR = os.path.join(DATA_DIR, 'SynthGazeProcessed/json')
# Image size in pixels; used to normalize the pupil coordinates.
IMG_SIZE = (224, 224)
# Seed for the train/validation split.
SEED = 42028

def load_dataset():
    """Load images, normalized pupil coordinates and gaze labels from disk.

    Walks the ``.json`` annotation files in ``JSON_DIR`` in sorted order and
    pairs each one with the ``.png`` of the same stem in ``IMG_DIR``.

    Returns:
        tuple: ``(images, pupils, labels)`` as index-aligned NumPy arrays.
            - images: RGB images converted to float32 and divided by 255.
            - pupils: ``[lx, ly, rx, ry]``, the ``L_Pupil`` / ``R_Pupil``
              landmarks with x divided by the width and y by the height
              taken from ``IMG_SIZE``.
            - labels: the ``Overall`` / ``ImageEyesGazeDirection`` annotation.

    Raises:
        FileNotFoundError: if the image belonging to an annotation file is
            missing or cannot be decoded.
    """
    # IMG_SIZE is a tuple, so it cannot be used as a divisor directly.
    # It is read here as (width, height); with the square 224x224 setting
    # the order makes no difference.
    width, height = IMG_SIZE

    images = []
    pupils = []
    labels = []

    for fname in sorted(os.listdir(JSON_DIR)):
        if not fname.endswith('.json'):
            continue

        json_path = os.path.join(JSON_DIR, fname)
        # Swap only the extension; str.replace would rewrite every '.json'
        # occurrence in the file name.
        img_path = os.path.join(IMG_DIR, os.path.splitext(fname)[0] + '.png')

        with open(json_path, 'r') as f:
            data = json.load(f)

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                f"Could not read image '{img_path}' for annotation '{json_path}'")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0

        l_pupil = data["Landmarks"]["L_Pupil"]
        r_pupil = data["Landmarks"]["R_Pupil"]
        # Normalize pupil coords: x by image width, y by image height.
        norm_pupils = [
            l_pupil[0] / width, l_pupil[1] / height,
            r_pupil[0] / width, r_pupil[1] / height
        ]

        # NOTE(review): the model in this notebook regresses 2 values, so this
        # annotation is assumed to have 2 components - confirm against the data.
        gaze = data["Overall"]["ImageEyesGazeDirection"]

        images.append(img)
        pupils.append(norm_pupils)
        labels.append(gaze)

    # NOTE(review): stacking assumes every image has the same shape.
    return np.array(images), np.array(pupils), np.array(labels)

# Load the whole dataset into memory. The loader defined in this notebook is
# `load_dataset`; the previous call used a name that is never defined.
X_img, X_pupil, y = load_dataset()

# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
X_img_train, X_img_val, X_pupil_train, X_pupil_val, y_train, y_val = train_test_split(
    X_img, X_pupil, y, test_size=0.2, random_state=SEED)

## Define Cosine Similarity Loss for Gaze Vectors

In [None]:
def cosine_loss(y_true, y_pred):
    """Cosine-distance loss between target and predicted vectors.

    Both inputs are L2-normalized along the last axis, so only their
    direction contributes to the loss.

    Args:
        y_true: target vectors, vector components on the last axis.
        y_pred: predicted vectors, same layout as ``y_true``.

    Returns:
        ``1 - cosine similarity`` reduced over the last axis.
    """
    unit_true = tf.math.l2_normalize(y_true, axis=-1)
    unit_pred = tf.math.l2_normalize(y_pred, axis=-1)
    # Dot product of unit vectors == cosine of the angle between them.
    cosine = tf.reduce_sum(unit_true * unit_pred, axis=-1)
    return 1 - cosine

## Build the Dual-Input Gaze Prediction Model

In [None]:
def build_dual_input_model():
    """Build the two-stream gaze regression network.

    Image stream: a 224x224x3 input is remapped to the value range the
    pretrained backbone expects, passed through an ImageNet-initialized
    MobileNetV2 without its classifier head, and globally average-pooled.
    Pupil stream: the 4 pupil values go through a 32-unit ReLU layer.
    The two feature vectors are concatenated and fed to a 128-unit ReLU
    layer, dropout (rate 0.3) and a linear 2-unit output.

    Returns:
        Model: Keras model with inputs ``[image_input, pupil_input]`` and a
        2-value linear output named ``gaze_output``.
    """
    img_input = Input(shape=(224, 224, 3), name='image_input')
    pupil_input = Input(shape=(4,), name='pupil_input')

    # The ImageNet weights of MobileNetV2 were trained on pixels in [-1, 1],
    # while this notebook loads images scaled to [0, 1]. Remapping inside the
    # model leaves the data arrays (and the visualization code) unchanged.
    img_scaled = tf.keras.layers.Rescaling(
        scale=2.0, offset=-1.0, name='to_mobilenet_range')(img_input)

    base_model = MobileNetV2(
        include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    image_features = GlobalAveragePooling2D()(base_model(img_scaled))

    # Small embedding of the pupil coordinates before fusing with image features.
    pupil_features = Dense(32, activation='relu')(pupil_input)

    combined = Concatenate()([image_features, pupil_features])
    z = Dense(128, activation='relu')(combined)
    z = Dropout(0.3)(z)
    output = Dense(2, activation='linear', name='gaze_output')(z)

    model = Model(inputs=[img_input, pupil_input], outputs=output)
    return model

# Instantiate the network and configure training: Adam with a 1e-4 learning
# rate, the cosine loss as objective, and MAE reported as an extra metric.
model = build_dual_input_model()
model.compile(
    loss=cosine_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['mae'],
)
model.summary()

## Train

In [None]:
# Stop after 10 epochs without validation-loss improvement (rolling back to
# the best weights) and keep only the best checkpoint on disk.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint(filepath='best_model.keras', monitor='val_loss', save_best_only=True),
]

# Train on the dual inputs; validation uses the held-out split.
history = model.fit(
    x=[X_img_train, X_pupil_train],
    y=y_train,
    batch_size=32,
    epochs=100,
    callbacks=callbacks,
    validation_data=([X_img_val, X_pupil_val], y_val),
)

## Visualize Gaze Prediction on Validation Set

In [None]:
def draw_vector(img, vector, color=(0, 255, 0), scale=50):
    """Return a copy of ``img`` with an arrow drawn from its center.

    Args:
        img: image array; its first two dimensions are taken as height and width.
        vector: sequence whose first two entries are the x and y components
            of the arrow, in units of ``scale`` pixels.
        color: arrow color passed to OpenCV.
        scale: pixel length corresponding to a component value of 1.

    Returns:
        A new image with the arrow drawn on it; ``img`` itself is not modified.
    """
    height, width = img.shape[:2]
    center_x, center_y = width // 2, height // 2
    tip = (
        int(center_x + vector[0] * scale),
        int(center_y + vector[1] * scale),
    )
    # Draw on a copy so the caller's array stays untouched.
    canvas = img.copy()
    cv2.arrowedLine(canvas, (center_x, center_y), tip, color, thickness=2, tipLength=0.3)
    return canvas

# Number of validation samples to preview, capped so that a small validation
# split cannot cause an IndexError.
n = min(5, len(X_img_val))

# One batched forward pass instead of a separate predict() call per sample.
pred_vecs = model.predict([X_img_val[:n], X_pupil_val[:n]])

for i in range(n):
    # Back to 8-bit RGB for drawing (images were scaled to [0, 1] when loaded).
    img = (X_img_val[i] * 255).astype(np.uint8)

    # The model is trained with a cosine loss, which only constrains the
    # direction of the output; its magnitude is arbitrary. Draw both vectors
    # at unit length so the arrows are directly comparable. The floor on the
    # norm avoids dividing by zero for a null vector.
    true_vec = y_val[i] / max(np.linalg.norm(y_val[i]), 1e-8)
    pred_vec = pred_vecs[i] / max(np.linalg.norm(pred_vecs[i]), 1e-8)

    # Colors are RGB here because the image is displayed with matplotlib.
    vis_img = draw_vector(img, true_vec, color=(0, 255, 0))
    vis_img = draw_vector(vis_img, pred_vec, color=(255, 0, 0))

    plt.imshow(vis_img)
    plt.title("Green: GT, Red: Pred")
    plt.axis("off")
    plt.show()