# Step -1: Setup

In [27]:
# system/file stuff
import sys
import os
import inspect
from pathlib import Path
import gc

# datasci stuff
import pandas as pd
import scipy.io as sio
import numpy as np

# computer vision stuff
import cv2
from PIL import Image

# ML stuff
import torch
from tqdm import tqdm # bc i need progress bars
from torch.utils.data import Dataset

# detect face/eye regions of image
import mediapipe as mp

# augmentation
import albumentations as A

# Step 0: Establish Dummy Pipeline

In [2]:
# Directory holding the GazeCapture PyTorch reference code (ITrackerModel.py)
# and the pretrained checkpoint. This is a machine-specific path: edit it to
# point at your own checkout.
itracker_path = Path("/Volumes/Crucial X10/210/GazeCapture/pytorch")

# Make the reference code importable. The membership check keeps repeated
# runs of this cell from stacking duplicate entries onto sys.path.
if str(itracker_path) not in sys.path:
    sys.path.append(str(itracker_path))

# sanity
print("ITrackerModel.py exists?", (itracker_path / "ITrackerModel.py").exists())
print("checkpoint exists?", (itracker_path / "checkpoint.pth.tar").exists())

ITrackerModel.py exists? True
checkpoint exists? True


In [17]:
# Instantiate the iTracker architecture. No checkpoint has been loaded yet,
# so the network still carries its freshly initialised (random) weights.
from ITrackerModel import ITrackerModel

device = "cpu"
model = ITrackerModel()
model = model.to(device)

In [18]:
# Read the pretrained checkpoint from disk and copy its weights into `model`.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
ckpt = torch.load(
    itracker_path / "checkpoint.pth.tar",
    map_location=device,
)

# The file may be a dict that wraps the weights under "state_dict", or the
# weights themselves; accept both layouts.
if isinstance(ckpt, dict) and "state_dict" in ckpt:
    state = ckpt["state_dict"]
else:
    state = ckpt

# strict=False: key mismatches are handed back for inspection instead of raising.
missing, unexpected = model.load_state_dict(state, strict=False)

In [19]:
# sanity check: show which keys (if any) did not line up between the
# checkpoint and the model when the weights were loaded
for _label, _keys in (("missing:", missing), ("unexpected:", unexpected)):
    print(_label, _keys)

missing: []
unexpected: []


In [20]:
# Smoke test: run one batch of random tensors through the network to confirm
# that the forward pass executes and to inspect the output shape.
model.eval()

B = 1

# Three image-like inputs, drawn in the order face, left eye, right eye.
faces, eyesL, eyesR = (
    torch.randn(B, 3, 224, 224, device=device) for _ in range(3)
)
faceGrid = torch.randn(B, 25, 25, device=device)  # matches gridSize=25

# No gradients are needed for a plain forward pass.
with torch.no_grad():
    out = model(faces, eyesL, eyesR, faceGrid)

print("out shape:", out.shape)
print("out:", out)

out shape: torch.Size([1, 2])
out: tensor([[-0.1647, -9.3985]])


# Step 1: Confirm Expected Inputs

Confirm expected inputs: face/left/right tensor shape, dtype, normalization steps (mean images).

This information was used to help create eye2voice/code/ceal_itracker/data.py

In [23]:
# Load the mean images stored next to the iTracker code (the "image_mean"
# entry of each .mat file).
# The setup cell imports the module as `import scipy.io as sio`, which binds
# only the name `sio`; calling `scipy.io.loadmat` raises NameError on a fresh
# kernel, so go through `sio` here.
mean_face = sio.loadmat(itracker_path / "mean_face_224.mat")["image_mean"]
mean_left = sio.loadmat(itracker_path / "mean_left_224.mat")["image_mean"]
mean_right = sio.loadmat(itracker_path / "mean_right_224.mat")["image_mean"]

# Report shape, dtype and value range of each mean image.
for _label, _mean in (
    ("face:", mean_face),
    ("left:", mean_left),
    ("right:", mean_right),
):
    print(_label, _mean.shape, _mean.dtype, float(_mean.min()), float(_mean.max()))

face: (224, 224, 3) float32 79.87755584716797 180.56549072265625
left: (224, 224, 3) float32 72.00033569335938 191.53338623046875
right: (224, 224, 3) float32 71.83743286132812 177.91868591308594


# Step 2: Augmentation + Cropping Pipeline

Produce 224×224 face, left-eye, and right-eye crops for CEAL (using CEAL metadata or a detector).

For now, to get the pipeline up and running, we will use CEAL metadata to determine how to crop the images. This works because the original images are very consistent: the camera is always 2 m away and the subjects are positioned uniformly across images.

We will need to implement a detector model later to automatically detect the face/eye regions used for cropping.

Update: the detector is being implemented now instead, because the metadata-based approach became too complex and a detector was going to be needed anyway.

This is required to match how the iTracker model ingests its inputs and produces its output.

## Step 2.0: Metadata

File name parsing and label creation

## Step 2.1: Landmark Detection

Use MediaPipe Face Mesh (fast, easy to install, no compiling dlib misery).

Output: face bbox + left-eye region bbox + right-eye region bbox (or keypoints you convert to bboxes)

Limitation: the repo’s original crops come from Apple’s detector JSONs. Your MediaPipe boxes will be different. That’s okay, but don’t expect perfect pretrained performance unless your crop style is similar. (our crop style is probably going to be similar, i hope?)

## Step 2.2: Augment full image with keypoints

We need augmentation to be consistent across our 3 crops (face, left eye, right eye)

## Step 2.3: Crop & Resize

Crop face, left_eye, right_eye from augmented full image

Resize each to (224,224)

create correct manifest csv to ensure the crops that belong to the same face stay together

## Step 2.4: Facegrid

Create a 25×25 binary array, set to 1s in the region corresponding to the face bbox projected into that grid.

Yes this is another iTracker requirement

Note: the original preprocessing pipeline did not use pixels; it used grid coordinates instead. So the new plan is:

Compute (grid_x, grid_y, grid_w, grid_h) in 25×25 units from the face bbox in full-frame coords.

Then precompute and store the 625-length flattened grid. (where is this used later? in data.py?)

## Step 2.5: Save Crops and Facegrid

make sure the tensors you return match:

return row, imFace, imEyeL, imEyeR, faceGrid, label

save in OUTPUT_ROOT = Path("/Volumes/Crucial X10/210/data/ceal_augmented_cropped")

ensure data.py matches

ensure the crops that belong to the same face stay together

## Step 2.6: 

In [None]:
# ingest image
# crop face
# crop eyes
# augment 3 crops together

In [None]:
def clamp(val, lo, hi):
    """
    Limit ``val`` to the range [lo, hi].

    The upper bound is applied first and the lower bound second, so ``lo``
    wins if the bounds are passed out of order (``lo > hi``).
    """
    capped = val if val < hi else hi
    return capped if capped > lo else lo

def crop_with_padding(img: Image.Image, box, pad_frac=0.08) -> Image.Image:
    """
    Crop ``box`` out of ``img`` after growing the box by a margin on every side.

    img: PIL.Image RGB
    box: (x1, y1, x2, y2) in pixel coords
    pad_frac: fraction of the box width/height added as margin on each side
    returns: the cropped image; every edge of the padded box is clamped to the
             image bounds before cropping
    """
    img_w, img_h = img.size
    left, top, right, bottom = box

    # The margin is a truncated fraction of the box's own width / height.
    margin_x = int((right - left) * pad_frac)
    margin_y = int((bottom - top) * pad_frac)

    padded_box = (
        clamp(left - margin_x, 0, img_w),
        clamp(top - margin_y, 0, img_h),
        clamp(right + margin_x, 0, img_w),
        clamp(bottom + margin_y, 0, img_h),
    )
    return img.crop(padded_box)


In [None]:
def derive_eye_boxes_from_face(face_box):
    """
    Estimate the two eye boxes from a face box using fixed proportions.

    face_box: (x1, y1, x2, y2) in original image coords
    returns: (left_eye_box, right_eye_box), each an (x1, y1, x2, y2) pixel box

    Proportions used (defaults chosen for a forward-facing face):
        - both eye boxes share one vertical band, from 18% to 55% of the face
          height measured from the top edge of the face box
        - the box centres sit at 33% (left) and 67% (right) of the face width
        - each box is ~40% of the face width wide
    """
    fx1, fy1, fx2, fy2 = face_box
    face_w = fx2 - fx1
    face_h = fy2 - fy1

    # Vertical extent shared by both eye boxes.
    band_top = fy1 + int(0.18 * face_h)
    band_bottom = fy1 + int(0.55 * face_h)

    # Half-width by floor division: an odd 40%-width yields a box 1 px narrower.
    half_w = int(0.40 * face_w) // 2

    # Horizontal centres of the left and right eye boxes, in that order.
    centres = (fx1 + int(0.33 * face_w), fx1 + int(0.67 * face_w))

    return tuple(
        (cx - half_w, band_top, cx + half_w, band_bottom) for cx in centres
    )


In [None]:
def ceal_crops_from_face_box(img_rgb: Image.Image, face_box):
    """
    Cut the face crop and both eye crops out of a single image.

    img_rgb: PIL.Image in RGB
    face_box: (x1, y1, x2, y2) in original image coordinates
    returns: face_pil, left_pil, right_pil
    """
    # Face: the given box plus a 6% margin.
    face_pil = crop_with_padding(img_rgb, face_box, pad_frac=0.06)

    # Eyes: boxes are derived from the face box in original-image coordinates
    # and cropped from the original image (not from the face crop), each with
    # a 10% margin.
    left_pil, right_pil = (
        crop_with_padding(img_rgb, eye_box, pad_frac=0.10)
        for eye_box in derive_eye_boxes_from_face(face_box)
    )

    return face_pil, left_pil, right_pil

In [None]:
def manual_crops(img_rgb):
    """
    Crop the face and both eyes using fixed fractions of the image size.

    img_rgb: image exposing ``.size`` as (width, height) and ``.crop(box)``
             (e.g. a PIL.Image)
    returns: face, left_eye, right_eye

    The face box spans 30%-70% of the width and 12%-92% of the height (ratios
    tuned for a centered head + chin rest). The eyes come from one band
    (33%-67% of the width, 30%-55% of the height) split down the middle.
    """
    width, height = img_rgb.size

    def scaled(fx1, fy1, fx2, fy2):
        # Turn fractional box coordinates into truncated pixel coordinates.
        return (
            int(fx1 * width),
            int(fy1 * height),
            int(fx2 * width),
            int(fy2 * height),
        )

    face = img_rgb.crop(scaled(0.30, 0.12, 0.70, 0.92))

    # Eye band inside the face region, then one half per eye.
    eye_band = img_rgb.crop(scaled(0.33, 0.30, 0.67, 0.55))
    band_w, band_h = eye_band.size
    split_x = band_w // 2

    left_eye = eye_band.crop((0, 0, split_x, band_h))
    right_eye = eye_band.crop((split_x, 0, band_w, band_h))

    return face, left_eye, right_eye


# Step 3: Augmentation Pipeline

In [None]:
def save_rgb_pil(pil_img: Image.Image, path: Path):
    """
    Write ``pil_img`` to ``path`` as a JPEG (quality 95).

    Missing parent directories of ``path`` are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    pil_img.save(path, format="JPEG", quality=95)

def _save_crop_triplet(face_img, left_img, right_img, base_dir, aug_id, size):
    """Resize the three crops to ``size``, save them under ``base_dir`` and return their manifest path columns."""
    paths = {}
    for region, crop in (("face", face_img), ("left", left_img), ("right", right_img)):
        out_path = base_dir / f"{region}_{aug_id:03d}.jpg"
        save_rgb_pil(crop.resize(size), out_path)
        paths[f"{region}_path"] = str(out_path)
    return paths


def build_itracker_crops_manifest(df, output_root: Path, num_augmentations: int):
    """
    Crop every image listed in ``df``, save the crops and build a manifest.

    df columns expected (adapt as needed):
      - path (original image path)
      - subject
      - filename or stem (optional; falls back to the file name of ``path``)
      - label

    output_root: directory that receives the crops, laid out as
        <output_root>/<subject>/<stem>/{face,left,right}_<aug_id:03d>.jpg
    num_augmentations: number of augmented triplets to write per image, in
        addition to the un-augmented one (aug_id 0)

    returns: pandas.DataFrame with one row per saved triplet. Each row holds
        the source row's columns plus aug_id, is_augmented, face_path,
        left_path and right_path; augmented rows carry the label returned by
        the augmentation. Images that cannot be read are skipped.
    """
    output_root = Path(output_root)
    crop_size = (224, 224)  # every crop is resized to 224x224 before saving
    rows = []

    for _, r in df.iterrows():
        # str(): cv2.imread expects a plain string path. It returns None
        # (rather than raising) when the image cannot be read.
        img_bgr = cv2.imread(str(r["path"]))
        if img_bgr is None:
            # Best effort: unreadable images are left out of the manifest.
            continue
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
        pil = Image.fromarray(img_rgb).convert("RGB")

        stem = Path(r.get("filename", Path(r["path"]).name)).stem
        subj = str(r["subject"])
        base_dir = output_root / subj / stem
        source_cols = r.to_dict()  # converted once, reused for every row of this image

        # ---- 1) Crop first ----
        # Fixed-ratio box used for cropping around the face.
        W, H = pil.size
        face_box = (
            int(0.15 * W),
            int(0.12 * H),
            int(0.85 * W),
            int(0.92 * H),
        )
        # Fix: face_box must be passed through; the call used to omit it,
        # which raised TypeError on the first readable image.
        face_pil, left_pil, right_pil = ceal_crops_from_face_box(pil, face_box)

        # ---- 2) Save original crops (aug_id=0) ----
        rows.append({
            **source_cols,
            "aug_id": 0,
            "is_augmented": False,
            **_save_crop_triplet(face_pil, left_pil, right_pil, base_dir, 0, crop_size),
        })

        # ---- 3) Augment the CROPS (keep in sync) ----
        # NOTE(review): augment_triplet is not defined in the visible part of
        # this file; it must be in scope when num_augmentations > 0. It is
        # assumed to return (face, left, right, label) - confirm.
        for k in range(1, num_augmentations + 1):
            face_aug, left_aug, right_aug, label_aug = augment_triplet(
                face_pil, left_pil, right_pil, r["label"]
            )

            rows.append({
                **source_cols,
                "label": label_aug,
                "aug_id": k,
                "is_augmented": True,
                **_save_crop_triplet(face_aug, left_aug, right_aug, base_dir, k, crop_size),
            })

    return pd.DataFrame(rows)


### Debug start

In [8]:
# Debug: find out which directory ITrackerData reads the mean-image files
# from, and whether mean_face_224.mat is actually there.
from pathlib import Path
from ITrackerData import MEAN_PATH

mean_dir = Path(MEAN_PATH)
print("MEAN_PATH:", mean_dir)
print("exists:", mean_dir.exists())

# Show at most the first 20 entries of the directory.
print(
    "contents sample:",
    [entry.name for entry in list(mean_dir.glob("*"))[:20]],
)

mean_file = mean_dir / "mean_face_224.mat"
print("mean_face exists:", mean_file.exists())
print("mean_face path:", mean_file)


MEAN_PATH: .
exists: True
contents sample: ['step_0.ipynb']
mean_face exists: False
mean_face path: mean_face_224.mat


### debug end