```mermaid
flowchart LR

classDef notebook fill:#5c7fa6,stroke:#3f5a7b,color:#f2f6fb,font-weight:bold;
classDef python fill:#9a80b8,stroke:#6d5789,color:#f7f3fb,font-weight:bold;
classDef tools fill:#e9c48a,stroke:#b58950,color:#2d1c05;
classDef methods fill:#8cc7ab,stroke:#5e9475,color:#0f2f1f;

N02["02_embeddings.ipynb"]:::notebook

N02 --> E1["embed_clip.py"]:::python

E1 --> T2["Tools:<br>torch<br>transformers<br>Pillow<br>polars<br>numpy"]:::tools

T2 --> M2["Methods:<br>load_model()<br>extract_embedding()<br>run_embedding()<br>extract_clip_embeddings()<br>get_clip_embedding()"]:::methods
```


In [1]:
import os, sys
from pathlib import Path


def _find_project_root(start, marker="src"):
    """Return the closest directory at or above `start` that contains a
    `marker` sub-directory, or None when no such directory exists."""
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return None


# Force notebook to run as if cwd = project root.
# The root is resolved rather than hard-coded so the notebook is not tied to
# one user's machine:
#   1. the FACESTATS_ROOT environment variable, when set (explicit override);
#   2. otherwise the nearest ancestor of the current directory containing src/.
_root = os.environ.get("FACESTATS_ROOT") or _find_project_root(os.getcwd())
if _root is None:
    raise FileNotFoundError(
        "Could not locate the project root: no 'src' directory at or above "
        f"{os.getcwd()!r}. Set the FACESTATS_ROOT environment variable."
    )
PROJECT_ROOT = str(Path(_root).resolve())
os.chdir(PROJECT_ROOT)
print("cwd:", os.getcwd())

# Add src/ to Python import path. Any existing entry is removed first so that
# re-running the cell keeps src/ at the front without stacking duplicates.
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
if SRC_PATH in sys.path:
    sys.path.remove(SRC_PATH)
sys.path.insert(0, SRC_PATH)
print("src path added:", SRC_PATH)


cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


In [2]:
import torch
from transformers import CLIPProcessor

# Select the MPS backend when torch reports it as available; otherwise use CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using:", device)

# Placeholder binding only; the wrapped processor is assigned to this name
# once it has been constructed further down in the cell.
processor = None

class MPSProcessorWrapper:
    """
    Proxy around a processor that moves tensor outputs to a device.

    Calling the wrapper calls the wrapped processor with the same arguments,
    then moves every torch tensor in the returned mapping to the target
    device. Any attribute not found on the wrapper itself is looked up on the
    wrapped processor.

    Parameters
    ----------
    real_processor : callable
        The processor to wrap. Its return value must provide ``.items()`` and
        support item assignment.
    target_device : str or torch.device, optional
        Device the output tensors are moved to. When omitted, the notebook-level
        ``device`` variable is read each time the wrapper is called.
    """

    def __init__(self, real_processor, target_device=None):
        self.real = real_processor
        self._target_device = target_device

    def __getattr__(self, name):
        # __getattr__ runs only after normal lookup has failed. If ``real``
        # itself is missing (an instance created without __init__, which is
        # what copy/pickle do), forwarding would look up ``self.real`` again
        # and recurse until RecursionError, so fail with AttributeError here.
        if name == "real":
            raise AttributeError(name)
        return getattr(self.real, name)

    def __call__(self, *args, **kwargs):
        out = self.real(*args, **kwargs)
        target = device if self._target_device is None else self._target_device

        # Move only tensors in the output batch; other values are left as-is.
        # Iterate over a snapshot because entries are reassigned in the loop.
        for k, v in list(out.items()):
            if torch.is_tensor(v):
                out[k] = v.to(target)
        return out

# Build processor normally
_real = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Wrap it so tensor outputs are moved to `device`.
# NOTE(review): the next cell calls extract_clip_embeddings() without passing
# `processor` — confirm embed_clip actually uses this object; if it builds its
# own processor, this wrapper has no effect on the run.
processor = MPSProcessorWrapper(_real)

# Report the device that was actually selected: without MPS the outputs stay
# on CPU, so a fixed "MPS" message would be misleading.
print(f"Wrapped CLIPProcessor outputs to {device.upper()}.")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using: mps
Wrapped CLIPProcessor outputs to MPS.


In [3]:
from embeddings.embed_clip import extract_clip_embeddings

# Input folder and output parquet path handed to extract_clip_embeddings().
INPUT_DIR = "data/processed/preproc"
OUTPUT_FILE = "data/processed/embeddings/embeddings_clip.parquet"

# Observed run time for 10,000 images: 35 minutes on CPU without MPS
# acceleration, about 25 minutes on MPS.
_embed_kwargs = dict(
    input_dir=INPUT_DIR,
    output_path=OUTPUT_FILE,
    model_name="openai/clip-vit-base-patch32",
)
emb_df = extract_clip_embeddings(**_embed_kwargs)

# Show the first rows as the cell output.
emb_df.head()

Saved 10000 embeddings → data/processed/embeddings/embeddings_clip.parquet


filename,embedding
str,list[f64]
"""SFHQ_pt4_00086092.jpg""","[0.040853, -0.001176, … -0.034655]"
"""SFHQ_pt4_00065309.jpg""","[0.030933, 0.005503, … 0.001794]"
"""SFHQ_pt4_00062466.jpg""","[0.071319, -0.005249, … 0.051021]"
"""SFHQ_pt4_00090828.jpg""","[0.066199, -0.003542, … 0.020381]"
"""SFHQ_pt4_00032251.jpg""","[0.021252, -0.060645, … -0.004985]"
