In [None]:
# -c resumes a partial download and does nothing when the ~19 GB archive is
# already complete, so re-running this cell does not fetch a second copy.
!wget -c http://images.cocodataset.org/zips/unlabeled2017.zip

--2026-02-19 20:56:36--  http://images.cocodataset.org/zips/unlabeled2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.51.57, 3.5.24.90, 16.15.176.39, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.51.57|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20126613414 (19G) [application/zip]
Saving to: ‘unlabeled2017.zip’


2026-02-19 21:15:26 (17.0 MB/s) - ‘unlabeled2017.zip’ saved [20126613414/20126613414]



In [None]:
# -n never overwrites files that were already extracted, so a re-run does not
# stop at unzip's interactive "replace?" prompt; stdout is still discarded.
!unzip -n unlabeled2017.zip > /dev/null

In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may
# target a different interpreter). pyarrow supplies the Parquet engine that
# pandas needs for DataFrame.to_parquet / pd.read_parquet.
%pip install transformers torch torchvision pandas numpy pyarrow pillow tqdm open_clip_torch

In [None]:
"""
Generate CLIP ViT-L/14 embeddings for COCO unlabeled2017 images
and save them to a Parquet file with file names.

Usage:
    python generate_clip_embeddings.py

Requirements:
    pip install torch torchvision open-clip-torch pandas pyarrow pillow tqdm
"""

import os
import glob
import torch
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import open_clip

# ── Config ────────────────────────────────────────────────────────────────────
# Directory that is searched for input images.
IMAGE_DIR: str = "./unlabeled2017"

# Parquet file the filename/embedding table is written to.
OUTPUT_FILE: str = "./clip_embeddings.parquet"

# Number of images encoded per forward pass; lower it if you run out of VRAM.
BATCH_SIZE: int = 64

# open_clip model name and pretrained tag ("openai" = OpenAI's original CLIP weights).
MODEL_NAME: str = "ViT-L-14"
PRETRAINED: str = "openai"
# ─────────────────────────────────────────────────────────────────────────────

def _collect_image_paths(image_dir):
    """Return the sorted paths of the .jpg/.jpeg/.png/.webp files directly inside image_dir."""
    patterns = ("*.jpg", "*.jpeg", "*.png", "*.webp")
    paths = []
    for pattern in patterns:
        paths.extend(glob.glob(os.path.join(image_dir, pattern)))
    paths.sort()
    return paths


def _load_batch(paths, preprocess, failed):
    """Decode and preprocess each path; unreadable paths are reported and appended to failed.

    Returns a (tensors, filenames) pair of equal-length lists covering only the
    images that were loaded successfully.
    """
    tensors = []
    filenames = []
    for path in paths:
        try:
            # The context manager closes the underlying file deterministically,
            # including when decoding or preprocessing raises.
            with Image.open(path) as img:
                tensor = preprocess(img.convert("RGB"))
        except Exception as e:
            # Best effort: one corrupt file must not abort the whole run.
            print(f"\n  ⚠  Skipping {path}: {e}")
            failed.append(path)
            continue
        tensors.append(tensor)
        filenames.append(os.path.basename(path))
    return tensors, filenames


def main():
    """Encode every image in IMAGE_DIR with the configured CLIP model and save to Parquet.

    Loads MODEL_NAME / PRETRAINED through open_clip, encodes the images in
    batches of BATCH_SIZE, L2-normalises each embedding and writes a table with
    the columns "filename" (base name of the image) and "embedding" to
    OUTPUT_FILE. The file is then read back to print its row count and the
    embedding shape as a sanity check.

    Raises:
        FileNotFoundError: if IMAGE_DIR contains no file with a supported extension.
        ValueError: if images were found but none of them could be decoded.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load model
    print(f"Loading {MODEL_NAME} ({PRETRAINED}) …")
    model, _, preprocess = open_clip.create_model_and_transforms(
        MODEL_NAME, pretrained=PRETRAINED, device=device
    )
    model.eval()

    # Gather image paths
    image_paths = _collect_image_paths(IMAGE_DIR)
    if not image_paths:
        raise FileNotFoundError(f"No images found in '{IMAGE_DIR}'")
    print(f"Found {len(image_paths):,} images")

    all_embeddings = []
    all_filenames  = []
    failed         = []

    # Process in batches
    for batch_start in tqdm(range(0, len(image_paths), BATCH_SIZE), desc="Encoding"):
        batch_paths = image_paths[batch_start : batch_start + BATCH_SIZE]

        tensors, filenames = _load_batch(batch_paths, preprocess, failed)
        if not tensors:
            # Every image of this batch failed to load; nothing to encode.
            continue

        batch_tensor = torch.stack(tensors).to(device)
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
        # mixed precision is only enabled when running on CUDA.
        with torch.no_grad(), torch.autocast(device_type=device, enabled=(device == "cuda")):
            features = model.encode_image(batch_tensor)
            features = features / features.norm(dim=-1, keepdim=True)  # L2-normalise

        all_embeddings.append(features.cpu().float().numpy())
        all_filenames.extend(filenames)

    if not all_embeddings:
        # np.concatenate would fail on an empty list with an unhelpful message.
        raise ValueError(
            f"None of the {len(image_paths):,} images in '{IMAGE_DIR}' could be decoded"
        )

    embeddings_np = np.concatenate(all_embeddings, axis=0)  # (N, embedding_dim)

    # Build DataFrame: one column for filename, one column holding the embedding array
    df = pd.DataFrame({
        "filename":  all_filenames,
        "embedding": list(embeddings_np),   # each cell is a 1-D numpy array
    })

    df.to_parquet(OUTPUT_FILE, index=False)
    print(f"\n✅  Saved {len(df):,} embeddings → {OUTPUT_FILE}")
    if failed:
        print(f"   ⚠  {len(failed)} images failed (see warnings above)")

    # Quick sanity check
    df_check = pd.read_parquet(OUTPUT_FILE)
    emb_shape = np.array(df_check["embedding"].iloc[0]).shape
    print(f"   Parquet rows: {len(df_check):,} | Embedding dim: {emb_shape}")


# Entry point: __name__ is "__main__" both when this code is run as a script
# and when the cell is executed in a notebook kernel, so the full pipeline
# starts immediately; importing the code as a module does not trigger it.
if __name__ == "__main__":
    main()

Using device: cuda
Loading ViT-L-14 (openai) …




Found 123,403 images


  with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device == "cuda")):
Encoding: 100%|██████████| 1929/1929 [47:41<00:00,  1.48s/it]



✅  Saved 123,403 embeddings → ./clip_embeddings.parquet
   Parquet rows: 123,403 | Embedding dim: (768,)
