In [1]:
from IPython.display import clear_output

In [2]:
!pip install open-clip-torch
!pip install faiss-cpu
!pip install faiss-gpu
clear_output()

In [3]:
import os, cv2, faiss, torch, numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display

In [4]:
BACKEND = os.environ.get("MM_BACKEND", "siglip2")  # "openclip" | "siglip2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if BACKEND == "openclip":
    import open_clip
    OC_MODEL = "ViT-g-14"
    OC_PRETRAINED = "laion2b_s34b_b88k"
    model, _, preprocess = open_clip.create_model_and_transforms(
        OC_MODEL, pretrained=OC_PRETRAINED, device=DEVICE
    )
    tokenizer = open_clip.get_tokenizer(OC_MODEL)
    model.eval()

elif BACKEND == "siglip2":
    # pip install -U transformers accelerate bitsandbytes
    from transformers import AutoProcessor, AutoModel
    CKPT = os.environ.get("SIGLIP2_CKPT", "google/siglip2-giant-opt-patch16-384")
    processor = AutoProcessor.from_pretrained(CKPT)
    model = AutoModel.from_pretrained(CKPT, device_map="auto").eval()
else:
    raise ValueError("BACKEND phải là 'openclip' hoặc 'siglip2'.")

2025-08-24 17:27:17.613981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756056437.924997      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756056438.014423      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.49G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
@torch.no_grad()
def embed_images(image_paths, batch_size=64):
    """
    frames: list[np.ndarray(H,W,3) in RGB]
    return: np.ndarray [N, D] đã L2-norm
    """
    if BACKEND == "openclip":
        vecs = []
        for i in range(0, len(image_paths), batch_size):
            batch = [preprocess(Image.open(path)) for path in image_paths[i:i+batch_size]]
            batch = torch.stack(batch).to(DEVICE)
            feats = model.encode_image(batch)
            feats = feats / feats.norm(dim=-1, keepdim=True)
            vecs.append(feats.float().cpu().numpy())
        return np.vstack(vecs)

    elif BACKEND == "siglip2":
        from PIL import Image as _Image
        vecs = []
        pil_frames = [Image.open(path) for path in image_paths]
        for i in range(0, len(pil_frames), batch_size):
            batch = pil_frames[i:i+batch_size]
            inputs = processor(images=batch, return_tensors="pt").to(model.device)
            img_feats = model.get_image_features(**inputs)  # (B, D)
            img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
            vecs.append(img_feats.float().cpu().numpy())
        return np.vstack(vecs)

In [22]:
@torch.no_grad()
def embed_query_vi(text, en_hint=None):
    """
    Trả về vector 1xD đã L2-norm; nếu có en_hint -> lấy max giữa 2 biến thể.
    """
    if BACKEND == "openclip":
        texts = [text] + ([en_hint] if en_hint else [])
        toks = tokenizer(texts).to(DEVICE)
        feats = model.encode_text(toks)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        feat = torch.max(feats, dim=0).values
        return feat.float().cpu().numpy()[None, :]

    elif BACKEND == "siglip2":
        texts = [text.lower()] + ([en_hint.lower()] if en_hint else [])
        inputs = processor(
            text=texts, return_tensors="pt",
            padding="max_length", max_length=128
        ).to(model.device)
        txt_feats = model.get_text_features(**inputs)
        txt_feats = txt_feats / txt_feats.norm(dim=-1, keepdim=True)
        feat = torch.max(txt_feats, dim=0).values
        return feat.float().cpu().numpy()[None, :]

In [7]:
# ======= FAISS =======
def build_index(vecs: np.ndarray):
    faiss.normalize_L2(vecs) 
    idx = faiss.IndexFlatIP(vecs.shape[1])
    idx.add(vecs)
    return idx

In [8]:
# ======= SEARCH =======
def search_frames(image_paths, query_vi, query_en_hint=None,
                  topk=10, save_dir="hits", show=True):
    os.makedirs(save_dir, exist_ok=True)

    print(f"Số frame sample: {len(image_paths)}")
    if not image_paths:
        return []

    print("Nhúng ảnh...")
    img_vecs = embed_images(image_paths)
    index = build_index(img_vecs)

    print("Nhúng truy vấn...")
    qv = embed_query_vi(query_vi, query_en_hint)

    print("Tìm top-k...")
    D, I = index.search(qv.astype("float32"), topk)
    I, D = I[0].tolist(), D[0].tolist()

    base = os.path.splitext(os.path.basename(image_paths[0]))[0]
    results = []
    for rank, (idx, score) in enumerate(zip(I, D), 1):
        thumb = Image.open(image_paths[idx])
        out = os.path.join(save_dir, f"{base}_rank{rank:02d}_score{score:.3f}.jpg")
        thumb.save(out, quality=92)
        results.append({
            "rank": rank, "similarity": float(score),
            "thumb": out, "frame_index": int(idx)
        })
        if show:
            print(f"Rank {rank} | score {score:.3f} | {out}")
            display(thumb)

    return results


In [9]:
import os

In [10]:
file_path = '/kaggle/input/aic-small-2024/Keyframes_L21/keyframes/L21_V001'
image_names = os.listdir(file_path)
image_paths = []
for i in range(len(image_names)):
    image_paths.append(os.path.join(file_path, image_names[i]))

In [23]:
if __name__ == "__main__":
    query_vi = "Nhiều người mặc áo xanh dương trong phòng thí nghiệm"

    # Tìm kiếm các ảnh phù hợp với truy vấn:
    hits = search_frames(
        image_paths, query_vi, topk=10, save_dir="hits"
    )

    for h in hits:
        print(f"Rank {h['rank']} | score {h['similarity']:.3f}")
        img = Image.open(h["thumb"])
        display(img)

Số frame sample: 307
Nhúng ảnh...
Nhúng truy vấn...


ValueError: Sequence length must be less than max_position_embeddings (got `sequence length`: 128 and max_position_embeddings: 64