In [1]:
from IPython.display import clear_output

In [2]:
!pip install open-clip-torch
!pip install faiss-cpu
!pip install faiss-gpu
clear_output()

In [3]:
import os, cv2, faiss, torch, numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display

In [4]:
# Embedding backend, selected through the MM_BACKEND environment variable.
# Accepted values: "openclip" (the default) and "siglip2".
BACKEND = os.environ.get("MM_BACKEND", "openclip")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Reject anything else before touching a model.
if BACKEND not in ("openclip", "siglip2"):
    raise ValueError("BACKEND phải là 'openclip' hoặc 'siglip2'.")

if BACKEND == "siglip2":
    # Extra packages for this branch:
    #   pip install -U transformers accelerate bitsandbytes
    from transformers import AutoProcessor, AutoModel
    CKPT = os.environ.get("SIGLIP2_CKPT", "google/siglip2-giant-opt-patch16-384")
    processor = AutoProcessor.from_pretrained(CKPT)
    model = AutoModel.from_pretrained(CKPT, device_map="auto")
    model = model.eval()
else:
    import open_clip
    OC_MODEL = "ViT-g-14"
    OC_PRETRAINED = "laion2b_s34b_b88k"
    # The middle return value is not used anywhere in this cell.
    model, _, preprocess = open_clip.create_model_and_transforms(
        OC_MODEL, pretrained=OC_PRETRAINED, device=DEVICE
    )
    tokenizer = open_clip.get_tokenizer(OC_MODEL)
    model.eval()

2025-08-25 12:20:04.897392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756124405.112894      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756124405.177005      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.49G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def _load_rgb(path):
    """Open the image at `path`, decode it and return an in-memory RGB copy.

    The context manager closes the underlying file as soon as the pixels are
    decoded, so at most one image file is held open at any time.
    """
    with Image.open(path) as img:
        return img.convert("RGB")


@torch.no_grad()
def embed_images(image_paths, batch_size=64):
    """
    Embed images read from disk with the model selected by BACKEND.

    Args:
        image_paths: sequence of image file paths.
        batch_size: number of images pushed through the model per step.

    Returns:
        np.ndarray of shape [N, D], float32, every row L2-normalized.
        Row k corresponds to image_paths[k].

    Raises:
        ValueError: if image_paths is empty or BACKEND is not one of
            'openclip' / 'siglip2'.
    """
    if len(image_paths) == 0:
        # np.vstack([]) would fail later with a much less helpful message.
        raise ValueError("image_paths rỗng: không có ảnh nào để nhúng.")
    if BACKEND not in ("openclip", "siglip2"):
        # Fail loudly instead of silently returning None.
        raise ValueError("BACKEND phải là 'openclip' hoặc 'siglip2'.")

    vecs = []
    for start in range(0, len(image_paths), batch_size):
        # Images are loaded one batch at a time, so neither memory use nor
        # open file handles grow with the total number of paths.
        images = [_load_rgb(p) for p in image_paths[start:start + batch_size]]

        if BACKEND == "openclip":
            batch = torch.stack([preprocess(img) for img in images]).to(DEVICE)
            feats = model.encode_image(batch)
        else:  # "siglip2"
            inputs = processor(images=images, return_tensors="pt").to(model.device)
            feats = model.get_image_features(**inputs)  # (B, D)

        feats = feats / feats.norm(dim=-1, keepdim=True)
        vecs.append(feats.float().cpu().numpy())

    return np.vstack(vecs)

In [6]:
@torch.no_grad()
def embed_query_vi(text, en_hint=None):
    """
    Embed a text query with the model selected by BACKEND.

    When en_hint is given, both strings are embedded, each embedding is
    L2-normalized, and the two are merged with an element-wise max. The merged
    vector is then re-normalized, because the element-wise max of two unit
    vectors is generally not unit length. Rescaling the query by a positive
    factor does not change the ranking of an inner-product search; it only
    puts the scores on the cosine-similarity scale.

    Args:
        text: the query string.
        en_hint: optional second phrasing of the same query.

    Returns:
        np.ndarray of shape [1, D], float32, L2-normalized.

    Raises:
        ValueError: if BACKEND is not one of 'openclip' / 'siglip2'.
    """
    if BACKEND == "openclip":
        texts = [text] + ([en_hint] if en_hint else [])
        toks = tokenizer(texts).to(DEVICE)
        feats = model.encode_text(toks)
    elif BACKEND == "siglip2":
        # Lower-casing and fixed-length padding presumably mirror the model's
        # training-time text preprocessing — TODO confirm against the model card.
        texts = [text.lower()] + ([en_hint.lower()] if en_hint else [])
        inputs = processor(
            text=texts, return_tensors="pt",
            padding="max_length", max_length=64
        ).to(model.device)
        feats = model.get_text_features(**inputs)
    else:
        # Fail loudly instead of silently returning None.
        raise ValueError("BACKEND phải là 'openclip' hoặc 'siglip2'.")

    feats = feats / feats.norm(dim=-1, keepdim=True)
    feat = torch.max(feats, dim=0).values
    if feats.shape[0] > 1:
        # Only the merged case needs this; a single row is already unit length.
        feat = feat / feat.norm()
    return feat.float().cpu().numpy()[None, :]

In [7]:
# ======= FAISS =======
def build_index(vecs: np.ndarray):
    """
    Build an exact inner-product FAISS index over L2-normalized vectors.

    The input is copied into a C-contiguous float32 array first, so the
    in-place normalization below never modifies the caller's array and inputs
    of another dtype or memory layout are accepted.

    Args:
        vecs: array-like of shape [N, D].

    Returns:
        faiss.IndexFlatIP holding the N normalized rows.

    Raises:
        ValueError: if vecs is not 2-dimensional.
    """
    vecs = np.array(vecs, dtype=np.float32, order="C")  # always a private copy
    if vecs.ndim != 2:
        raise ValueError(
            f"vecs phải là ma trận 2 chiều [N, D], nhận được shape {vecs.shape}."
        )
    faiss.normalize_L2(vecs)  # in place, on the private copy only
    idx = faiss.IndexFlatIP(vecs.shape[1])
    idx.add(vecs)
    return idx

In [8]:
# # ======= SEARCH =======
# def search_frames(image_paths, query_vi, query_en_hint=None,
#                   topk=10, save_dir="hits", show=True):
#     os.makedirs(save_dir, exist_ok=True)

#     print(f"Số frame sample: {len(image_paths)}")
#     if not image_paths:
#         return []

#     print("Nhúng ảnh...")
#     img_vecs = embed_images(image_paths)
#     index = build_index(img_vecs)

#     print("Nhúng truy vấn...")
#     qv = embed_query_vi(query_vi, query_en_hint)

#     print("Tìm top-k...")
#     D, I = index.search(qv.astype("float32"), topk)
#     I, D = I[0].tolist(), D[0].tolist()

#     base = os.path.splitext(os.path.basename(image_paths[0]))[0]
#     results = []
#     for rank, (idx, score) in enumerate(zip(I, D), 1):
#         thumb = Image.open(image_paths[idx])
#         out = os.path.join(save_dir, f"{base}_rank{rank:02d}_score{score:.3f}.jpg")
#         thumb.save(out, quality=92)
#         results.append({
#             "rank": rank, "similarity": float(score),
#             "thumb": out, "frame_index": int(idx)
#         })
#         if show:
#             print(f"Rank {rank} | score {score:.3f} | {out}")
#             display(thumb)

#     return results


In [9]:
import pickle

def save_image_embeddings(image_paths, save_path="image_embeddings.pkl"):
    """Embed every image in `image_paths` and pickle the resulting array.

    Args:
        image_paths: image file paths, passed straight to embed_images.
        save_path: destination file for the pickled embeddings.
    """
    n_images = len(image_paths)
    print(f"Lưu embeddings của {n_images} ảnh...")

    # Compute the embeddings for all images.
    embeddings = embed_images(image_paths)

    # Persist them to disk.
    with open(save_path, 'wb') as out_file:
        pickle.dump(embeddings, out_file)

    print(f"Đã lưu embeddings vào {save_path}")


In [10]:
import pickle
import numpy as np
import os
from PIL import Image

def search_frames_with_saved_embeddings(image_paths, query_vi, query_en_hint=None,
                                        topk=10, save_dir="hits", embedding_path="image_embeddings.pkl", show=True):
    """
    Search previously saved image embeddings with a text query.

    Args:
        image_paths: image file paths; image_paths[k] must be the image whose
            embedding is row k of the array stored at embedding_path.
        query_vi: the query text.
        query_en_hint: optional second phrasing, forwarded to embed_query_vi.
        topk: maximum number of hits; clamped to the number of indexed vectors.
        save_dir: directory that receives a JPEG copy of every hit.
        embedding_path: pickle file holding the embedding array.
        show: when True, print and display each hit.

    Returns:
        List of dicts with keys "rank", "similarity", "thumb", "frame_index",
        best hit first. Empty list when image_paths is empty.

    Raises:
        FileNotFoundError: if embedding_path does not exist.
        ValueError: if the number of stored embeddings differs from
            len(image_paths).
    """
    # The embeddings file must already exist.
    if not os.path.exists(embedding_path):
        raise FileNotFoundError(f"Không tìm thấy file embeddings: {embedding_path}")

    # SECURITY: pickle.load can execute arbitrary code. Only load embedding
    # files that this notebook produced itself.
    with open(embedding_path, 'rb') as f:
        img_vecs = pickle.load(f)

    print(f"Số frame sample: {len(image_paths)}")
    if not image_paths:
        # Nothing to search; return before doing any index work.
        return []

    # A stale embeddings file would map hits to the wrong images (or raise an
    # IndexError), so refuse to continue when the sizes disagree.
    n_vecs = len(img_vecs)
    if n_vecs != len(image_paths):
        raise ValueError(
            f"Số embeddings ({n_vecs}) không khớp với số ảnh ({len(image_paths)}); "
            f"hãy tạo lại file embeddings: {embedding_path}"
        )

    # Build the index over the stored embeddings.
    index = build_index(img_vecs)

    print("Nhúng truy vấn...")
    qv = embed_query_vi(query_vi, query_en_hint)

    print("Tìm top-k...")
    # Asking for more neighbours than there are vectors makes faiss pad the
    # result with index -1, which Python would silently read as "last image".
    k = min(topk, index.ntotal)
    D, I = index.search(qv.astype("float32"), k)
    I, D = I[0].tolist(), D[0].tolist()

    # Every output file name is prefixed with the stem of the first input image.
    base = os.path.splitext(os.path.basename(image_paths[0]))[0]
    results = []

    os.makedirs(save_dir, exist_ok=True)

    for rank, (idx, score) in enumerate(zip(I, D), 1):
        # Decode into memory and close the file right away; converting to RGB
        # also lets images with an alpha channel or a palette be saved as JPEG.
        with Image.open(image_paths[idx]) as src:
            thumb = src.convert("RGB")
        out = os.path.join(save_dir, f"{base}_rank{rank:02d}_score{score:.3f}.jpg")
        thumb.save(out, quality=92)
        results.append({
            "rank": rank, "similarity": float(score),
            "thumb": out, "frame_index": int(idx)
        })
        if show:
            print(f"Rank {rank} | score {score:.3f} | {out}")
            display(thumb)

    return results


In [11]:
import os
from tqdm import tqdm

In [12]:
file_path = '/kaggle/input/aic-small-2024/Keyframes_L21/keyframes/L21_V001'
# os.listdir returns entries in arbitrary order. Sorting keeps the position of
# each image stable across runs, which matters because embeddings are matched
# to images by position.
image_names = sorted(os.listdir(file_path))
image_paths = [os.path.join(file_path, name) for name in image_names]

In [13]:
import json
import cv2

# Keyframe-index JSON files and their source videos, listed pairwise:
# json_files[k] is used together with video_paths[k].
_SAMPLE_ROOT = '/kaggle/input/aic-sample-test'
_VIDEO_IDS = ('L21_V003', 'L21_V006', 'L21_V007', 'L21_V011')

json_files = [
    f'{_SAMPLE_ROOT}/keyframes_index/{vid}_keyframes_index.json'
    for vid in _VIDEO_IDS
]

video_paths = [f'{_SAMPLE_ROOT}/videos/{vid}.mp4' for vid in _VIDEO_IDS]


In [None]:
# Extract the frames listed in each keyframe-index JSON from the matching
# video, save them as PNG files and collect the saved paths in image_paths.
output_dir = 'extracted_frames'
os.makedirs(output_dir, exist_ok=True)

# The two lists are consumed pairwise; a length mismatch would pair the wrong
# files or drop some silently, so stop early.
if len(json_files) != len(video_paths):
    raise ValueError(
        f"json_files ({len(json_files)}) và video_paths ({len(video_paths)}) phải có cùng số phần tử."
    )

image_paths = []

for i, (json_file, video_path) in enumerate(zip(json_files, video_paths)):
    with open(json_file, 'r') as f:
        frame_indices = json.load(f)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Không thể mở video: {video_path}")
            continue

        # tqdm shows per-video progress.
        for frame_index in tqdm(frame_indices, desc=f"Trích xuất từ video {i+1}/{len(json_files)}"):
            # Seek to the requested frame, then decode it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                print(f"⚠️ Không thể đọc frame tại index {frame_index} trong video {i+1}.")
                continue

            # OpenCV decodes to BGR; PIL expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_image = Image.fromarray(frame_rgb)

            frame_path = os.path.join(output_dir, f'video{i+1}_frame_{frame_index}.png')
            frame_image.save(frame_path)
            image_paths.append(frame_path)
    finally:
        # Release the capture even when the video failed to open or an
        # exception interrupted the loop.
        cap.release()

print(f"Đã trích xuất tổng cộng {len(image_paths)} frame.")
print("Các đường dẫn frame:", image_paths)

Trích xuất từ video 1/4:   5%|▍         | 16/349 [00:03<01:32,  3.59it/s][h264 @ 0x3abe98c0] mmco: unref short failure
Trích xuất từ video 1/4:   5%|▍         | 17/349 [00:04<01:21,  4.07it/s][h264 @ 0x3abe98c0] mmco: unref short failure
Trích xuất từ video 1/4:   5%|▌         | 18/349 [00:04<01:14,  4.44it/s][h264 @ 0x3abe98c0] mmco: unref short failure
Trích xuất từ video 1/4:  92%|█████████▏| 321/349 [01:36<00:08,  3.40it/s][h264 @ 0x3abe98c0] mmco: unref short failure
Trích xuất từ video 1/4: 100%|██████████| 349/349 [01:43<00:00,  3.36it/s]
Trích xuất từ video 2/4:  19%|█▉        | 72/384 [00:20<01:27,  3.58it/s][h264 @ 0x3ab5b1c0] mmco: unref short failure
Trích xuất từ video 2/4:  36%|███▋      | 140/384 [00:43<01:21,  2.98it/s][h264 @ 0x3ab5b1c0] mmco: unref short failure
[h264 @ 0x3ab5b1c0] mmco: unref short failure
Trích xuất từ video 2/4:  42%|████▏     | 160/384 [00:49<00:49,  4.52it/s][h264 @ 0x3ab5b1c0] mmco: unref short failure
Trích xuất từ video 2/4:  52%|█████▏    | 1

In [None]:
# Embed every path in image_paths and store the result on disk.
save_image_embeddings(image_paths, save_path="image_embeddings.pkl")

In [None]:
# Run one text query against the saved embeddings and keep the ten best hits.
query_vi = "Băng tan ở Nam Cực"
results = search_frames_with_saved_embeddings(
    image_paths,
    query_vi,
    topk=10,
    save_dir="hits",
    embedding_path="image_embeddings.pkl",
)