# Cài đặt thư viện

In [1]:
!pip install open-clip-torch

Collecting open-clip-torch
  Downloading open_clip_torch-3.1.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm>=1.0.17 (from open-clip-torch)
  Downloading timm-1.0.19-py3-none-any.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12=

# Import thư viện

In [2]:
# # -*- coding: utf-8 -*-
import os, cv2, torch, numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display
import pickle
import numpy as np
import logging

# Root logger (no name is passed); used by the frame-extraction helper for info/error messages.
logger = logging.getLogger() 

# Hàm embedding

In [3]:
# ======= BACKEND SELECTION =======
# Taken from the MM_BACKEND environment variable, defaulting to "openclip".
# Supported values: "openclip" | "siglip2".
BACKEND = os.environ.get("MM_BACKEND", "openclip")

# ======= MODEL LOADING =======
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Reject anything other than the two supported backends before loading a model.
if BACKEND not in ("openclip", "siglip2"):
    raise ValueError("BACKEND phải là 'openclip' hoặc 'siglip2'.")

if BACKEND == "siglip2":
    # pip install -U transformers accelerate bitsandbytes
    from transformers import AutoModel, AutoProcessor

    # The checkpoint can be overridden through the SIGLIP2_CKPT environment variable.
    CKPT = os.environ.get("SIGLIP2_CKPT", "google/siglip2-giant-opt-patch16-384")
    processor = AutoProcessor.from_pretrained(CKPT)
    model = AutoModel.from_pretrained(CKPT, device_map="auto")
    model = model.eval()
else:
    import open_clip

    OC_MODEL = "ViT-g-14"
    OC_PRETRAINED = "laion2b_s34b_b88k"
    # The middle return value of create_model_and_transforms is not used here.
    model, _, preprocess = open_clip.create_model_and_transforms(
        OC_MODEL, pretrained=OC_PRETRAINED, device=DEVICE
    )
    tokenizer = open_clip.get_tokenizer(OC_MODEL)
    model.eval()

def extract_frames_with_opencv(video_path: str, target_height: int = 27, target_width: int = 48, show_progressbar: bool = False):
    """
    Decode every frame of a video with OpenCV, convert it to RGB and resize it.

    Args:
        video_path: Path of the video file to read.
        target_height: Height in pixels of each returned frame.
        target_width: Width in pixels of each returned frame.
        show_progressbar: When True, show a tqdm progress bar while decoding.

    Returns:
        np.ndarray of shape (num_frames, target_height, target_width, 3) with the
        RGB frames in decoding order. When no frame could be read, an empty array
        of shape (0, target_height, target_width, 3) is returned.

    Raises:
        ValueError: If the video cannot be opened.
    """
    logger.info(f"Opening video: {video_path}")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f"Failed to open video: {video_path}")
        raise ValueError(f"Failed to open video: {video_path}")

    frames = []
    progress_bar = None
    try:
        if show_progressbar:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Some containers report no frame count; let tqdm run without a total then.
            progress_bar = tqdm(
                total=total_frames if total_frames > 0 else None,
                desc="Extracting frames",
                unit="frame",
            )

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; convert to RGB before resizing.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # cv2.resize takes the target size as (width, height).
            frames.append(cv2.resize(frame_rgb, (target_width, target_height)))
            # Compare against None explicitly: a tqdm bar's truthiness depends on
            # its total, so `if progress_bar:` can be False for a live bar.
            if progress_bar is not None:
                progress_bar.update(1)
    finally:
        # Always free the capture handle and the bar, even if decoding fails midway.
        cap.release()
        if progress_bar is not None:
            progress_bar.close()

    logger.info(f"Extracted {len(frames)} frames")
    if not frames:
        # Keep the 4-D layout so callers can rely on the shape even for empty videos.
        # NOTE(review): uint8 assumes the usual 8-bit output of VideoCapture — confirm.
        return np.empty((0, target_height, target_width, 3), dtype=np.uint8)
    return np.array(frames)

# ======= VIDEO FRAME SAMPLER =======
def sample_frames(video_path, every_ms=500, max_frames=None,
                  time_window="first_half", start_sec=None, end_sec=None):
    """
    Read a video and keep one RGB frame every `every_ms` milliseconds (converted
    to a whole number of frames) inside a time window.

    Args:
        video_path: Path of the video file.
        every_ms: Sampling interval in milliseconds. It is converted to a frame
            step using the video's FPS (25.0 is assumed when the FPS is not
            reported); the step is never smaller than one frame.
        max_frames: Stop once this many frames were sampled. None (or 0) means
            no limit.
        time_window: "first_half" | "second_half"; any other value (e.g. None)
            selects the whole video. Only used when neither start_sec nor
            end_sec is given and the duration is known.
        start_sec: Start of the window in seconds; takes precedence over
            time_window. Defaults to 0 when only end_sec is given. The seek to
            this position is only attempted when the duration is known.
        end_sec: End of the window in seconds; takes precedence over
            time_window. Defaults to the end of the video when only start_sec
            is given.

    Returns:
        (frames, times): a list of RGB frames (np.ndarray) and a list with the
        timestamp in seconds recorded for each of them.

    Raises:
        AssertionError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    # Raised explicitly (same type and message as the former `assert`) so the
    # check is not stripped when Python runs with -O.
    if not cap.isOpened():
        raise AssertionError(f"Không mở được video: {video_path}")

    frames, times = [], []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        dur = (n_frames / fps) if n_frames > 0 else None

        # Resolve the reading window.
        if start_sec is None and end_sec is None:
            if dur is None:
                start_sec, end_sec = 0.0, float("inf")
            elif time_window == "first_half":
                start_sec, end_sec = 0.0, dur * 0.5
            elif time_window == "second_half":
                start_sec, end_sec = dur * 0.5, dur
            else:
                start_sec, end_sec = 0.0, dur  # whole video
        else:
            # Only one bound may have been supplied; fill in the other one so the
            # arithmetic and comparisons below never see None.
            if start_sec is None:
                start_sec = 0.0
            if end_sec is None:
                end_sec = dur if dur is not None else float("inf")

        # Seek to the first frame of the window (only when the duration is known).
        if dur is not None:
            start_frame = int(start_sec * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        step = max(1, int(fps * (every_ms / 1000.0)))
        # Frame index as reported by the capture after the (optional) seek.
        i = int(cap.get(cv2.CAP_PROP_POS_FRAMES))

        with tqdm(desc="Đọc video (giới hạn)", unit="f") as pbar:
            while True:
                # Stop once the window end is passed. When POS_MSEC is not
                # available (reported as 0), estimate the position from the
                # frame index, like the timestamp fallback below.
                pos_msec = cap.get(cv2.CAP_PROP_POS_MSEC)
                cur_sec = pos_msec / 1000.0 if pos_msec > 0 else i / fps
                if cur_sec > end_sec:
                    break

                ret, fr = cap.read()
                if not ret:
                    break

                if i % step == 0:
                    frames.append(cv2.cvtColor(fr, cv2.COLOR_BGR2RGB))

                    # Timestamp read after the frame was decoded; fall back to
                    # an estimate from the frame index when it is not reported.
                    pos_msec2 = cap.get(cv2.CAP_PROP_POS_MSEC)
                    if pos_msec2 and pos_msec2 > 0:
                        t_sec = pos_msec2 / 1000.0
                    else:
                        t_sec = i / fps
                    times.append(t_sec)

                    if max_frames and len(frames) >= max_frames:
                        break

                i += 1
                pbar.update(1)
    finally:
        # Release the capture handle even if decoding raises.
        cap.release()

    return frames, times

# ======= EMBEDDING FUNCS (tuỳ BACKEND) =======
@torch.no_grad()
def embed_images(frames, batch_size=64):
    """
    Encode RGB frames into L2-normalised image embeddings using the backend
    selected by the module-level BACKEND ("openclip" or "siglip2").

    Args:
        frames: Sequence of np.ndarray (H, W, 3) in RGB — a list, or an array
            stacked along the first axis.
        batch_size: Number of frames per forward pass; must be positive.

    Returns:
        np.ndarray [N, D] of float32, one L2-normalised row per input frame.

    Raises:
        ValueError: If `frames` is empty, `batch_size` is not positive, or
            BACKEND is not a supported value.
    """
    total = len(frames)
    # Fail early with a clear message instead of an opaque np.vstack / range error.
    if total == 0:
        raise ValueError("embed_images: `frames` is empty, nothing to embed.")
    if batch_size <= 0:
        raise ValueError(f"embed_images: batch_size must be positive, got {batch_size}.")
    if BACKEND not in ("openclip", "siglip2"):
        raise ValueError(f"embed_images: unsupported BACKEND {BACKEND!r}.")

    vecs = []

    if BACKEND == "openclip":
        for i in tqdm(range(0, total, batch_size), desc="Embedding (OpenCLIP)", unit="batch"):
            batch = [preprocess(Image.fromarray(x)) for x in frames[i:i+batch_size]]
            batch = torch.stack(batch).to(DEVICE)
            feats = model.encode_image(batch)
            feats = feats / feats.norm(dim=-1, keepdim=True)
            vecs.append(feats.float().cpu().numpy())

    else:  # "siglip2"
        for i in tqdm(range(0, total, batch_size), desc="Embedding (SigLIP2)", unit="batch"):
            # Convert to PIL one batch at a time so the whole video is not
            # duplicated in memory before the first forward pass.
            batch = [Image.fromarray(x) for x in frames[i:i+batch_size]]
            inputs = processor(images=batch, return_tensors="pt").to(model.device)
            img_feats = model.get_image_features(**inputs)  # (B, D)
            img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
            vecs.append(img_feats.float().cpu().numpy())

    return np.vstack(vecs)

open_clip_model.safetensors:   0%|          | 0.00/5.47G [00:00<?, ?B/s]

# Hàm thực thi 

In [4]:
# ======= DEMO (optional) =======
# Runs the pipeline on one sample video: decode its frames, then embed them.
# `video_path` and `embeddings` are also read by the cell that saves the features.
if __name__ == "__main__":
    video_path = "/kaggle/input/aic-sample-test/L21_V002.mp4"

    # NOTE(review): extract_frames_with_opencv is called with its default size
    # (frames resized to 48x27), and every decoded frame is embedded — confirm
    # this resolution and frame rate are intended for the image encoder.
    frames = extract_frames_with_opencv(video_path, show_progressbar=True)
    embeddings = embed_images(frames)

Extracting frames: 100%|██████████| 31720/31720 [01:12<00:00, 438.91frame/s]
Embedding (OpenCLIP): 100%|██████████| 496/496 [48:48<00:00,  5.90s/batch]


# Lưu embedding

In [5]:
import os


# File name of the video without its directory (still includes ".mp4").
filename_with_ext = os.path.basename(video_path)

# Drop the extension so only the video id remains (e.g. "L21_V002").
filename = os.path.splitext(filename_with_ext)[0]
print(filename)

# Pickle the embedding matrix into the Kaggle working directory.
save_path = f"/kaggle/working/{filename}_features.pkl"
print(save_path)
with open(save_path, "wb") as out_file:
    pickle.dump(embeddings, out_file)

L21_V002
/kaggle/working/L21_V002_features.pkl
