# Import thư viện

In [57]:
import numpy as np
from tqdm.notebook import tqdm
from PIL import Image
import time

In [18]:
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    force=True,                 # ép ghi đè cấu hình cũ (rất quan trọng trong notebook)
)

logger = logging.getLogger("Embedd Frame")
logger.info("Xin chào")

2025-09-02 09:47:39,862 - Embedd Frame - INFO - Xin chào


In [1]:
!git clone https://github.com/hein-nkhh/unilm.git
%cd unilm/beit3

Cloning into 'unilm'...
remote: Enumerating objects: 11122, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 11122 (delta 31), reused 19 (delta 18), pack-reused 11077 (from 3)[K
Receiving objects: 100% (11122/11122), 75.39 MiB | 29.74 MiB/s, done.
Resolving deltas: 100% (5248/5248), done.
Updating files: 100% (6051/6051), done.
/kaggle/working/unilm/beit3


In [4]:
from IPython.display import clear_output

In [5]:
!pip install -r requirements.txt
clear_output()

In [6]:
import os
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from modeling_finetune import beit3_large_patch16_384_retrieval
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.cuda.amp import autocast
from transformers import XLMRobertaTokenizer
import torch
import json
import cv2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
tokenizer = XLMRobertaTokenizer("/kaggle/input/beit3_base_retrieval/pytorch/default/2/beit3.spm")

# Mô hình beit_3
ckpt = "/kaggle/input/beit3_base_retrieval/pytorch/default/2/beit3_large_patch16_384_coco_retrieval.pth"
model = beit3_large_patch16_384_retrieval(pretrained=False)
state_dict = torch.load(ckpt, map_location=device)
model.load_state_dict(state_dict["model"], strict=False)
model = model.to(device)
model.eval()
clear_output()

transform  = transforms.Compose([
    transforms.Resize((384, 384), interpolation=3), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5])
])

In [72]:
def extract_frames_with_opencv(
    video_path: str, 
    target_height: int = 27, 
    target_width: int = 48, 
    target_fps: float = None,        # thêm tuỳ chọn fps
    show_progressbar: bool = False
):
    """
    Extracts frames from a video using OpenCV and returns a list of PIL Images.
    If target_fps is set, frames will be sampled to match that FPS.
    """
    logger.info(f"Opening video: {video_path}")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f"Failed to open video: {video_path}")
        raise ValueError(f"Failed to open video: {video_path}")

    video_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Nếu có target_fps thì tính step
    if target_fps is not None and target_fps > 0 and video_fps > 0:
        step = int(round(video_fps / target_fps))
        logger.info(f"Video FPS: {video_fps:.2f}, target FPS: {target_fps}, step: {step}")
    else:
        step = 1
        logger.info(f"Video FPS: {video_fps:.2f}, using all frames")

    frames = []

    progress_bar = tqdm(total=total_frames, desc="Extracting frames", unit="frame") if show_progressbar else None

    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_idx % step == 0:   # chỉ lấy frame theo step
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb, (target_width, target_height))
            img_pil = Image.fromarray(frame_resized)
            frames.append(img_pil)

        frame_idx += 1
        if progress_bar:
            progress_bar.update(1)

    cap.release()
    if progress_bar:
        progress_bar.close()
    logger.info(f"Extracted {len(frames)} frames (from {total_frames})")
    return frames

In [77]:
image_paths = extract_frames_with_opencv('/kaggle/input/aic-sample-test/videos/L21_V001.mp4',
                                         show_progressbar=True)

2025-09-02 10:29:59,651 - Embedd Frame - INFO - Opening video: /kaggle/input/aic-sample-test/videos/L21_V001.mp4
2025-09-02 10:29:59,694 - Embedd Frame - INFO - Video FPS: 30.00, using all frames


Extracting frames:   0%|          | 0/37849 [00:00<?, ?frame/s]

2025-09-02 10:31:33,375 - Embedd Frame - INFO - Extracted 37849 frames (from 37849)


In [78]:
len(image_paths)

37849

In [79]:
# path = "/kaggle/input/aic-small-2024/Keyframes_L21/keyframes/L21_V001"
# image_paths = [os.path.join(path, name) for name in os.listdir(path)]

embeddings = []
ids = []

with torch.no_grad():
    start = time.time()
    for image in tqdm(image_paths, desc="🔄 Extracting image embeddings"):
        # image = Image.open(img_path).convert("RGB")
        image_tensor = transform(image).unsqueeze(0).to(device)

        with autocast(): 
            vision_cls, _ = model(image=image_tensor, only_infer=True)
            vision_norm = F.normalize(vision_cls, p=2, dim=-1)

        embeddings.append(vision_norm.squeeze(0).cpu())   # (D,)
        ids.append(img_path)

        del image_tensor, vision_cls, vision_norm
        torch.cuda.empty_cache()
    end = time.time()
    logger.info(f'Thời gian embedd {end-start}')

image_embeddings = torch.stack(embeddings, dim=0)  # (N,D)
torch.save({"embeddings": image_embeddings, "ids": ids}, "image_embeddings.pt")
print("✅ Saved embeddings:", image_embeddings.shape)

🔄 Extracting image embeddings:   0%|          | 0/37849 [00:00<?, ?it/s]

  with autocast():
2025-09-02 11:27:51,747 - Embedd Frame - INFO - Thời gian embedd 3378.319060564041


✅ Saved embeddings: torch.Size([37849, 1024])


In [80]:
image_embeddings.shape

torch.Size([37849, 1024])

In [83]:
import pickle
with open("/kaggle/working/L21_V001_embeddings.pkl", "wb") as f:
    pickle.dump(image_embeddings, f)

print("✅ Saved embeddings.pkl")

✅ Saved embeddings.pkl


In [84]:
with open("/kaggle/working/L21_V001_embeddings.pkl", "rb") as f:
    loaded_embeddings = pickle.load(f)

print(type(loaded_embeddings))   # torch.Tensor hoặc numpy.ndarray
print(loaded_embeddings.shape)

<class 'torch.Tensor'>
torch.Size([37849, 1024])


In [85]:
loaded_embeddings[0]

tensor([-0.0125, -0.0231,  0.0123,  ...,  0.0028, -0.0030, -0.0435])

In [86]:
image_embeddings[0]

tensor([-0.0125, -0.0231,  0.0123,  ...,  0.0028, -0.0030, -0.0435])