In [None]:
!pip install -U -q transformers accelerate

In [None]:
%%writefile infer.py
from transformers import AutoModel, AutoTokenizer
import numpy as np
import torch
import torchvision.transforms as T
from torch import autocast, inference_mode
# from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from functools import lru_cache
import cv2
import matplotlib.pyplot as plt
import os
import glob
import csv
from tqdm import tqdm
import torch.multiprocessing as mp

# Global PyTorch performance switches, applied once at import time (and again
# in every spawned worker, since each worker re-imports this module).
# Let cuDNN benchmark and pick convolution algorithms for the shapes it sees.
torch.backends.cudnn.benchmark = True
# Allow reduced-precision (e.g. TF32) kernels for float32 matrix multiplies.
torch.set_float32_matmul_precision("high")

# Per-channel normalisation statistics, shaped (3, 1, 1) so they broadcast over
# the channel-first (C, H, W) tensors built in to_tensor_and_normalize.
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

@lru_cache(maxsize=None)
def get_target_ratios(min_num, max_num):
    """Return every tile grid whose tile count lies in [min_num, max_num].

    Each grid is a ``(cols, rows)`` tuple of positive ints (the first element
    scales the width, the second the height). The result contains each grid
    exactly once and is sorted by tile count ``cols * rows``; grids with the
    same tile count keep the order in which the enumeration first produces
    them.

    The returned list is cached and shared between callers, so it must be
    treated as read-only.

    Args:
        min_num: Smallest allowed number of tiles.
        max_num: Largest allowed number of tiles.

    Returns:
        A list of unique ``(cols, rows)`` tuples; empty when
        ``min_num > max_num``.
    """
    candidates = (
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    )
    # The enumeration above yields the same grid once per `n`; dict.fromkeys
    # drops the repeats while keeping first-seen order, so the stable sort
    # below orders equal-area grids exactly as before.
    unique = dict.fromkeys(candidates)
    return sorted(unique, key=lambda grid: grid[0] * grid[1])

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the grid from *target_ratios* whose shape best matches the image.

    Candidates are scanned in order. A candidate replaces the current choice
    when its width/height ratio is strictly closer to *aspect_ratio*, or when
    it is exactly as close and the image area exceeds half the pixel area the
    candidate grid would cover at *image_size* per tile.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` grids to choose from.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile in pixels.

    Returns:
        The selected ``(cols, rows)`` grid, or ``(1, 1)`` if no candidate
        was accepted.
    """
    winner = (1, 1)
    smallest_gap = float('inf')
    pixel_count = width * height
    half_tile_area = 0.5 * image_size * image_size

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        strictly_closer = gap < smallest_gap
        tie_on_large_image = (
            gap == smallest_gap
            and pixel_count > half_tile_area * candidate[0] * candidate[1]
        )
        if strictly_closer or tie_on_large_image:
            winner = candidate
            smallest_gap = gap

    return winner

def dynamic_preprocess_cv2(image_np, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto the best-fitting tile grid and cut it into tiles.

    The grid is chosen by find_closest_aspect_ratio from the grids returned
    by get_target_ratios(min_num, max_num). The image is resized (bicubic) to
    exactly ``cols * image_size`` by ``rows * image_size`` pixels and sliced
    row by row into ``image_size`` x ``image_size`` crops.

    Args:
        image_np: Image array whose first two axes are (height, width) and
            which has a third (channel) axis.
        min_num: Minimum number of tiles in the grid.
        max_num: Maximum number of tiles in the grid.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: When true and the tile count is not exactly one, append
            a whole-image thumbnail resized to ``image_size`` x ``image_size``.

    Returns:
        A list of tile arrays in row-major order, optionally followed by the
        thumbnail.
    """
    src_h, src_w = image_np.shape[:2]
    grid = find_closest_aspect_ratio(
        src_w / src_h,
        get_target_ratios(min_num, max_num),
        src_w,
        src_h,
        image_size,
    )

    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    n_cols = canvas_w // image_size
    n_rows = canvas_h // image_size

    canvas = cv2.resize(image_np, (canvas_w, canvas_h), interpolation=cv2.INTER_CUBIC)

    tiles = []
    for row_idx in range(n_rows):
        top = row_idx * image_size
        for col_idx in range(n_cols):
            left = col_idx * image_size
            tiles.append(canvas[top:top + image_size, left:left + image_size, :])

    # A single tile already shows the whole image, so no thumbnail is added.
    if use_thumbnail and len(tiles) != 1:
        tiles.append(
            cv2.resize(image_np, (image_size, image_size), interpolation=cv2.INTER_CUBIC)
        )

    return tiles

def to_tensor_and_normalize(img_list):
    """Convert BGR image tiles into one normalised float tensor.

    Each tile is converted from BGR to RGB, moved to channel-first layout,
    scaled to [0, 1] by dividing by 255 and normalised with IMAGENET_MEAN /
    IMAGENET_STD.

    Args:
        img_list: Iterable of equally sized BGR image arrays (H, W, 3).

    Returns:
        A tensor stacking the converted tiles along a new first dimension.
    """
    def _convert(bgr_tile):
        rgb_tile = cv2.cvtColor(bgr_tile, cv2.COLOR_BGR2RGB)
        chw = torch.from_numpy(rgb_tile).permute(2, 0, 1).float() / 255.0
        return (chw - IMAGENET_MEAN) / IMAGENET_STD

    return torch.stack([_convert(tile) for tile in img_list])

def load_image(image_file, input_size=448, max_num=12):
    """Read an image from disk and return its normalised tile tensor.

    The image is tiled with dynamic_preprocess_cv2 (thumbnail enabled) and
    converted with to_tensor_and_normalize.

    Args:
        image_file: Path of the image to read.
        input_size: Side length of each square tile in pixels.
        max_num: Maximum number of tiles (the thumbnail is extra).

    Returns:
        A float tensor with one entry per tile along the first dimension.

    Raises:
        FileNotFoundError: If *image_file* does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    image_np = cv2.imread(image_file)  # BGR
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than later on `.shape`.
    if image_np is None:
        if not os.path.exists(image_file):
            raise FileNotFoundError(f"Image file not found: {image_file}")
        raise ValueError(f"Could not decode image file: {image_file}")
    images = dynamic_preprocess_cv2(image_np, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = to_tensor_and_normalize(images)
    return pixel_values

def _load_model_and_tokenizer(device):
    """Load the Vintern checkpoint in bfloat16 on *device*, plus its tokenizer."""
    model = AutoModel.from_pretrained(
        "5CD-AI/Vintern-1B-v3_5",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        use_flash_attn=True
    ).eval().to(device)

    tokenizer = AutoTokenizer.from_pretrained(
        "5CD-AI/Vintern-1B-v3_5",
        trust_remote_code=True
    )
    return model, tokenizer


def _to_gpu(pixels, rank):
    """Cast *pixels* to bfloat16, pin the host copy and copy it to GPU *rank*."""
    # Cast first: the dtype conversion allocates a new tensor, so pinning
    # before it would leave the tensor that is actually copied unpinned.
    return pixels.to(torch.bfloat16).pin_memory().cuda(rank, non_blocking=True)


def workers(rank, all_chunks, batch_size, question, generation_config):
    """Inference worker for one GPU.

    Processes ``all_chunks[rank]`` on ``cuda:{rank}`` and appends one
    ``frame_name,response`` row per image to ``inference_results_{rank}.csv``.
    Frames whose basename is already listed in that CSV are skipped, so an
    interrupted run can be resumed.

    Each batch is first sent to ``model.chat`` as one stacked tensor with a
    list of questions; if that call fails (or the images in the batch have
    different tile counts and cannot be stacked) every image is sent through
    ``model.chat`` on its own.

    Args:
        rank: Index of this worker; also the CUDA device index.
        all_chunks: Sequence of path lists, one per worker.
        batch_size: Number of images loaded per iteration.
        question: Prompt passed to the model for every image.
        generation_config: Generation settings forwarded to ``model.chat``.
    """
    paths = all_chunks[rank]
    torch.set_grad_enabled(False)
    torch.cuda.set_device(rank)
    device = f"cuda:{rank}"
    print(f"Process {rank} using {device}")

    temp_csv = f"inference_results_{rank}.csv"
    processed_local = set()

    # Load already processed frames for this GPU's temp CSV
    if os.path.exists(temp_csv):
        with open(temp_csv, "r", newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            for row in reader:
                if row:
                    processed_local.add(row[0])

    # Filter out processed frames from this worker's list
    paths = [p for p in paths if os.path.basename(p) not in processed_local]
    print(f"GPU{rank}: {len(paths)} images left after skipping {len(processed_local)} already processed locally.")

    # An existing but empty file still needs its header row.
    write_header = not os.path.exists(temp_csv) or os.path.getsize(temp_csv) == 0
    with open(temp_csv, "a", newline="", encoding="utf-8", buffering=1) as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["frame_name", "response"])

        # Nothing to do: leave a valid CSV behind and skip the model load.
        if not paths:
            return

        model, tokenizer = _load_model_and_tokenizer(device)
        last_batch_error = None

        for i in tqdm(range(0, len(paths), batch_size), desc=f"GPU{rank}", unit="batch"):
            batch_paths = paths[i:i+batch_size]
            tiles = [load_image(p, max_num=6) for p in batch_paths]
            # The tile count depends on each image's aspect ratio, so tensors
            # in one batch can only be stacked when their shapes agree.
            stackable = all(t.shape == tiles[0].shape for t in tiles)

            with inference_mode(), autocast(device_type="cuda", dtype=torch.bfloat16):
                responses = None
                if stackable:
                    batch_pixels = _to_gpu(torch.stack(tiles), rank)
                    per_image = batch_pixels
                    try:
                        responses = model.chat(tokenizer, batch_pixels, [question] * len(batch_paths), generation_config)
                        # Guard against a reply that cannot be paired 1:1 with the paths.
                        if isinstance(responses, str) or len(responses) != len(batch_paths):
                            raise ValueError("batched chat did not return one response per image")
                    except Exception as exc:
                        responses = None
                        # Report each distinct failure once instead of hiding it.
                        if repr(exc) != last_batch_error:
                            last_batch_error = repr(exc)
                            tqdm.write(f"GPU{rank}: batched chat failed ({exc!r}); falling back to per-image chat.")
                else:
                    per_image = (_to_gpu(t, rank) for t in tiles)

                if responses is None:
                    responses = [model.chat(tokenizer, img, question, generation_config) for img in per_image]

            # Write results immediately
            for path, resp in zip(batch_paths, responses):
                frame_name = os.path.basename(path)
                writer.writerow([frame_name, resp])
            f.flush()  # ensure the batch is handed to the OS right away



if __name__ == "__main__":
    # Entry point: select the images still to be captioned, take this user's
    # share, run one worker per GPU and merge the per-GPU CSVs.
    mp.set_start_method("spawn", force=True)

    def read_frame_names(csv_path):
        """Return the set of non-empty ``frame_name`` values in a results CSV."""
        # utf-8-sig transparently drops a BOM if the file has one.
        with open(csv_path, "r", newline="", encoding="utf-8-sig") as f:
            return {row["frame_name"] for row in csv.DictReader(f) if row["frame_name"]}

    root_dir = "/kaggle/input/aic2025/keyframes"
    # Read with the csv module: pandas was never imported in this script
    # (the former `pd.read_csv` calls raised NameError).
    r0 = read_frame_names("/kaggle/input/tmp-old-inf/old_inference_results_0.csv")
    r1 = read_frame_names("/kaggle/input/tmp-old-inf/old_inference_results_1.csv")

    partid = 0 # PART here: 0: Dat, 1: Huan, 2: Khoa, 3: Tuan, 4: Phat

    output_csv = "inference_results.csv"
    batch_size = 4
    num_gpus = 2

    generation_config = dict(
        max_new_tokens=512,
        do_sample=False,
        num_beams=2,
        repetition_penalty=3.0,
        early_stopping=True
    )
    question = '<image>\nMô tả chi tiết các vật, màu sắc và chữ trong ảnh.'

    image_paths = sorted(glob.glob(f"{root_dir}/**/*.jpg", recursive=True))
    print(f"Found {len(image_paths)} images.")

    # A frame is done if it appears in EITHER old results file. (An
    # intersection would skip only frames present in both files.)
    # NOTE(review): the 5-way split below is computed from the filtered list,
    # so every participant must run with this same filter.
    existed_results = r0 | r1

    image_paths = [p for p in image_paths if os.path.splitext(os.path.basename(p))[0] not in existed_results]

    print(f"Remaining {len(image_paths)} images.")

    def split_list(lst, n_parts=5):
        """Split *lst* into *n_parts* contiguous slices whose sizes differ by at most one."""
        k, m = divmod(len(lst), n_parts)  # k = base size, m = remainder
        return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n_parts)]

    image_paths = split_list(image_paths, n_parts=5)[partid]

    print(f"Your part: {len(image_paths)} images.")

    # Interleaved split: GPU i gets every num_gpus-th path starting at i.
    chunks = [image_paths[i::num_gpus] for i in range(num_gpus)]
    mp.spawn(workers, args=(chunks, batch_size, question, generation_config), nprocs=num_gpus)

    # Merge CSV files
    with open(output_csv, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["frame_name", "response"])
        for rank in range(num_gpus):
            # newline="" keeps the rows byte-for-byte as the csv writer produced them.
            with open(f"inference_results_{rank}.csv", "r", newline="", encoding="utf-8") as infile:
                next(infile, None)  # skip header; tolerate an empty file
                outfile.writelines(infile)

In [None]:
!python /kaggle/working/infer.py