In [2]:
!apt-get update -y && apt-get install -y cmake build-essential

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2149 kB]
Get:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
0% [4 Packages store 0 B] [Waiting for headers]                                

Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3532 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.5 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [60.9 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1290 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [5988 kB]
Get:14 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]    
Get:15 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]
Get:16 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 Packages [266 kB]
Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Package

In [3]:
!pip install --index-url https://download.pytorch.org/whl/cu126 torch==2.7.1 torchvision==0.22.1

!pip install transformers==4.30 datasets==4.4.0 tqdm==4.67.1
!pip install opencv-python-headless==4.10.0.82 numpy==1.26.4 scikit-learn==1.3.2 scipy==1.11.4
!pip install pandas Pillow
!pip install -U dlib

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch==2.7.1
  Downloading https://download.pytorch.org/whl/cu126/torch-2.7.1%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision==0.22.1
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.22.1%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting typing-extensions>=4.10.0 (from torch==2.7.1)
  Downloading https://download.pytorch.org/whl/typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sympy>=1.13.3 (from torch==2.7.1)
  Downloading https://download.pytorch.org/whl/sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch==2.7.1)
  Downloading https://download.pytorch.org/whl/cu126/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch==2.7.1)
  Downloading https://download.pytorch.org/whl/cu126/nv

-----

In [None]:
import os, json, csv, cv2, dlib, torch, numpy as np
import torch.nn.functional as F

from PIL import Image
from pathlib import Path
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoFeatureExtractor

from src import DeepfakeModel


# Directory passed to both the processor loader and load_custom_model; the
# latter reads custom_config.json and model.bin from it.
MODEL_DIR   = "./model/clip_base"
# Directory whose direct child files are treated as the test set.
TEST_DIR    = Path("./data")
# Output CSV: a header row followed by one (filename, label) row per test file.
SUBMIT_CSV  = Path("submission.csv")

# Lower-cased file suffixes that route a file to the image branch or the video
# branch of process_single_file; any other suffix yields no face crops.
IMAGE_EXTS = {".jpg", ".jpeg", ".png"}
VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv"}

# Number of face crops fed to the model per file. This is only the initial
# value: the __main__ section rebinds it to the checkpoint's num_frames.
NUM_FRAMES = 12

# dlib frontal face detector, built once at import time and shared by every
# detect_and_crop_face call.
detector = dlib.get_frontal_face_detector()

def load_processor(model_dir):
    """Load the image pre-processor saved with the model.

    ``AutoImageProcessor`` is tried first; if it raises, the legacy
    ``AutoFeatureExtractor`` is used as a fallback.

    Args:
        model_dir: Location handed unchanged to ``from_pretrained``.

    Returns:
        The processor object produced by whichever loader succeeded.

    Raises:
        Exception: Whatever ``AutoFeatureExtractor.from_pretrained`` raises
            when the fallback fails as well.
    """
    try:
        return AutoImageProcessor.from_pretrained(model_dir)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still be able to stop the notebook instead of silently
        # triggering the fallback loader.
        return AutoFeatureExtractor.from_pretrained(model_dir)

def get_boundingbox(face, width, height, scale=1.3):
    """Turn a detected face rectangle into an enlarged square crop box.

    The square is centred on the face, its side is the longer face edge
    multiplied by ``scale``, its origin is clamped to the top-left image
    corner, and its side is shrunk so it does not run past the right or
    bottom image border.

    Args:
        face: Object exposing ``left()``, ``top()``, ``right()`` and
            ``bottom()`` (e.g. a ``dlib.rectangle``) in image pixels.
        width: Image width in pixels.
        height: Image height in pixels.
        scale: Enlargement factor applied to the longer face edge.
            Defaults to 1.3, the value that was previously hard-coded.

    Returns:
        Tuple ``(x1, y1, size)``: top-left corner and side length of the
        square. ``size`` is 0 when the box origin lies on or beyond the
        right/bottom image border.
    """
    left, top, right, bottom = face.left(), face.top(), face.right(), face.bottom()
    size_bb = int(max(right - left, bottom - top) * scale)
    center_x, center_y = (left + right) // 2, (top + bottom) // 2

    # Clamp the origin so the crop never starts at a negative index.
    x1 = max(int(center_x - size_bb // 2), 0)
    y1 = max(int(center_y - size_bb // 2), 0)

    # Shrink to fit inside the image. The outer max() keeps the side from
    # going negative when the origin is outside the frame; a negative side
    # would otherwise turn into wrap-around slice indices at the call site.
    size_bb = max(0, min(size_bb, width - x1, height - y1))
    return x1, y1, size_bb

def detect_and_crop_face(image: Image.Image, target_size=(224, 224), resize_for_detection=640):
    """Find the largest face in ``image`` and return it as a square crop.

    Detection runs on a copy downscaled to ``resize_for_detection`` pixels
    wide (only when the image is wider than that); the winning rectangle is
    mapped back to full resolution before cropping, so the crop is taken
    from the original pixels.

    Args:
        image: Input picture; converted to RGB when it is in another mode.
        target_size: ``(width, height)`` the crop is resized to.
        resize_for_detection: Maximum width of the array given to the
            detector.

    Returns:
        A PIL image of size ``target_size``, or ``None`` when no face is
        detected or the resulting crop is empty.
    """
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    pixels = np.array(rgb_image)
    img_h, img_w, _ = pixels.shape

    # Only the width decides whether a smaller detection copy is made.
    scale = 1.0
    detection_input = pixels
    if img_w > resize_for_detection:
        scale = resize_for_detection / float(img_w)
        detection_input = cv2.resize(
            pixels,
            (resize_for_detection, int(img_h * scale)),
            interpolation=cv2.INTER_AREA,
        )

    # The second argument is the detector's upsampling count.
    detections = detector(detection_input, 1)
    if not detections:
        return None

    def _area(rect):
        return rect.width() * rect.height()

    largest = max(detections, key=_area)

    # Undo the detection downscale so coordinates refer to the original image.
    edges = (largest.left(), largest.top(), largest.right(), largest.bottom())
    left, top, right, bottom = [int(edge / scale) for edge in edges]
    full_res_face = dlib.rectangle(left, top, right, bottom)

    x0, y0, side = get_boundingbox(full_res_face, img_w, img_h)
    face_pixels = pixels[y0:y0 + side, x0:x0 + side]
    if face_pixels.size == 0:
        return None

    return Image.fromarray(face_pixels).resize(target_size, Image.BICUBIC)

def process_single_file(path: Path):
    """Extract face crops from one image or video file.

    Images yield the single detected face repeated ``NUM_FRAMES`` times.
    Videos are sampled at ``NUM_FRAMES`` evenly spaced frame indices and
    every sampled frame in which a face is found contributes one crop, so a
    video may yield fewer than ``NUM_FRAMES`` crops. Files whose suffix is in
    neither ``IMAGE_EXTS`` nor ``VIDEO_EXTS`` yield no crops.

    Args:
        path: File to process; the lower-cased suffix selects the branch.

    Returns:
        Tuple ``(filename, faces, error)``. ``faces`` is a list of PIL
        images (possibly empty). ``error`` is ``None`` on success; if any
        exception was raised it is ``str(exception)`` and ``faces`` is
        empty, even if some crops had already been collected.
    """
    faces = []
    ext = path.suffix.lower()

    try:
        if ext in IMAGE_EXTS:
            # The context manager closes the file handle as soon as the crop
            # (a fresh in-memory image) has been produced.
            with Image.open(path) as img:
                face = detect_and_crop_face(img)
            if face is not None:
                faces = [face] * NUM_FRAMES

        elif ext in VIDEO_EXTS:
            cap = cv2.VideoCapture(str(path))
            try:
                # An unreadable video reports a frame count of 0 (or a falsy
                # value), which simply results in no crops.
                total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
                if total > 0:
                    idxs = np.linspace(0, max(0, total - 1), NUM_FRAMES, dtype=int)
                    for i in idxs:
                        cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                        ok, frame = cap.read()
                        if not ok or frame is None:
                            continue
                        # OpenCV delivers BGR; the detector pipeline expects RGB.
                        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                        face = detect_and_crop_face(img)
                        if face is not None:
                            faces.append(face)
            finally:
                # Release the capture even when decoding or detection raises,
                # so a bad file does not leak the underlying handle.
                cap.release()

    except Exception as e:
        return path.name, [], str(e)

    return path.name, faces, None

def load_custom_model(model_dir: str, device: torch.device):
    """Build a ``DeepfakeModel`` and load its fine-tuned weights.

    Reads ``custom_config.json`` and ``model.bin`` from ``model_dir``.
    Checkpoint tensors whose shape differs from the freshly built model are
    dropped (and reported) before a non-strict ``load_state_dict``.

    Args:
        model_dir: Directory containing ``custom_config.json``, ``model.bin``
            and the backbone sub-directory named by ``clip_model_name``
            (default ``"clip_backbone"``).
        device: Device the model is moved to after loading.

    Returns:
        Tuple ``(model, id2label, num_frames)``: the model in eval mode on
        ``device``, the ``id2label`` mapping from the config (``None`` when
        absent) and the number of frames from the config (default 12).

    Raises:
        FileNotFoundError: If the config or checkpoint file is missing.
        KeyError: If the config has no ``num_classes`` entry.
    """
    cfg_path = os.path.join(model_dir, "custom_config.json")
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(cfg_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)

    clip_rel      = cfg.get("clip_model_name", "clip_backbone")
    clip_name     = os.path.join(model_dir, clip_rel)
    num_frames    = int(cfg.get("num_frames", 12))
    num_classes   = int(cfg["num_classes"])
    id2label      = cfg.get("id2label")
    label2id      = cfg.get("label2id")

    # NOTE(review): dtype, d2st_scale, hidden_mult and temporal_pool are fixed
    # here rather than read from the config; presumably they must match the
    # values used for training — confirm.
    model = DeepfakeModel(
        clip_model_name=clip_name,
        dtype="fp32",
        freeze_backbone=True,
        unfreeze_last_n_blocks=0,
        num_classes=num_classes,
        d2st_num_frames=num_frames,
        d2st_scale=0.25,
        hidden_mult=2,
        temporal_pool="mean",
        id2label=id2label,
        label2id=label2id,
    )

    state_path = os.path.join(model_dir, "model.bin")
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    state = torch.load(state_path, map_location="cpu")

    # Snapshot the model's parameter shapes once: state_dict() builds a new
    # mapping on every call, so calling it per key inside the loop is wasteful.
    expected_shapes = {name: tensor.shape for name, tensor in model.state_dict().items()}

    # Skip layers whose checkpoint shape does not match the model.
    mismatched = [
        key
        for key, tensor in state.items()
        if key in expected_shapes and tensor.shape != expected_shapes[key]
    ]
    for key in mismatched:
        print(f"[skip] {key}: checkpoint {state[key].shape} != model {expected_shapes[key]}")
        del state[key]

    # strict=False tolerates both the keys removed above and any keys that
    # exist on only one side; the counts are printed for inspection.
    incompatible = model.load_state_dict(state, strict=False)
    print(f"- Missing keys: {len(incompatible.missing_keys)}")
    print(f"- Unexpected keys: {len(incompatible.unexpected_keys)}")

    model.to(device).eval()
    return model, id2label, num_frames

# Inference driver: load the processor and model, classify every file in
# TEST_DIR, and write one (filename, label) row per file to SUBMIT_CSV.
if __name__ == "__main__":
    # Let cuDNN benchmark convolution algorithms for the input shapes it sees.
    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = load_processor(MODEL_DIR)
    model, id2label, num_frames = load_custom_model(MODEL_DIR, device)
    # Rebind the module-level constant so process_single_file samples as many
    # frames as the checkpoint's config specifies.
    NUM_FRAMES = num_frames

    print("model loaded:", next(model.parameters()).device)
    print("NUM_FRAMES =", NUM_FRAMES)

    # Every regular file directly inside TEST_DIR, in sorted order; files with
    # unsupported suffixes are included and end up with the fallback label.
    files = [p for p in sorted(TEST_DIR.iterdir()) if p.is_file()]
    print("Test files:", len(files))

    # CSV init: truncate any previous submission and write the header row.
    with open(SUBMIT_CSV, "w", newline="") as f:
        csv.writer(f).writerow(["filename", "label"])

    # Maps file name -> predicted class index.
    results = {}

    for p in tqdm(files, desc="Processing"):
        fname, face_images, err = process_single_file(p)

        if err:
            print(f"[WARN] {fname}: {err}")

        # No usable face crop (or an error): fall back to label 0.
        # NOTE(review): which class index 0 denotes depends on id2label — confirm
        # that this is the intended default.
        if not face_images:
            results[fname] = 0
            continue

        # Pad with the last crop when fewer than NUM_FRAMES were collected.
        while len(face_images) < NUM_FRAMES:
            face_images.append(face_images[-1])
        
        # Use exactly NUM_FRAMES crops.
        face_images = face_images[:NUM_FRAMES]

        with torch.no_grad():
            # Preprocess all crops of this file in a single processor call.
            enc = processor(images=face_images, return_tensors="pt")
            pixel_values = enc["pixel_values"].to(device)  # presumably (NUM_FRAMES, 3, H, W) — TODO confirm
            
            # Add a leading axis so the clip forms a batch of one.
            pixel_values = pixel_values.unsqueeze(0)
            
            logits = model(pixel_values=pixel_values)  # assumed (1, num_classes) — TODO confirm
            # Softmax does not change the argmax; the prediction is the top class index.
            probs = F.softmax(logits, dim=1)
            pred = int(torch.argmax(probs, dim=1).item())
            results[fname] = pred

    print("Writing results...")
    # Append the rows below the header, in the same order as `files`; any file
    # missing from `results` is written with label 0.
    with open(SUBMIT_CSV, "a", newline="") as f:
        w = csv.writer(f)
        for p in files:
            w.writerow([p.name, int(results.get(p.name, 0))])

    print("Done.")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[patch] adapter.s_attn.conv_offset.0.weight: temporal kernel 4 → 1
- Missing keys: 0
- Unexpected keys: 3
model loaded: cuda:0
NUM_FRAMES = 1
Test files: 13


Processing: 100%|██████████| 13/13 [00:03<00:00,  3.75it/s]


Writing results...
Done.
