In [1]:
# !pip install open_clip_torch -q

In [2]:
import os
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import open_clip
import glob
from collections import defaultdict
import shutil
from cfg import KEYFRAME_DIR, FEATURE_DIR, ZIP_PATH

# --- Configuration ---
base_dir = KEYFRAME_DIR # contains batch of videos frames: V1/V1_0001, V1/V1_0002, ...
output_dir = FEATURE_DIR # Save 1 .npy per image
zip_path = ZIP_PATH # one single .zip file

os.makedirs(output_dir, exist_ok=True)

# --- Load OpenCLIP ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='ViT-H-14-quickgelu',
    pretrained='dfn5b',
    device=device
)
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-31): 32 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=Tru

In [3]:
videos = glob.glob(os.path.join(base_dir,'*','*','*.jpg'))
if not videos:
    print("[ERROR] No videos found.")

print(f"[INFO] Found {len(videos)} frames to process.")

videos = sorted(videos)

[INFO] Found 17961 frames to process.


In [4]:
# check output dir and remove same name
existed = glob.glob(os.path.join(output_dir,'*','*','*.npy'))


keyframes_map = {os.path.splitext(os.path.basename(f))[0]: f for f in videos}

# Basenames from second list
features_set = {os.path.splitext(os.path.basename(f))[0] for f in existed}

filtered_first = [path for base, path in keyframes_map.items() if base not in features_set]

videos = sorted(filtered_first)

print(f"[INFO] Remaining {len(videos)} frames to process (removed existed features).")

[INFO] Remaining 15489 frames to process (removed existed features).


In [5]:
batch_size=4

folder_to_images = defaultdict(list)
for img_path in videos:
    folder_to_images[os.path.dirname(img_path)].append(img_path)

ii = 0
for folder_path, image_files in folder_to_images.items():
    print(f"{ii+1}/{len(folder_to_images)}")
    # Create matching output subfolder
    rel_path = os.path.relpath(folder_path, base_dir)
    save_folder = os.path.join(output_dir, rel_path)
    os.makedirs(save_folder, exist_ok=True)

    # Sort image paths for consistent processing
    image_files = sorted(image_files)
    # Process in batches
    for i in tqdm(range(0, len(image_files), batch_size), desc=f"Processing {rel_path}"):
        batch_files = image_files[i:i + batch_size]
        batch_images = []
        valid_files = []

        for img_path in batch_files:
            try:
                image = Image.open(img_path).convert('RGB')
                image_tensor = preprocess(image)
                batch_images.append(image_tensor)
                valid_files.append(img_path)
            except Exception as e:
                print(f"Error loading {img_path}: {e}")

        if not batch_images:
            continue

        # Encode features
        input_tensor = torch.stack(batch_images).to(device)
        with torch.no_grad():
            features = model.encode_image(input_tensor)
            features = features / features.norm(dim=-1, keepdim=True)

        # Save each feature with same relative path
        for img_path, feat in zip(valid_files, features):
            rel_img_path = os.path.relpath(img_path, base_dir)
            save_path = os.path.join(output_dir, os.path.splitext(rel_img_path)[0] + '.npy')
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            np.save(save_path, feat.cpu().numpy())
        
    ii += 1

1/130


Processing L21/L21_V010: 100%|██████████| 57/57 [04:54<00:00,  5.17s/it]


2/130


Processing L21/L21_V011: 100%|██████████| 71/71 [05:50<00:00,  4.93s/it]


3/130


Processing L21/L21_V012: 100%|██████████| 64/64 [05:19<00:00,  4.99s/it]


4/130


Processing L21/L21_V013: 100%|██████████| 82/82 [06:40<00:00,  4.89s/it]


5/130


Processing L21/L21_V014: 100%|██████████| 80/80 [06:23<00:00,  4.80s/it]


6/130


Processing L21/L21_V015: 100%|██████████| 99/99 [07:57<00:00,  4.82s/it]


7/130


Processing L21/L21_V016: 100%|██████████| 76/76 [07:11<00:00,  5.68s/it]


8/130


Processing L21/L21_V017: 100%|██████████| 57/57 [05:20<00:00,  5.62s/it]


9/130


Processing L21/L21_V018: 100%|██████████| 89/89 [09:52<00:00,  6.66s/it]


10/130


Processing L21/L21_V019: 100%|██████████| 64/64 [05:45<00:00,  5.40s/it]


11/130


Processing L21/L21_V021: 100%|██████████| 64/64 [05:44<00:00,  5.39s/it]


12/130


Processing L21/L21_V022: 100%|██████████| 51/51 [04:09<00:00,  4.90s/it]


13/130


Processing L21/L21_V023: 100%|██████████| 78/78 [06:07<00:00,  4.71s/it]


14/130


Processing L21/L21_V024: 100%|██████████| 83/83 [06:17<00:00,  4.55s/it]


15/130


Processing L21/L21_V025: 100%|██████████| 78/78 [05:53<00:00,  4.53s/it]


16/130


Processing L21/L21_V026: 100%|██████████| 91/91 [06:54<00:00,  4.55s/it]


17/130


Processing L21/L21_V027: 100%|██████████| 83/83 [06:15<00:00,  4.53s/it]


18/130


Processing L21/L21_V028: 100%|██████████| 65/65 [04:55<00:00,  4.54s/it]


19/130


Processing L21/L21_V029: 100%|██████████| 75/75 [05:41<00:00,  4.55s/it]


20/130


Processing L21/L21_V030: 100%|██████████| 72/72 [05:27<00:00,  4.54s/it]


21/130


Processing L21/L21_V031: 100%|██████████| 81/81 [06:09<00:00,  4.57s/it]


22/130


Processing L23/L23_V001: 100%|██████████| 16/16 [01:08<00:00,  4.30s/it]


23/130


Processing L23/L23_V002: 100%|██████████| 12/12 [00:54<00:00,  4.53s/it]


24/130


Processing L23/L23_V003: 100%|██████████| 14/14 [01:03<00:00,  4.56s/it]


25/130


Processing L23/L23_V004: 100%|██████████| 11/11 [00:49<00:00,  4.54s/it]


26/130


Processing L23/L23_V005: 100%|██████████| 7/7 [00:28<00:00,  4.09s/it]


27/130


Processing L23/L23_V006: 100%|██████████| 13/13 [00:58<00:00,  4.50s/it]


28/130


Processing L23/L23_V007: 100%|██████████| 12/12 [00:52<00:00,  4.41s/it]


29/130


Processing L23/L23_V008: 100%|██████████| 21/21 [01:32<00:00,  4.40s/it]


30/130


Processing L23/L23_V009: 100%|██████████| 8/8 [00:36<00:00,  4.59s/it]


31/130


Processing L23/L23_V010: 100%|██████████| 15/15 [01:10<00:00,  4.70s/it]


32/130


Processing L23/L23_V011: 100%|██████████| 21/21 [01:41<00:00,  4.82s/it]


33/130


Processing L23/L23_V012: 100%|██████████| 9/9 [00:37<00:00,  4.14s/it]


34/130


Processing L23/L23_V013: 100%|██████████| 18/18 [01:26<00:00,  4.81s/it]


35/130


Processing L23/L23_V014: 100%|██████████| 11/11 [00:53<00:00,  4.90s/it]


36/130


Processing L23/L23_V015: 100%|██████████| 7/7 [00:32<00:00,  4.66s/it]


37/130


Processing L23/L23_V016: 100%|██████████| 7/7 [00:32<00:00,  4.65s/it]


38/130


Processing L23/L23_V017: 100%|██████████| 11/11 [00:48<00:00,  4.41s/it]


39/130


Processing L23/L23_V018: 100%|██████████| 10/10 [00:48<00:00,  4.81s/it]


40/130


Processing L23/L23_V019: 100%|██████████| 11/11 [00:51<00:00,  4.70s/it]


41/130


Processing L23/L23_V020: 100%|██████████| 15/15 [01:10<00:00,  4.72s/it]


42/130


Processing L23/L23_V021: 100%|██████████| 20/20 [01:33<00:00,  4.67s/it]


43/130


Processing L23/L23_V022: 100%|██████████| 8/8 [00:35<00:00,  4.38s/it]


44/130


Processing L23/L23_V023: 100%|██████████| 14/14 [01:03<00:00,  4.52s/it]


45/130


Processing L23/L23_V024: 100%|██████████| 9/9 [00:37<00:00,  4.21s/it]


46/130


Processing L23/L23_V025: 100%|██████████| 34/34 [02:35<00:00,  4.56s/it]


47/130


Processing L24/L24_V002: 100%|██████████| 22/22 [01:38<00:00,  4.46s/it]


48/130


Processing L24/L24_V003: 100%|██████████| 24/24 [01:45<00:00,  4.40s/it]


49/130


Processing L24/L24_V004: 100%|██████████| 27/27 [02:02<00:00,  4.53s/it]


50/130


Processing L24/L24_V006: 100%|██████████| 12/12 [00:50<00:00,  4.24s/it]


51/130


Processing L24/L24_V007: 100%|██████████| 13/13 [00:55<00:00,  4.25s/it]


52/130


Processing L24/L24_V008: 100%|██████████| 13/13 [00:59<00:00,  4.58s/it]


53/130


Processing L24/L24_V009: 100%|██████████| 14/14 [00:59<00:00,  4.28s/it]


54/130


Processing L24/L24_V011: 100%|██████████| 19/19 [01:23<00:00,  4.37s/it]


55/130


Processing L24/L24_V012: 100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


56/130


Processing L24/L24_V013: 100%|██████████| 13/13 [00:57<00:00,  4.46s/it]


57/130


Processing L24/L24_V014: 100%|██████████| 14/14 [01:00<00:00,  4.29s/it]


58/130


Processing L24/L24_V015: 100%|██████████| 19/19 [01:22<00:00,  4.36s/it]


59/130


Processing L24/L24_V016: 100%|██████████| 13/13 [00:59<00:00,  4.59s/it]


60/130


Processing L24/L24_V017: 100%|██████████| 10/10 [00:44<00:00,  4.43s/it]


61/130


Processing L24/L24_V018: 100%|██████████| 18/18 [01:22<00:00,  4.58s/it]


62/130


Processing L24/L24_V019: 100%|██████████| 13/13 [00:55<00:00,  4.25s/it]


63/130


Processing L24/L24_V020: 100%|██████████| 13/13 [00:56<00:00,  4.37s/it]


64/130


Processing L24/L24_V021: 100%|██████████| 14/14 [01:02<00:00,  4.47s/it]


65/130


Processing L24/L24_V022: 100%|██████████| 14/14 [01:04<00:00,  4.59s/it]


66/130


Processing L24/L24_V023: 100%|██████████| 15/15 [01:05<00:00,  4.39s/it]


67/130


Processing L24/L24_V024: 100%|██████████| 14/14 [01:02<00:00,  4.47s/it]


68/130


Processing L24/L24_V025: 100%|██████████| 13/13 [00:59<00:00,  4.59s/it]


69/130


Processing L24/L24_V026: 100%|██████████| 16/16 [01:13<00:00,  4.62s/it]


70/130


Processing L24/L24_V027: 100%|██████████| 21/21 [01:32<00:00,  4.38s/it]


71/130


Processing L24/L24_V028: 100%|██████████| 24/24 [01:45<00:00,  4.40s/it]


72/130


Processing L24/L24_V029: 100%|██████████| 23/23 [01:45<00:00,  4.59s/it]


73/130


Processing L24/L24_V030: 100%|██████████| 25/25 [01:54<00:00,  4.58s/it]


74/130


Processing L24/L24_V031: 100%|██████████| 23/23 [01:43<00:00,  4.51s/it]


75/130


Processing L24/L24_V033: 100%|██████████| 19/19 [01:22<00:00,  4.37s/it]


76/130


Processing L24/L24_V035: 100%|██████████| 17/17 [01:18<00:00,  4.60s/it]


77/130


Processing L24/L24_V036: 100%|██████████| 9/9 [00:37<00:00,  4.11s/it]


78/130


Processing L24/L24_V037: 100%|██████████| 17/17 [01:17<00:00,  4.58s/it]


79/130


Processing L24/L24_V039: 100%|██████████| 17/17 [01:17<00:00,  4.58s/it]


80/130


Processing L24/L24_V041: 100%|██████████| 3/3 [00:12<00:00,  4.05s/it]


81/130


Processing L24/L24_V043: 100%|██████████| 21/21 [01:36<00:00,  4.59s/it]


82/130


Processing L24/L24_V044: 100%|██████████| 4/4 [00:18<00:00,  4.58s/it]


83/130


Processing L24/L24_V045: 100%|██████████| 4/4 [00:16<00:00,  4.19s/it]


84/130


Processing L25/L25_V001: 100%|██████████| 24/24 [01:47<00:00,  4.46s/it]


85/130


Processing L25/L25_V002: 100%|██████████| 26/26 [01:56<00:00,  4.48s/it]


86/130


Processing L25/L25_V003: 100%|██████████| 22/22 [01:40<00:00,  4.56s/it]


87/130


Processing L25/L25_V004: 100%|██████████| 27/27 [02:01<00:00,  4.50s/it]


88/130


Processing L25/L25_V005: 100%|██████████| 139/139 [10:33<00:00,  4.56s/it]


89/130


Processing L25/L25_V006: 100%|██████████| 22/22 [01:43<00:00,  4.70s/it]


90/130


Processing L25/L25_V007: 100%|██████████| 28/28 [02:10<00:00,  4.64s/it]


91/130


Processing L25/L25_V008: 100%|██████████| 26/26 [01:59<00:00,  4.59s/it]


92/130


Processing L25/L25_V009: 100%|██████████| 22/22 [01:40<00:00,  4.57s/it]


93/130


Processing L25/L25_V010: 100%|██████████| 27/27 [02:06<00:00,  4.68s/it]


94/130


Processing L25/L25_V011: 100%|██████████| 25/25 [02:05<00:00,  5.00s/it]


95/130


Processing L25/L25_V012: 100%|██████████| 22/22 [01:43<00:00,  4.70s/it]


96/130


Processing L25/L25_V013: 100%|██████████| 38/38 [02:55<00:00,  4.63s/it]


97/130


Processing L25/L25_V014: 100%|██████████| 43/43 [03:22<00:00,  4.71s/it]


98/130


Processing L25/L25_V015: 100%|██████████| 22/22 [01:41<00:00,  4.61s/it]


99/130


Processing L25/L25_V016: 100%|██████████| 24/24 [01:51<00:00,  4.64s/it]


100/130


Processing L25/L25_V017: 100%|██████████| 25/25 [01:56<00:00,  4.65s/it]


101/130


Processing L25/L25_V018: 100%|██████████| 27/27 [02:04<00:00,  4.61s/it]


102/130


Processing L25/L25_V019: 100%|██████████| 24/24 [01:48<00:00,  4.52s/it]


103/130


Processing L25/L25_V020: 100%|██████████| 23/23 [01:42<00:00,  4.45s/it]


104/130


Processing L25/L25_V021: 100%|██████████| 42/42 [03:10<00:00,  4.54s/it]


105/130


Processing L25/L25_V022: 100%|██████████| 61/61 [04:36<00:00,  4.53s/it]


106/130


Processing L25/L25_V023: 100%|██████████| 22/22 [01:38<00:00,  4.49s/it]


107/130


Processing L25/L25_V024: 100%|██████████| 27/27 [02:00<00:00,  4.45s/it]


108/130


Processing L25/L25_V025: 100%|██████████| 30/30 [02:15<00:00,  4.50s/it]


109/130


Processing L25/L25_V026: 100%|██████████| 23/23 [01:41<00:00,  4.43s/it]


110/130


Processing L25/L25_V027: 100%|██████████| 23/23 [01:44<00:00,  4.55s/it]


111/130


Processing L25/L25_V028: 100%|██████████| 26/26 [01:56<00:00,  4.47s/it]


112/130


Processing L25/L25_V029: 100%|██████████| 22/22 [01:44<00:00,  4.74s/it]


113/130


Processing L25/L25_V030: 100%|██████████| 27/27 [02:02<00:00,  4.54s/it]


114/130


Processing L25/L25_V031: 100%|██████████| 63/63 [04:54<00:00,  4.67s/it]


115/130


Processing L25/L25_V032: 100%|██████████| 23/23 [01:43<00:00,  4.51s/it]


116/130


Processing L25/L25_V033: 100%|██████████| 27/27 [02:02<00:00,  4.53s/it]


117/130


Processing L25/L25_V034: 100%|██████████| 45/45 [03:25<00:00,  4.57s/it]


118/130


Processing L25/L25_V035: 100%|██████████| 22/22 [01:37<00:00,  4.43s/it]


119/130


Processing L25/L25_V036: 100%|██████████| 35/35 [02:35<00:00,  4.45s/it]


120/130


Processing L25/L25_V037: 100%|██████████| 28/28 [02:03<00:00,  4.41s/it]


121/130


Processing L25/L25_V038: 100%|██████████| 23/23 [01:40<00:00,  4.38s/it]


122/130


Processing L25/L25_V039: 100%|██████████| 35/35 [02:39<00:00,  4.57s/it]


123/130


Processing L25/L25_V040: 100%|██████████| 35/35 [02:36<00:00,  4.48s/it]


124/130


Processing L25/L25_V041: 100%|██████████| 22/22 [01:40<00:00,  4.56s/it]


125/130


Processing L25/L25_V042: 100%|██████████| 28/28 [02:03<00:00,  4.41s/it]


126/130


Processing L25/L25_V043: 100%|██████████| 27/27 [01:58<00:00,  4.40s/it]


127/130


Processing L25/L25_V044: 100%|██████████| 23/23 [01:48<00:00,  4.71s/it]


128/130


Processing L25/L25_V045: 100%|██████████| 27/27 [02:11<00:00,  4.87s/it]


129/130


Processing L25/L25_V046: 100%|██████████| 30/30 [02:29<00:00,  4.98s/it]


130/130


Processing L25/L25_V047: 100%|██████████| 22/22 [01:48<00:00,  4.95s/it]


In [6]:
shutil.make_archive(zip_path, 'zip', output_dir)
print(f"Zipped features saved to {zip_path}.zip")

Zipped features saved to /mnt/d/AI Challenge/Data/features/features.zip.zip
