### Extract frames from video

In [1]:
import cv2  # still used to save images out
import os
import numpy as np
from decord import VideoReader
from decord import cpu, gpu


def extract_frames(video_path, frames_dir, overwrite=False, start=-1, end=-1, every=1):
    """
    Extract frames from a video using decord's VideoReader and save them as JPEGs.

    Frames are written to ``<frames_dir>/<video filename>/<frame index, 10 digits>.jpg``.
    The selected frame indices are always ``range(start, end, every)``, regardless of
    which decoding strategy is used internally.

    :param video_path: path of the video
    :param frames_dir: the directory to save the frames
    :param overwrite: to overwrite frames that already exist?
    :param start: start frame (negative means "from the first frame")
    :param end: end frame, exclusive (negative or past the end means "to the last frame")
    :param every: frame spacing, must be >= 1
    :return: count of images saved
    :raises ValueError: if ``every`` is smaller than 1
    :raises AssertionError: if the video file does not exist
    """

    if every < 1:
        raise ValueError("every must be a positive integer, got {!r}".format(every))

    video_path = os.path.normpath(video_path)  # make the paths OS (Windows) compatible
    frames_dir = os.path.normpath(frames_dir)  # make the paths OS (Windows) compatible

    video_filename = os.path.basename(video_path)  # frames go into a sub dir named after the video

    assert os.path.exists(video_path), "video not found: {}".format(video_path)

    # load the VideoReader
    vr = VideoReader(video_path, ctx=cpu(0))  # can set to cpu or gpu .. ctx=gpu(0)

    if start < 0:  # if start isn't specified lets assume 0
        start = 0
    if end < 0 or end > len(vr):  # unspecified or out-of-range end means the end of the video
        end = len(vr)

    # cv2.imwrite does not create missing directories, so make sure the target exists
    output_dir = os.path.join(frames_dir, video_filename)
    os.makedirs(output_dir, exist_ok=True)

    # Decide up front which frames actually need writing so nothing is decoded needlessly.
    pending = []
    for index in range(start, end, every):
        save_path = os.path.join(output_dir, "{:010d}.jpg".format(index))
        if overwrite or not os.path.exists(save_path):
            pending.append((index, save_path))

    if not pending:
        return 0

    saved_count = 0

    if every > 25 and len(pending) < 1000:  # sparse sampling that fits in memory: decode in one batch
        frames = vr.get_batch([index for index, _ in pending]).asnumpy()
        for (_, save_path), frame in zip(pending, frames):
            # decord yields RGB, OpenCV expects BGR; only count frames that were really written
            if cv2.imwrite(save_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)):
                saved_count += 1
    else:  # dense sampling or many frames: decode one frame at a time to keep memory small
        for index, save_path in pending:
            frame = vr[index].asnumpy()
            if cv2.imwrite(save_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)):
                saved_count += 1

    return saved_count  # and return the count of the images we saved


def video_to_frames(video_path, frames_dir, overwrite=False, every=1):
    """
    Extracts the frames from a video into ``<frames_dir>/<video filename>/``.

    :param video_path: path to the video
    :param frames_dir: directory to save the frames
    :param overwrite: overwrite frames if they exist?
    :param every: extract every this many frames
    :return: path to the directory where the frames were saved
    """

    video_path = os.path.normpath(video_path)  # make the paths OS (Windows) compatible
    frames_dir = os.path.normpath(frames_dir)  # make the paths OS (Windows) compatible

    video_filename = os.path.basename(video_path)  # only the file name is needed here

    # make directory to save frames, its a sub dir in the frames_dir with the video name
    output_dir = os.path.join(frames_dir, video_filename)
    os.makedirs(output_dir, exist_ok=True)

    print("Extracting frames from {}".format(video_filename))

    # forward `overwrite` too; previously it was accepted but silently ignored
    extract_frames(video_path, frames_dir, overwrite=overwrite, every=every)

    return output_dir  # when done return the directory containing the frames


In [None]:
# video_path = "../notebook/input.webm"
# output_folder = "unique_frames_output"

# video_to_frames(video_path=video_path, frames_dir=output_folder, overwrite=False, every=5)

Extracting frames from input.webm


'unique_frames_output\\input.webm'

### Generate embeddings

In [3]:
import os
# Folder holding the extracted frames, and the full path of every entry inside it.
image_folder_path = "../notebook/unique_frames_output/input.webm/"
image_file_path_list = [
    os.path.join(image_folder_path, entry_name)
    for entry_name in os.listdir(image_folder_path)
]

In [1]:
import os
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hdbscan
import matplotlib.pyplot as plt
from collections import defaultdict

# 1. Load Pretrained MobileNetV2 Model
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Keep only the `.features` sub-module; the classifier head is not used here.
# NOTE(review): newer torchvision releases prefer `weights=` over `pretrained=True` — confirm
# against the installed version.
model = models.mobilenet_v2(pretrained=True).features
# Switch to inference mode and move the weights to the selected device.
model.eval().to(device)

# MobileNetV2 Output: Last Conv layer gives (1280,) after GAP
# Global average pooling collapses each feature map to a single value per channel.
gap = torch.nn.AdaptiveAvgPool2d(1)

# 2. Image Transformations
# Resize to 224x224, convert to a tensor, then normalise per channel.
# The mean/std values are presumably the ImageNet statistics the pretrained weights expect — confirm.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# 3. Batched Feature Extraction
from tqdm import tqdm

def extract_features_batch(image_paths, batch_size=32):
    """
    Compute one pooled feature vector per image, processing the images in batches.

    Each image is opened, converted to RGB, passed through the notebook-level
    `transform`, then through `model` and the `gap` pooling layer on `device`.

    Images that fail to load are reported with a printed message and skipped, so the
    returned array can have fewer rows than `image_paths`; callers that pair rows with
    paths should verify the lengths match.

    :param image_paths: sequence of image file paths
    :param batch_size: maximum number of images sent through the model at once
    :return: float array with one row per successfully loaded image; an empty
             ``(0, 0)`` array when no image could be processed
    """
    if len(image_paths) == 0:  # nothing to do; torch.cat would fail on an empty list
        return np.empty((0, 0), dtype=np.float32)

    features = []
    batch = []
    last_index = len(image_paths) - 1

    for idx, path in enumerate(tqdm(image_paths, desc="Loading Images")):
        try:
            # the context manager closes the file handle as soon as the pixels are converted
            with Image.open(path) as img:
                batch.append(transform(img.convert('RGB')))
        except Exception as e:  # deliberate best-effort: report and skip unreadable images
            print(f"Error loading {path}: {e}")

        # Flush when the batch is full or on the last image, but never flush an empty batch
        # (this happens when the trailing image(s) failed to load).
        if batch and (len(batch) == batch_size or idx == last_index):
            batch_tensor = torch.stack(batch).to(device)  # (B, 3, 224, 224)
            with torch.no_grad():
                feat_maps = model(batch_tensor)  # (B, 1280, 7, 7)
                pooled = gap(feat_maps).squeeze(-1).squeeze(-1)  # (B, 1280)
                features.append(pooled.cpu())

            # Reset batch
            batch = []

    if not features:  # every image failed to load
        return np.empty((0, 0), dtype=np.float32)

    return torch.cat(features).numpy()  # (N, 1280)

# 4. Load All Image Features
image_folder = "../notebook/unique_frames_output/input.webm/"  # Replace with your folder
# os.listdir returns entries in arbitrary order; sort so that frame order (and therefore
# cluster membership order and the chosen representatives) is reproducible across runs.
# The leading dot in each suffix avoids matching names that merely end in "jpg"/"png".
image_paths = sorted(
    os.path.join(image_folder, f)
    for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)




In [2]:
features = extract_features_batch(image_paths, batch_size=64)  # Tune batch_size for your GPU

# extract_features_batch skips images it cannot load, which would shift every following
# row relative to image_paths; fail loudly here rather than mislabel frames downstream.
assert len(features) == len(image_paths), (
    f"{len(image_paths) - len(features)} image(s) failed to load; "
    "features no longer align with image_paths"
)

Loading Images: 100%|██████████| 2640/2640 [01:22<00:00, 31.99it/s]


In [11]:
# 5. Dimensionality Reduction (Optional but helps clustering)
# PCA cannot keep more components than min(n_samples, n_features), so cap the requested
# 264 for short videos. A fixed random_state keeps the projection (and thus the clusters)
# reproducible when scikit-learn picks a randomized SVD solver.
n_components = min(264, *features.shape)
pca = PCA(n_components=n_components, random_state=42)
features_pca = pca.fit_transform(features)

# # 6a. Clustering using KMeans
# kmeans = KMeans(n_clusters=5, random_state=42)
# kmeans_labels = kmeans.fit_predict(features_pca)

# 6b. Clustering using HDBSCAN (auto-cluster count)
hdb = hdbscan.HDBSCAN(min_cluster_size=5)
hdb_labels = hdb.fit_predict(features_pca)

# 7. Visualization Function
def show_cluster_images(labels, img_paths, title="Clusters", max_images=5):
    """
    Show a row of preview images for every cluster.

    :param labels: one cluster label per image, aligned with `img_paths`
    :param img_paths: image file paths
    :param title: prefix used in each figure title
    :param max_images: maximum number of images previewed per cluster
    :return: None; prints a summary line and displays one figure per cluster
    """
    clusters = defaultdict(list)
    for label, path in zip(labels, img_paths):
        clusters[label].append(path)

    for label, paths in clusters.items():
        print(f"\nCluster {label} - {len(paths)} images")
        preview = paths[:max_images]
        # squeeze=False always returns a 2-D array of Axes; without it a single-image
        # cluster yields a bare Axes object that cannot be iterated.
        fig, axs = plt.subplots(1, len(preview), figsize=(15, 5), squeeze=False)
        for ax, img_path in zip(axs[0], preview):
            with Image.open(img_path) as img:  # close the file once the pixels are drawn
                ax.imshow(img)
            ax.axis('off')
        fig.suptitle(f"{title} - Cluster {label}", fontsize=16)
        fig.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure so many clusters do not pile up in memory

# 8. Show Results
# show_cluster_images(kmeans_labels, image_paths, "KMeans")
# show_cluster_images(hdb_labels, image_paths, "HDBSCAN")



In [18]:
# Map every cluster label (as a plain int) to the list of frame paths assigned to it.
# Keys are created first, in order of first appearance in hdb_labels, then filled.
clusering_result = {int(cluster_label): [] for cluster_label in hdb_labels}

for cluster_label, path in zip(hdb_labels, image_paths):
    clusering_result[int(cluster_label)].append(path)

In [22]:
# Display the label -> frame-path mapping built in the previous cell.
# NOTE(review): `clusering_result` is a misspelling of "clustering"; renaming it requires
# updating every cell that reads it.
clusering_result

{15: ['../notebook/unique_frames_output/input.webm/0000000000.jpg',
  '../notebook/unique_frames_output/input.webm/0000000005.jpg',
  '../notebook/unique_frames_output/input.webm/0000000010.jpg',
  '../notebook/unique_frames_output/input.webm/0000000015.jpg',
  '../notebook/unique_frames_output/input.webm/0000000020.jpg',
  '../notebook/unique_frames_output/input.webm/0000000025.jpg',
  '../notebook/unique_frames_output/input.webm/0000000030.jpg',
  '../notebook/unique_frames_output/input.webm/0000000035.jpg',
  '../notebook/unique_frames_output/input.webm/0000000040.jpg',
  '../notebook/unique_frames_output/input.webm/0000000045.jpg',
  '../notebook/unique_frames_output/input.webm/0000000050.jpg',
  '../notebook/unique_frames_output/input.webm/0000000055.jpg',
  '../notebook/unique_frames_output/input.webm/0000000060.jpg',
  '../notebook/unique_frames_output/input.webm/0000000065.jpg',
  '../notebook/unique_frames_output/input.webm/0000000070.jpg',
  '../notebook/unique_frames_output/

### CLIP embeddings

In [24]:
import clip
import torch
from PIL import Image
from tqdm import tqdm

# NOTE(review): this cell rebinds the notebook-global names `device` and `model` that the
# MobileNetV2 section defined earlier. extract_features_batch reads the global `model`, so
# re-running it after this cell would use the CLIP model instead of MobileNetV2.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model (ViT-B/32 is fast, ViT-L/14 is more accurate)
# clip.load returns the model together with its matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)


100%|███████████████████████████████████████| 338M/338M [03:17<00:00, 1.79MiB/s]


In [25]:
def get_image_embedding(image_path):
    """
    Encode a single image file with the notebook-level CLIP `model`.

    :param image_path: path of the image to embed
    :return: 1-D numpy array holding the embedding of that image
             (noted as shape (512,) for the model loaded in this notebook)
    """
    rgb_image = Image.open(image_path).convert("RGB")
    # preprocess yields one image tensor; add a leading batch axis of size 1
    single_batch = preprocess(rgb_image).unsqueeze(0).to(device)
    with torch.no_grad():
        encoded = model.encode_image(single_batch)
    # drop the batch axis again and hand back plain numpy
    return encoded.cpu().numpy()[0]


In [26]:
def get_image_embeddings_batch(image_paths, batch_size=32):
    """
    Encode many images with the notebook-level CLIP `model`, `batch_size` at a time.

    :param image_paths: sequence of image file paths
    :param batch_size: number of images encoded per forward pass
    :return: list with one 1-D numpy embedding per input path, in input order
    """
    all_embeddings = []
    batch_offsets = range(0, len(image_paths), batch_size)
    for offset in tqdm(batch_offsets, desc="Extracting CLIP embeddings"):
        chunk_tensors = []
        for chunk_path in image_paths[offset:offset + batch_size]:
            chunk_tensors.append(preprocess(Image.open(chunk_path).convert("RGB")))
        stacked = torch.stack(chunk_tensors).to(device)
        with torch.no_grad():
            chunk_embeddings = model.encode_image(stacked).cpu().numpy()
        # extend() adds the rows one by one, so the result is a flat list of vectors
        all_embeddings.extend(chunk_embeddings)
    return all_embeddings


In [27]:
def get_text_embedding(text_query):
    """
    Encode a text query with the notebook-level CLIP `model`.

    :param text_query: the query string
    :return: 1-D numpy array holding the embedding of the query
    """
    # tokenize expects a list of strings; we pass a single-element list
    tokens = clip.tokenize([text_query]).to(device)
    with torch.no_grad():
        encoded = model.encode_text(tokens)
    # only one query was encoded, so return its row
    return encoded.cpu().numpy()[0]


In [28]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_images(text_query, image_embeddings, image_paths, top_k=5):
    """
    Rank images by cosine similarity between their embeddings and a text query.

    :param text_query: the query string
    :param image_embeddings: sequence of image embeddings, aligned with `image_paths`
    :param image_paths: image file paths
    :param top_k: maximum number of results; values <= 0 yield an empty list
    :return: list of ``(image_path, similarity)`` tuples, most similar first
    """
    # Nothing requested or nothing to search: avoid the `[-0:]` slice (which would return
    # every image) and avoid calling cosine_similarity on an empty input.
    if top_k <= 0 or len(image_embeddings) == 0:
        return []

    text_emb = get_text_embedding(text_query)
    sims = cosine_similarity([text_emb], image_embeddings)[0]

    k = min(int(top_k), len(sims))
    # descending order first, then truncate; same ranking as before for top_k >= 1
    top_indices = sims.argsort()[::-1][:k]
    return [(image_paths[i], sims[i]) for i in top_indices]


In [39]:
# Pick one representative per cluster label: the last path stored for that label.
# NOTE(review): if these labels come from HDBSCAN, label -1 is its noise bucket rather than
# a real cluster, so all noise frames are reduced to a single representative — confirm
# that this is intended.
tmp_cluster_result = [cluster_paths[-1] for cluster_paths in clusering_result.values()]

In [40]:
# Number of representative frames, i.e. one per distinct cluster label.
len(tmp_cluster_result)

58

In [41]:
# NOTE(review): this rebinds the notebook-global `image_paths` (until now the full frame
# list) to the per-cluster representatives. Re-running the earlier cell that zips
# `hdb_labels` with `image_paths` after this point would pair labels with the wrong paths;
# a fresh "Restart & Run All" is required to get back to a consistent state.
image_paths = tmp_cluster_result  # Your clustered or full image list
# Embed only the representatives with CLIP; the result stays aligned with image_paths.
image_embeddings = get_image_embeddings_batch(image_paths)

Extracting CLIP embeddings: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it]


In [44]:
# Free-text query to search the representative frames with.
query = "transformer architecture"
# Uses the default top_k of find_similar_images (5 results, best match first).
results = find_similar_images(query, image_embeddings, image_paths)

# Print each hit with its cosine similarity, rounded to four decimals.
for path, score in results:
    print(f"{path} (score: {score:.4f})")

../notebook/unique_frames_output/input.webm/0000004925.jpg (score: 0.2749)
../notebook/unique_frames_output/input.webm/0000001510.jpg (score: 0.2589)
../notebook/unique_frames_output/input.webm/0000012160.jpg (score: 0.2586)
../notebook/unique_frames_output/input.webm/0000002470.jpg (score: 0.2471)
../notebook/unique_frames_output/input.webm/0000001640.jpg (score: 0.2455)
