In [1]:
!cd /content/drive/MyDrive/KeyFrameExtraction

In [2]:
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
import os
import cv2
import numpy as np
import torch
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm


# Import TransNetV2 model
!pip install git+https://github.com/soCzech/TransNetV2.git
import sys
sys.path.append('TransNetV2')
from transnetv2 import TransNetV2

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Initialize the TransNetV2 model for shot boundary detection
transnet_model = TransNetV2()

def extract_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_ids = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    for i in range(frame_count):
        ret, frame = cap.read()
        if not ret:
            break
        # Convert frame to RGB format
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
        frame_ids.append(i)

    cap.release()
    return frames, frame_ids

def detect_shots(video_path):
    video_frames, single_frame_predictions, all_frame_predictions = transnet_model.predict_video(video_path)
    single_frame_predictions, all_frame_predictions = transnet_model.predict_frames(video_frames)
    shot_boundaries = transnet_model.predictions_to_scenes(single_frame_predictions)
    return shot_boundaries

def extract_features(frames, batch_size=32):
    features = []
    for i in tqdm(range(0, len(frames), batch_size)):
        batch_frames = frames[i:i + batch_size]
        inputs = processor(images=batch_frames, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            batch_features = model.get_image_features(**inputs)
        features.append(batch_features.cpu().numpy())
        torch.cuda.empty_cache()  # Clear GPU memory
    return np.concatenate(features, axis=0)

def estimate_eps(features, k=10):
    # Use k-nearest neighbors to estimate the eps parameter
    if len(features) < k:
        k = len(features)
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors_fit = neighbors.fit(features)
    distances, indices = neighbors_fit.kneighbors(features)
    distances = np.sort(distances, axis=0)
    distances = distances[:, 1]
    eps = np.mean(distances)
    return eps

def dbscan_clustering(features, eps, min_samples=4):
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(features)
    labels = db.labels_
    return labels

def select_keyframes(frames, frame_ids, features, labels):
    keyframes = []
    keyframe_features = []
    keyframe_indices = []
    unique_labels = set(labels)

    for label in unique_labels:
        #if label == -1 and len(unique_labels)>1:
        if label == -1 :
            continue  # Skip noise points
        cluster_indices = np.where(labels == label)[0]
        if len(cluster_indices) == 0:
            continue  # Skip empty clusters
        cluster_features = features[cluster_indices]
        centroid = cluster_features.mean(axis=0)
        closest_index, _ = pairwise_distances_argmin_min([centroid], cluster_features)
        keyframe_index = cluster_indices[closest_index[0]]
        keyframes.append(frames[keyframe_index])
        keyframe_features.append(features[keyframe_index])
        keyframe_indices.append(frame_ids[keyframe_index])

    return keyframes, keyframe_features, keyframe_indices

def calculate_histogram(frame):
    hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv_frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    return hist.flatten()

def filter_redundant_keyframes(keyframes, keyframe_features, keyframe_indices, similarity_threshold=0.7):
    # Step 1: Calculate color histograms for each candidate keyframe
    histograms = [calculate_histogram(frame) for frame in keyframes]

    # Step 2: Remove uninformative frames
    informative_keyframes = []
    informative_histograms = []
    informative_indices = []
    for i, hist in enumerate(histograms):
        if np.count_nonzero(hist) >= 8:
            informative_keyframes.append(keyframes[i])
            informative_histograms.append(hist)
            informative_indices.append(keyframe_indices[i])

    # Step 3: Build similarity matrix
    num_keyframes = len(informative_keyframes)
    print("num_keyframe: ")
    print(f"{num_keyframes}")
    if num_keyframes == 0:
        return [], []

    similarity_matrix = np.zeros((num_keyframes, num_keyframes))
    for i in range(num_keyframes):
        for j in range(i + 1, num_keyframes):
            similarity_matrix[i, j] = cosine_similarity([informative_histograms[i]], [informative_histograms[j]])[0, 0]

    # Step 4: Iterative redundancy removal
    while True:
        if similarity_matrix.size == 0:
            break
        i, j = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
        if similarity_matrix[i, j] < similarity_threshold:
            break
        # Remove the j-th frame
        informative_keyframes.pop(j)
        informative_histograms.pop(j)
        informative_indices.pop(j)
        similarity_matrix = np.delete(similarity_matrix, j, axis=0)
        similarity_matrix = np.delete(similarity_matrix, j, axis=1)

    return informative_keyframes, informative_indices

def save_keyframes(keyframes, keyframe_indices, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for idx, (frame, frame_index) in enumerate(zip(keyframes, keyframe_indices)):
        # Convert frame back to BGR format before saving
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(output_dir, f'{frame_index}.jpg'), frame_bgr)

def main(video_path, output_dir):
    frames, frame_ids = extract_frames(video_path)
    print(f"Extracted {len(frames)} frames from the video.")

    shot_boundaries = detect_shots(video_path)
    print(f"Detected {len(shot_boundaries)} shots in the video.")

    all_keyframes, all_keyframe_indices = [], []
    for start, end in shot_boundaries:
        shot_frames = frames[start:end+1]
        shot_frame_ids = frame_ids[start:end+1]

        features = extract_features(shot_frames)
        print(f"Extracted features for {len(features)} frames in shot {start}-{end}.")

        if len(features) < 2:
            print(f"Not enough features to perform clustering for shot {start}-{end}.")
            continue

        eps = estimate_eps(features)
        print(f"eps estimate for shot {start}-{end} is {eps}.")
        labels = dbscan_clustering(features, eps)
        print(f"Clustering resulted in {len(set(labels))} clusters for shot {start}-{end}.")

        keyframes, keyframe_features, keyframe_indices = select_keyframes(shot_frames, shot_frame_ids, features, labels)
        print(f"Selected {len(keyframes)} keyframes before filtering for shot {start}-{end}.")

        if len(keyframes) == 0:
            print(f"No keyframes selected for shot {start}-{end}.")
            continue

        filtered_keyframes, filtered_indices = filter_redundant_keyframes(keyframes, keyframe_features, keyframe_indices)
        print(f"Selected {len(filtered_keyframes)} keyframes after filtering for shot {start}-{end}.")

        all_keyframes.extend(filtered_keyframes)
        all_keyframe_indices.extend(filtered_indices)

    save_keyframes(all_keyframes, all_keyframe_indices, output_dir)
    print(f"Keyframes saved to {output_dir}")

if __name__ == "__main__":
    video_path = '/content/drive/MyDrive/KeyFrameExtraction/iVt07TCkFM0.mp4'
    output_dir = '/content/drive/MyDrive/KeyFrameExtraction/OUTPUT_KEYFRAME_iV'
    main(video_path, output_dir)

Collecting git+https://github.com/soCzech/TransNetV2.git
  Cloning https://github.com/soCzech/TransNetV2.git to /tmp/pip-req-build-v_b1yq20
  Running command git clone --filter=blob:none --quiet https://github.com/soCzech/TransNetV2.git /tmp/pip-req-build-v_b1yq20
  Resolved https://github.com/soCzech/TransNetV2.git to commit 85cef72af9a916bdfd7cc94a670c9cdfbf12d1ed
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: transnetv2
  Building wheel for transnetv2 (setup.py) ... [?25l[?25hdone
  Created wheel for transnetv2: filename=transnetv2-1.0.0-py3-none-any.whl size=28425498 sha256=1d4e1f9c65952f233ca98a64ff7b03402ae843270b4ea63a5c230c71f0473150
  Stored in directory: /tmp/pip-ephem-wheel-cache-e_jndsak/wheels/dd/37/2a/e0c49df7aa9542b855a332693206ad03e3cedd53874c1ea388
Successfully built transnetv2
Installing collected packages: transnetv2
Successfully installed transnetv2-1.0.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

[TransNetV2] Using weights from /usr/local/lib/python3.11/dist-packages/transnetv2/transnetv2-weights/.
Extracted 2500 frames from the video.
[TransNetV2] Extracting frames from /content/drive/MyDrive/KeyFrameExtraction/iVt07TCkFM0.mp4
[TransNetV2] Processing video frames 2500/2500
[TransNetV2] Processing video frames 2500/2500
Detected 29 shots in the video.


100%|██████████| 1/1 [00:00<00:00,  1.88it/s]


Extracted features for 21 frames in shot 0-20.
eps estimate for shot 0-20 is 0.5148258663359142.
Clustering resulted in 2 clusters for shot 0-20.
Selected 1 keyframes before filtering for shot 0-20.
num_keyframe: 
0
Selected 0 keyframes after filtering for shot 0-20.


100%|██████████| 6/6 [00:01<00:00,  3.70it/s]


Extracted features for 180 frames in shot 22-201.
eps estimate for shot 22-201 is 1.2269692927598954.
Clustering resulted in 9 clusters for shot 22-201.
Selected 8 keyframes before filtering for shot 22-201.
num_keyframe: 
8
Selected 1 keyframes after filtering for shot 22-201.


100%|██████████| 1/1 [00:00<00:00,  5.39it/s]


Extracted features for 16 frames in shot 202-217.
eps estimate for shot 202-217 is 3.337998181581497.
Clustering resulted in 2 clusters for shot 202-217.
Selected 1 keyframes before filtering for shot 202-217.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 202-217.


100%|██████████| 1/1 [00:00<00:00,  6.91it/s]


Extracted features for 12 frames in shot 218-229.
eps estimate for shot 218-229 is 0.5721879204114279.
Clustering resulted in 1 clusters for shot 218-229.
Selected 0 keyframes before filtering for shot 218-229.
No keyframes selected for shot 218-229.


100%|██████████| 1/1 [00:00<00:00,  7.28it/s]


Extracted features for 12 frames in shot 230-241.
eps estimate for shot 230-241 is 1.255791428188483.
Clustering resulted in 1 clusters for shot 230-241.
Selected 0 keyframes before filtering for shot 230-241.
No keyframes selected for shot 230-241.


100%|██████████| 1/1 [00:00<00:00,  6.94it/s]


Extracted features for 12 frames in shot 242-253.
eps estimate for shot 242-253 is 1.1590481474995613.
Clustering resulted in 1 clusters for shot 242-253.
Selected 0 keyframes before filtering for shot 242-253.
No keyframes selected for shot 242-253.


100%|██████████| 1/1 [00:00<00:00,  6.29it/s]


Extracted features for 13 frames in shot 254-266.
eps estimate for shot 254-266 is 0.4848368557599875.
Clustering resulted in 2 clusters for shot 254-266.
Selected 1 keyframes before filtering for shot 254-266.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 254-266.


100%|██████████| 2/2 [00:00<00:00,  3.81it/s]


Extracted features for 45 frames in shot 267-311.
eps estimate for shot 267-311 is 1.945671632554796.
Clustering resulted in 2 clusters for shot 267-311.
Selected 1 keyframes before filtering for shot 267-311.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 267-311.


100%|██████████| 2/2 [00:00<00:00,  5.01it/s]


Extracted features for 33 frames in shot 312-344.
eps estimate for shot 312-344 is 3.438728498690056.
Clustering resulted in 2 clusters for shot 312-344.
Selected 1 keyframes before filtering for shot 312-344.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 312-344.


100%|██████████| 1/1 [00:00<00:00,  3.49it/s]


Extracted features for 24 frames in shot 345-368.
eps estimate for shot 345-368 is 2.154177412390709.
Clustering resulted in 2 clusters for shot 345-368.
Selected 1 keyframes before filtering for shot 345-368.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 345-368.


100%|██████████| 2/2 [00:00<00:00,  4.29it/s]


Extracted features for 40 frames in shot 369-408.
eps estimate for shot 369-408 is 1.649598979949951.
Clustering resulted in 3 clusters for shot 369-408.
Selected 2 keyframes before filtering for shot 369-408.
num_keyframe: 
2
Selected 1 keyframes after filtering for shot 369-408.


100%|██████████| 1/1 [00:00<00:00,  3.77it/s]


Extracted features for 24 frames in shot 409-432.
eps estimate for shot 409-432 is 3.321813642978668.
Clustering resulted in 2 clusters for shot 409-432.
Selected 1 keyframes before filtering for shot 409-432.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 409-432.


100%|██████████| 1/1 [00:00<00:00,  4.47it/s]


Extracted features for 26 frames in shot 433-458.
eps estimate for shot 433-458 is 1.2067023469851568.
Clustering resulted in 1 clusters for shot 433-458.
Selected 0 keyframes before filtering for shot 433-458.
No keyframes selected for shot 433-458.


100%|██████████| 2/2 [00:00<00:00,  4.49it/s]


Extracted features for 58 frames in shot 459-516.
eps estimate for shot 459-516 is 2.7883842546364357.
Clustering resulted in 2 clusters for shot 459-516.
Selected 1 keyframes before filtering for shot 459-516.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 459-516.


100%|██████████| 5/5 [00:01<00:00,  4.66it/s]


Extracted features for 136 frames in shot 517-652.
eps estimate for shot 517-652 is 1.7961884276831852.
Clustering resulted in 1 clusters for shot 517-652.
Selected 0 keyframes before filtering for shot 517-652.
No keyframes selected for shot 517-652.


100%|██████████| 1/1 [00:00<00:00,  7.89it/s]


Extracted features for 16 frames in shot 653-668.
eps estimate for shot 653-668 is 3.7952201887965202.
Clustering resulted in 2 clusters for shot 653-668.
Selected 1 keyframes before filtering for shot 653-668.
num_keyframe: 
0
Selected 0 keyframes after filtering for shot 653-668.


100%|██████████| 12/12 [00:03<00:00,  4.00it/s]


Extracted features for 379 frames in shot 670-1048.
eps estimate for shot 670-1048 is 1.435271276490355.
Clustering resulted in 13 clusters for shot 670-1048.
Selected 12 keyframes before filtering for shot 670-1048.
num_keyframe: 
12
Selected 1 keyframes after filtering for shot 670-1048.


100%|██████████| 3/3 [00:00<00:00,  5.19it/s]


Extracted features for 76 frames in shot 1049-1124.
eps estimate for shot 1049-1124 is 1.3470632724071805.
Clustering resulted in 3 clusters for shot 1049-1124.
Selected 2 keyframes before filtering for shot 1049-1124.
num_keyframe: 
2
Selected 1 keyframes after filtering for shot 1049-1124.


100%|██████████| 4/4 [00:00<00:00,  4.87it/s]


Extracted features for 104 frames in shot 1125-1228.
eps estimate for shot 1125-1228 is 1.1385704823411429.
Clustering resulted in 6 clusters for shot 1125-1228.
Selected 5 keyframes before filtering for shot 1125-1228.
num_keyframe: 
5
Selected 1 keyframes after filtering for shot 1125-1228.


100%|██████████| 4/4 [00:00<00:00,  4.34it/s]


Extracted features for 122 frames in shot 1229-1350.
eps estimate for shot 1229-1350 is 0.7807541269747937.
Clustering resulted in 2 clusters for shot 1229-1350.
Selected 1 keyframes before filtering for shot 1229-1350.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 1229-1350.


100%|██████████| 2/2 [00:00<00:00,  4.94it/s]


Extracted features for 52 frames in shot 1351-1402.
eps estimate for shot 1351-1402 is 1.6681065055040212.
Clustering resulted in 3 clusters for shot 1351-1402.
Selected 2 keyframes before filtering for shot 1351-1402.
num_keyframe: 
2
Selected 1 keyframes after filtering for shot 1351-1402.


100%|██████████| 2/2 [00:00<00:00,  4.86it/s]


Extracted features for 52 frames in shot 1403-1454.
eps estimate for shot 1403-1454 is 0.828313220005769.
Clustering resulted in 2 clusters for shot 1403-1454.
Selected 1 keyframes before filtering for shot 1403-1454.
num_keyframe: 
1
Selected 1 keyframes after filtering for shot 1403-1454.


100%|██████████| 2/2 [00:00<00:00,  7.58it/s]


Extracted features for 33 frames in shot 1455-1487.
eps estimate for shot 1455-1487 is 0.992407556736108.
Clustering resulted in 3 clusters for shot 1455-1487.
Selected 2 keyframes before filtering for shot 1455-1487.
num_keyframe: 
2
Selected 1 keyframes after filtering for shot 1455-1487.


100%|██████████| 5/5 [00:01<00:00,  4.63it/s]


Extracted features for 137 frames in shot 1488-1624.
eps estimate for shot 1488-1624 is 0.6158397009555441.
Clustering resulted in 1 clusters for shot 1488-1624.
Selected 0 keyframes before filtering for shot 1488-1624.
No keyframes selected for shot 1488-1624.


100%|██████████| 3/3 [00:00<00:00,  4.10it/s]


Extracted features for 82 frames in shot 1625-1706.
eps estimate for shot 1625-1706 is 1.1298701679561196.
Clustering resulted in 4 clusters for shot 1625-1706.
Selected 3 keyframes before filtering for shot 1625-1706.
num_keyframe: 
3
Selected 1 keyframes after filtering for shot 1625-1706.


100%|██████████| 3/3 [00:00<00:00,  3.07it/s]


Extracted features for 88 frames in shot 1707-1794.
eps estimate for shot 1707-1794 is 1.2321878546340899.
Clustering resulted in 4 clusters for shot 1707-1794.
Selected 3 keyframes before filtering for shot 1707-1794.
num_keyframe: 
3
Selected 1 keyframes after filtering for shot 1707-1794.


100%|██████████| 3/3 [00:00<00:00,  3.04it/s]


Extracted features for 86 frames in shot 1795-1880.
eps estimate for shot 1795-1880 is 1.395015717938889.
Clustering resulted in 3 clusters for shot 1795-1880.
Selected 2 keyframes before filtering for shot 1795-1880.
num_keyframe: 
2
Selected 1 keyframes after filtering for shot 1795-1880.


100%|██████████| 19/19 [00:04<00:00,  3.82it/s]


Extracted features for 579 frames in shot 1881-2459.
eps estimate for shot 1881-2459 is 1.0220149340430362.
Clustering resulted in 8 clusters for shot 1881-2459.
Selected 7 keyframes before filtering for shot 1881-2459.
num_keyframe: 
7
Selected 2 keyframes after filtering for shot 1881-2459.


100%|██████████| 2/2 [00:00<00:00,  2.25it/s]


Extracted features for 39 frames in shot 2461-2499.
eps estimate for shot 2461-2499 is 0.5907039775415199.
Clustering resulted in 2 clusters for shot 2461-2499.
Selected 1 keyframes before filtering for shot 2461-2499.
num_keyframe: 
0
Selected 0 keyframes after filtering for shot 2461-2499.
Keyframes saved to /content/drive/MyDrive/KeyFrameExtraction/OUTPUT_KEYFRAME_iV
