#### Содержание блокнота
Этот Jupyter-блокнот предназначен для автоматической раскадровки видеофайлов на отдельные изображения. Процесс включает загрузку видео по списку ссылок (yt-dlp), извлечение кадров с заданным шагом (ffmpeg), удаление похожих кадров и упаковку результата в архив.

#### Установка зависимостей

In [None]:
# Python packages used by the cells below: yt-dlp downloads the videos;
# numpy, keras and scikit-learn are imported by the duplicate-detection code.
# NOTE(review): pillow and tensorflow are not imported directly in this
# notebook; presumably keras needs them for image loading and as its
# backend - confirm.
!pip install yt-dlp numpy keras pillow scikit-learn tensorflow
# ffmpeg is the command-line tool that cut_video invokes to extract frames.
!apt install ffmpeg

#### Параметры

In [None]:
# Download settings: the text file whose lines are passed to the downloader
# as URLs, and the folder the downloaded videos are written to.
URL_CONFIG: str = "urls.txt"
OUTPUT_VIDEOS: str = "output_videos"

# Frame-extraction settings: one frame is kept every STEP_SECONDS seconds and
# the frames of each video are stored in a sub-folder of OUTPUT_DATASET.
STEP_SECONDS: int = 2
OUTPUT_DATASET: str = "dataset"

#### Код для нахождения и удаления похожих изображений

In [2]:
import numpy as np
import os

from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16, preprocess_input


def load_and_preprocess_image(img_path, target_size=(224, 224)):
    """Load one image file and prepare it as a single-item batch for VGG16.

    Args:
        img_path: Path of the image file to read.
        target_size: Size passed to ``image.load_img`` when loading the
            image; defaults to ``(224, 224)``.

    Returns:
        The image as an array with a leading batch axis of length 1, after
        the VGG16 ``preprocess_input`` transformation has been applied.
    """
    loaded = image.load_img(img_path, target_size=target_size)
    pixels = image.img_to_array(loaded)
    # The model consumes batches, so wrap the single image in a batch axis.
    batch = pixels[np.newaxis, ...]
    return preprocess_input(batch)


def extract_features(model, img_path):
    """Return the model's prediction for one image as a flat 1-D vector.

    Args:
        model: Object exposing ``predict(batch)``; ``find_duplicates`` passes
            a VGG16 instance.
        img_path: Path of the image file to describe.

    Returns:
        The flattened output of ``model.predict`` for the preprocessed image.
    """
    batch = load_and_preprocess_image(img_path)
    return model.predict(batch).flatten()


def find_duplicates(directory, threshold=0.9):
    """Delete near-duplicate images in ``directory``, keeping the first one.

    Every image file directly inside ``directory`` is described by a VGG16
    feature vector (ImageNet weights, no classification head, average
    pooling). Images are visited in file-name order; each surviving image is
    compared with every later surviving image, and the later one is removed
    from disk when their cosine similarity is strictly greater than
    ``threshold``.

    Args:
        directory: Folder whose image files (``.png``, ``.jpg``, ``.jpeg``,
            ``.bmp``, ``.gif``; case-insensitive) are examined. Sub-folders
            are not traversed.
        threshold: Cosine-similarity value above which two images are
            treated as duplicates.

    Returns:
        None. Deleted files are reported with ``print``.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be
            removed.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # os.listdir() yields entries in arbitrary order; sorting makes the choice
    # of which duplicate survives reproducible (the first by file name).
    image_paths = [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if filename.lower().endswith(image_extensions)
    ]

    # With fewer than two images there is nothing to compare, so avoid the
    # cost of loading the VGG16 weights and running the network at all.
    if len(image_paths) < 2:
        return

    model = VGG16(weights='imagenet', include_top=False, pooling='avg')
    image_features = [extract_features(model, path) for path in image_paths]

    # deleted[k] marks files already removed; they are neither used as a
    # reference image nor compared again.
    deleted = [False] * len(image_paths)
    for i, reference_path in enumerate(image_paths):
        if deleted[i]:
            continue

        for j in range(i + 1, len(image_paths)):
            if deleted[j]:
                continue

            similarity = cosine_similarity(
                [image_features[i]], [image_features[j]])[0][0]
            if similarity > threshold:
                print(
                    f"Duplicate found: {reference_path} and {image_paths[j]} (similarity: {similarity:.2f})")
                os.remove(image_paths[j])
                print(f"Deleted: {image_paths[j]}")
                deleted[j] = True


#### Раскадровка видео

In [3]:
import os

def cut_video(VIDEO_PATH: str, STEP_SECONDS: int, OUTPUT_PATH: str):
    """Extract one JPEG frame every ``STEP_SECONDS`` seconds using ffmpeg.

    Frames are written to ``OUTPUT_PATH`` as ``<video name>_001.jpg``,
    ``<video name>_002.jpg`` and so on, where ``<video name>`` is the video's
    file name without its extension.

    Args:
        VIDEO_PATH: Path of the video file to read.
        STEP_SECONDS: Interval in seconds between two extracted frames
            (passed to ffmpeg as ``fps=1/STEP_SECONDS``).
        OUTPUT_PATH: Folder that receives the frames; it is created, together
            with any missing parent folders, when it does not exist.

    Returns:
        None. A message is printed when ffmpeg is missing or exits with a
        non-zero status; no exception is raised for either case.
    """
    import subprocess

    os.makedirs(OUTPUT_PATH, exist_ok=True)

    video_name: str = os.path.splitext(os.path.basename(VIDEO_PATH))[0]
    # ffmpeg expands "%..." sequences in the output name, so a literal "%"
    # coming from the video title or folder has to be doubled.
    frame_pattern: str = (
        os.path.join(OUTPUT_PATH, video_name).replace('%', '%%') + '_%03d.jpg'
    )

    # An argument list (no shell) keeps quotes, "$", backticks, etc. in video
    # titles from being interpreted as shell syntax.
    command = [
        'ffmpeg',
        '-i', VIDEO_PATH,
        '-vf', f'fps=1/{STEP_SECONDS}',
        '-qscale:v', '4',
        frame_pattern,
    ]
    try:
        result = subprocess.run(command, check=False)
    except FileNotFoundError:
        # Mirror the previous os.system behaviour: report and carry on.
        print('ffmpeg executable not found; frames were not extracted.')
        return

    if result.returncode != 0:
        print(f'ffmpeg exited with code {result.returncode} for "{VIDEO_PATH}"')

#### Загрузка видео

In [4]:
def download_video(urls: list, output_path: str) -> None:
    """Download the given URLs into ``output_path`` with yt-dlp.

    Args:
        urls: Strings containing URLs, e.g. the lines of a text file.
            Surrounding whitespace and line breaks are ignored, blank entries
            are skipped, and an entry holding several whitespace-separated
            URLs contributes each of them.
        output_path: Folder passed to yt-dlp via ``-P``; it is created,
            together with any missing parent folders, when it does not exist.

    Returns:
        None. A message is printed when there is nothing to download, when
        yt-dlp is missing, or when it exits with a non-zero status.
    """
    import subprocess

    os.makedirs(output_path, exist_ok=True)

    # str.split() drops newlines/blank lines and splits on whitespace the same
    # way the shell previously did when the URLs were joined into one command.
    cleaned_urls = [url for entry in urls for url in entry.split()]
    if not cleaned_urls:
        print('No URLs to download.')
        return

    # An argument list (no shell) keeps "&", "?", quotes, etc. inside URLs
    # from being interpreted as shell syntax.
    command = ['yt-dlp', *cleaned_urls, '-P', output_path]
    try:
        result = subprocess.run(command, check=False)
    except FileNotFoundError:
        # Mirror the previous os.system behaviour: report and carry on.
        print('yt-dlp executable not found; nothing was downloaded.')
        return

    if result.returncode != 0:
        print(f'yt-dlp exited with code {result.returncode}')

In [None]:
# Pass every line of the URL list file to the downloader.
with open(URL_CONFIG, mode='r', encoding='utf-8') as url_list:
    url_lines = url_list.readlines()
    download_video(url_lines, OUTPUT_VIDEOS)
    
# URLS: list[str] = [
#     'https://youtu.be/jOMJHcPfM_c?si=aD-aqEx6SLX-pG9_'
# ]


# download_video(URLS, OUTPUT_VIDEOS)


#### Список с путями видео

In [12]:
# Path of every entry found in the download folder.
videos: list[str] = [
    os.path.join(OUTPUT_VIDEOS, entry) for entry in os.listdir(OUTPUT_VIDEOS)
]

#### Раскадровка видео и удаление похожих кадров

In [None]:
# The dataset root must exist before the per-video folders are created in it.
if not os.path.exists(OUTPUT_DATASET):
    os.mkdir(OUTPUT_DATASET)

for video in videos:
    # Entries whose name ends in "part" are skipped
    # (presumably unfinished downloads - confirm).
    if not video.lower().endswith('part'):
        VIDEO_PATH: str = video
        # Each video gets its own folder named after the file without extension.
        OUTPUT_PATH: str = os.path.join(
            OUTPUT_DATASET,
            os.path.splitext(os.path.basename(video))[0],
        )

        cut_video(VIDEO_PATH, STEP_SECONDS, OUTPUT_PATH)
        find_duplicates(OUTPUT_PATH, threshold=0.97)

#### Добавляем в архив

In [None]:
# Pack the whole dataset folder recursively into dataset2.zip.
archive_command = f'zip -r dataset2.zip {OUTPUT_DATASET}'
os.system(archive_command)