In [None]:
ACTS = [
    {"name": "Teaser",             "start": 0.00,   "end": 0.06,   "ratio": 0.05},
    {"name": "Act 1 / Завязка",    "start": 0.06,   "end": 0.25,   "ratio": 0.07},
    {"name": "Act 2 / Развитие",   "start": 0.25,   "end": 0.50,   "ratio": 0.23},
    {"name": "Act 3 / Подъём",     "start": 0.50,   "end": 0.75,   "ratio": 0.20},
    {"name": "Climax / Кульминация","start": 0.75,  "end": 0.875,  "ratio": 0.20},
    {"name": "Tag / Развязка",     "start": 0.875,  "end": 1.00,   "ratio": 0.25},
]

In [19]:
TRIM_START = 0
TRIM_END = 45

In [21]:
import os
import cv2
import librosa
import numpy as np
import subprocess
import matplotlib.pyplot as plt
import subprocess
import json

VIDEO_PATH = "episode.mkv"
OUTPUT_RECAP = "recap_action.mp4"
CLIP_LENGTH = 6
TOP_K = 40
FRAME_STEP = 5
AUDIO_SAMPLE_RATE = 22050
PLOT_PATH = "activity_plot.png"

def get_duration(video_path):
    result = subprocess.run([
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json", video_path
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    return int(float(json.loads(result.stdout)["format"]["duration"]))

def extract_audio_activity(video_path, sr=AUDIO_SAMPLE_RATE):
    y, _ = librosa.load(video_path, sr=sr)
    rms = librosa.feature.rms(y=y)[0]
    onset = librosa.onset.onset_strength(y=y, sr=sr)
    flatness = librosa.feature.spectral_flatness(y=y)[0]

    # выровнять длину
    min_len = min(len(rms), len(onset), len(flatness))
    rms = normalize(rms[:min_len])
    onset = normalize(onset[:min_len])
    flatness = normalize(flatness[:min_len])

    # смешиваем: RMS (общая громкость) + Onset (всплески) + Flatness (шум)
    return 0.5 * rms + 0.35 * onset + 0.15 * flatness

def extract_visual_diffs(video_path, every_n_frames=FRAME_STEP):
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    diffs = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % every_n_frames == 0:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if prev_gray is not None:
                diff = cv2.absdiff(prev_gray, gray)
                score = diff.mean()
                diffs.append(score)
            prev_gray = gray
        frame_count += 1

    cap.release()
    return diffs

def normalize(x):
    x = np.array(x)
    return (x - x.min()) / (x.max() - x.min() + 1e-6)

def combine_activity(audio_rms, visual_diffs):
    min_len = min(len(audio_rms), len(visual_diffs))
    a = normalize(audio_rms[:min_len])
    v = normalize(visual_diffs[:min_len])
    return 0.75 * a + 0.25 * v

def build_activity_per_second(activity, fps, duration):
    per_second = np.zeros(int(duration))
    for i, val in enumerate(activity):
        sec = int(i / fps)
        if sec < len(per_second):
            per_second[sec] += val
    return per_second

def find_top_intervals_by_acts(activity, duration, clip_len=10, top_k=10, acts=None):
    seconds = build_activity_per_second(activity, fps=1, duration=duration)
    selected = []
    used = set()

    trim_start = TRIM_START
    trim_end = TRIM_END
    duration_eff = duration - trim_end

    for act in acts:
        s_start = int(duration * act["start"])
        s_end = int(duration * act["end"])

        s_start = max(s_start, trim_start)
        s_end = min(s_end, duration_eff)

        act_top_k = max(1, int(top_k * act["ratio"]))
        act_range = range(s_start, s_end)
        sorted_indices = sorted(act_range, key=lambda i: seconds[i], reverse=True)

        count = 0
        for sec in sorted_indices:
            if count >= act_top_k:
                break
            if any(abs(sec - s) < clip_len for s in used):
                continue
            used.add(sec)
            selected.append((max(0, sec - clip_len // 2), min(duration, sec + clip_len // 2)))
            count += 1

    return sorted(selected)

def extract_visual_diffs(video_path, every_n_frames=10):
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    diffs = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % every_n_frames == 0:
            frame = cv2.resize(frame, (320, 180))
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            if prev_gray is not None:
                flow = cv2.calcOpticalFlowFarneback(
                    prev_gray, gray, None,
                    0.5, 3, 15, 3, 5, 1.2, 0
                )
                mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                motion_score = np.mean(mag)
                diffs.append(motion_score)

            prev_gray = gray

        frame_count += 1

    cap.release()
    return diffs

def concat_clips_ffmpeg(clip_paths, output_path):
    list_path = "concat_activity.txt"
    with open(list_path, "w") as f:
        for p in clip_paths:
            f.write(f"file '{os.path.abspath(p)}'\n")
    cmd = [
        "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_path,
        "-c", "copy", "-y", output_path
    ]
    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def extract_clips_ffmpeg(video_path, intervals, output_dir="clips_activity"):
    os.makedirs(output_dir, exist_ok=True)
    paths = []

    for idx, (start, end) in enumerate(intervals):
        out_path = os.path.join(output_dir, f"clip_{idx:02}.mp4")
        cmd = [
            "ffmpeg", "-ss", str(start), "-i", video_path,
            "-t", str(end - start),
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "aac", "-b:a", "128k", "-y", out_path
        ]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        paths.append(out_path)

    return paths

def plot_activity_graph(activity, save_path):
    plt.figure(figsize=(12, 4))
    plt.plot(activity, label="Combined Audio+Visual Activity")
    plt.xlabel("Time Frame Index")
    plt.ylabel("Normalized Intensity")
    plt.title("Activity Over Time")
    plt.legend()
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def detect_music(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    flatness = librosa.feature.spectral_flatness(y=y)[0]
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    return flatness.mean() > 0.15 and tempo > 60 and zcr.mean() > 0.05

def detect_speech(y, sr):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    return np.std(delta) < 20

def detect_static_visuals(video_path, duration=420, step=2, threshold=3.0):
    cap = cv2.VideoCapture(video_path)
    prev_frame = None
    static_score = []

    for i in range(0, duration, step):
        cap.set(cv2.CAP_PROP_POS_MSEC, i * 1000)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (160, 90))
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_frame is not None:
            diff = np.abs(gray.astype(np.float32) - prev_frame.astype(np.float32)).mean()
            static_score.append(diff)
        prev_frame = gray

    cap.release()
    return np.mean(static_score) < threshold

def main():
    print("Чтение длительности видео...")
    duration = get_duration(VIDEO_PATH)
    print(f"Длительность: {duration} сек")

    print("Извлечение аудиоэнергии...")
    audio = extract_audio_activity(VIDEO_PATH)

    print("Анализ визуальной изменчивости...")
    visual = extract_visual_diffs(VIDEO_PATH)

    print("Объединение активности...")
    activity = combine_activity(audio, visual)

    print("Построение графика активности...")
    plot_activity_graph(activity, PLOT_PATH)
    print(f"Сохранён: {PLOT_PATH}")

    print("Поиск пиков активности...")
    intervals = find_top_intervals_by_acts(
        activity,
        duration,
        clip_len=CLIP_LENGTH,
        top_k=TOP_K,
        acts=ACTS
    )

    for i, (start, end) in enumerate(intervals):
        print(f"{i+1}. {start}s → {end}s")

    print("Вырезка клипов...")
    clips = extract_clips_ffmpeg(VIDEO_PATH, intervals)

    print("Склейка recap...")
    concat_clips_ffmpeg(clips, OUTPUT_RECAP)

    print(f"Готово! Итоговый файл: {OUTPUT_RECAP}")

if __name__ == "__main__":
    main()

Чтение длительности видео...
Длительность: 3697 сек
Извлечение аудиоэнергии...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Анализ визуальной изменчивости...
Объединение активности...
Построение графика активности...
Сохранён: activity_plot.png
🎯 Поиск пиков активности...
1. 45s → 51s
2. 178s → 184s
3. 456s → 462s
4. 643s → 649s
5. 1175s → 1181s
6. 1189s → 1195s
7. 1252s → 1258s
8. 1336s → 1342s
9. 1342s → 1348s
10. 1348s → 1354s
11. 1363s → 1369s
12. 1383s → 1389s
13. 1452s → 1458s
14. 2456s → 2462s
15. 2470s → 2476s
16. 2479s → 2485s
17. 2487s → 2493s
18. 2498s → 2504s
19. 2710s → 2716s
20. 2724s → 2730s
21. 2734s → 2740s
22. 2791s → 2797s
23. 2822s → 2828s
24. 2838s → 2844s
25. 2911s → 2917s
26. 2930s → 2936s
27. 2965s → 2971s
28. 2985s → 2991s
29. 2991s → 2997s
30. 3254s → 3260s
31. 3367s → 3373s
32. 3375s → 3381s
33. 3400s → 3406s
34. 3410s → 3416s
35. 3434s → 3440s
36. 3464s → 3470s
37. 3517s → 3523s
38. 3602s → 3608s
39. 3608s → 3614s
Вырезка клипов...
Склейка recap...
Готово! Итоговый файл: recap_action.mp4
