<a href="https://colab.research.google.com/github/jc890/python/blob/master/Movinet_video_analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
YouTube Multimodal Video Analysis using MoViNet
------------------------------------------------
- Video action/context analysis using MoViNet
- Placeholder for Audio + NLP conversational event detection
"""

import os
import subprocess
import yt_dlp
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# =========================
# CONFIGURATION
# =========================
# Source video to analyse.
YOUTUBE_URL = "https://youtu.be/vHVR-7zURnw?si=-Q0iNtr-ucTVRWJH"

# Working-directory layout: the downloaded video and the exported clips
# both live underneath WORK_DIR.
WORK_DIR = "workspace"
VIDEO_PATH = WORK_DIR + "/input.mp4"
CLIPS_DIR = WORK_DIR + "/clips"

# Seconds of context kept on each side of an event timestamp.
BUFFER_SECONDS = 20
# Side length, in pixels, that frames are resized to.
FRAME_SIZE = 172
# IMPORTANT: keep clips short for MoViNet (upper bound on frames per clip).
MAX_FRAMES = 50

# Model handle given to hub.KerasLayer.
MOVINET_MODEL_URL = "/".join((
    "https://www.kaggle.com/models/google/movinet",
    "TensorFlow2",
    "a0-base-kinetics-600-classification",
    "3",
))

# Create the working directories up front; already-existing ones are fine.
os.makedirs(WORK_DIR, exist_ok=True)
os.makedirs(CLIPS_DIR, exist_ok=True)

# =========================
# 1. DOWNLOAD YOUTUBE VIDEO
# =========================
def download_youtube_video(url):
    """
    Download the video at `url` with yt-dlp.

    The downloader is asked for the "mp4" format, runs in quiet mode and
    writes its output to VIDEO_PATH.
    """
    options = dict(format="mp4", outtmpl=VIDEO_PATH, quiet=True)
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

# =========================
# 2. AUDIO + NLP PLACEHOLDER
# =========================
def detect_audio_events(video_path):
    """
    STUB FOR THE AUDIO + NLP STAGE

    `video_path` is accepted but not read yet; a single hard-coded event
    is returned so the rest of the pipeline can be exercised.

    Planned replacement:
    - extract the audio track with ffmpeg
    - transcribe it with Whisper ASR
    - classify the transcript with an NLP model

    Returns a list of event dicts, e.g.
    [
        {"event": "Agreement", "timestamp": 60.0}
    ]
    """
    placeholder_event = {"event": "Agreement", "timestamp": 60.0}
    return [placeholder_event]

# =========================
# 3. EXTRACT FRAMES
# =========================
def extract_frames(video_path, start_sec, end_sec):
    """
    Read resized RGB frames from a time window of a video.

    Frames are read consecutively beginning at `start_sec`. Reading stops
    when the window end is reached, when the stream runs out, or once
    MAX_FRAMES frames have been collected -- whichever happens first.

    Args:
        video_path: Path of the video file opened with OpenCV.
        start_sec: Start of the window, in seconds.
        end_sec: End of the window, in seconds.

    Returns:
        A float32 NumPy array with the collected frames stacked along the
        first axis, each resized to FRAME_SIZE x FRAME_SIZE and converted
        from BGR to RGB. The array is empty when the video cannot be
        opened, reports no usable frame rate, or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    # try/finally guarantees the capture handle is released even if a
    # resize or colour conversion raises part-way through the clip.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)

        # `fps > 0` is also False for NaN, so a missing/invalid frame rate
        # produces an empty result instead of an int() conversion error.
        if cap.isOpened() and fps > 0:
            start_frame = int(start_sec * fps)
            end_frame = int(end_sec * fps)

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            while len(frames) < MAX_FRAMES:
                ret, frame = cap.read()
                # Stop at end of stream or once the reported position has
                # moved past the end of the requested window.
                if not ret or cap.get(cv2.CAP_PROP_POS_FRAMES) > end_frame:
                    break

                frame = cv2.resize(frame, (FRAME_SIZE, FRAME_SIZE))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
    finally:
        cap.release()

    return np.array(frames, dtype=np.float32)

# =========================
# 4. LOAD MOVINET
# =========================
# Cache for the MoViNet layer so the model is only instantiated once per
# process instead of once per call.
_MOVINET_LAYER = None


def load_movinet():
    """
    Return the MoViNet classifier as a non-trainable hub.KerasLayer.

    The layer is created from MOVINET_MODEL_URL on the first call and the
    same instance is returned on every later call.

    IMPORTANT:
    Do NOT wrap MoViNet in tf.keras.Model.
    Call hub.KerasLayer directly with real tensors.
    """
    global _MOVINET_LAYER
    if _MOVINET_LAYER is None:
        _MOVINET_LAYER = hub.KerasLayer(MOVINET_MODEL_URL, trainable=False)
    return _MOVINET_LAYER

# =========================
# 5. RUN MOVINET INFERENCE
# =========================
def run_movinet(video_frames):
    """
    Classify a clip with MoViNet and return the index of the top class.

    video_frames shape:
    (T, H, W, 3)

    Returns None when the clip is empty, otherwise the class index as a
    plain Python int.
    """
    # Nothing to classify.
    if not len(video_frames):
        return None

    model = load_movinet()

    # Divide pixel values by 255 and prepend a batch axis -> (1, T, H, W, 3)
    scaled_frames = video_frames / 255.0
    batch = tf.expand_dims(scaled_frames, axis=0)

    raw_scores = model({"image": batch}, training=False)
    class_probs = tf.nn.softmax(raw_scores, axis=-1)
    top_class = tf.argmax(class_probs, axis=-1)

    # Single-element batch: unwrap the only prediction.
    return int(top_class.numpy()[0])

# =========================
# 6. CLIP FINAL VIDEO
# =========================
def clip_video(input_video, start, end, out_path):
    """
    Cut the span from `start` to `end` out of `input_video` with ffmpeg.

    Streams are copied rather than re-encoded (`-c copy`) and an existing
    `out_path` is overwritten (`-y`).

    Args:
        input_video: Path of the source video.
        start: Clip start position, passed to ffmpeg's `-ss`.
        end: Clip end position, passed to ffmpeg's `-to`.
        out_path: Path of the clip to write.

    Returns:
        True when ffmpeg exits with status 0, False otherwise. On failure
        the exit code and the last line of ffmpeg's error output are
        printed so the problem is not silently lost.

    Raises:
        OSError: Propagated from subprocess.run when the ffmpeg executable
            cannot be launched (for example, it is not installed).
    """
    cmd = [
        "ffmpeg", "-y",
        # Keep stderr limited to real errors so the captured text is short
        # and its last line is a meaningful diagnostic.
        "-hide_banner", "-loglevel", "error",
        "-ss", str(start),
        "-to", str(end),
        "-i", input_video,
        "-c", "copy",
        out_path
    ]
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",  # never fail on undecodable bytes in ffmpeg output
    )

    if result.returncode != 0:
        error_lines = result.stderr.strip().splitlines()
        reason = error_lines[-1] if error_lines else "no error output"
        print(
            f"ffmpeg exited with code {result.returncode} "
            f"while writing {out_path}: {reason}"
        )
        return False

    return True

# =========================
# 7. MAIN PIPELINE
# =========================
def main():
    """
    Run the whole pipeline for YOUTUBE_URL.

    Steps:
    1. Download the video to VIDEO_PATH.
    2. Get the list of events from detect_audio_events.
    3. For each event, take a window of BUFFER_SECONDS on either side of
       its timestamp (clamped at 0), extract frames, print the MoViNet
       class index, and export the window as a clip under CLIPS_DIR.

    A clip is only reported as saved when a non-empty output file exists
    after clipping; otherwise the failure is reported and counted in the
    final summary line.
    """
    print("Downloading YouTube video...")
    download_youtube_video(YOUTUBE_URL)

    print("Detecting audio events (placeholder)...")
    events = detect_audio_events(VIDEO_PATH)

    failed_clips = 0

    for idx, event in enumerate(events):
        t = event["timestamp"]
        # Clamp so a timestamp near the beginning never yields a negative start.
        start = max(0, t - BUFFER_SECONDS)
        end = t + BUFFER_SECONDS

        print(f"\nProcessing event '{event['event']}' at {t}s")

        frames = extract_frames(VIDEO_PATH, start, end)
        if len(frames) == 0:
            print("No frames extracted. Skipping.")
            continue

        action_id = run_movinet(frames)
        print(f"MoViNet action label ID: {action_id}")

        out_clip = f"{CLIPS_DIR}/{event['event']}_{idx}.mp4"

        # Remove a clip left over from an earlier run so that the existence
        # check below reflects the outcome of this run only.
        try:
            os.remove(out_clip)
        except FileNotFoundError:
            pass

        clip_video(VIDEO_PATH, start, end, out_clip)

        if os.path.isfile(out_clip) and os.path.getsize(out_clip) > 0:
            print(f"Saved clip: {out_clip}")
        else:
            failed_clips += 1
            print(f"Failed to save clip: {out_clip}")

    if failed_clips:
        print(f"\nPipeline completed with {failed_clips} failed clip(s).")
    else:
        print("\nPipeline completed successfully.")

# =========================
# ENTRY POINT
# =========================
if __name__ == "__main__":
    main()

Downloading YouTube video...




Detecting audio events (placeholder)...

Processing event 'Agreement' at 60.0s
MoViNet action label ID: 539
Saved clip: workspace/clips/Agreement_0.mp4

Pipeline completed successfully.


In [None]:
!pip install yt_dlp

Collecting yt_dlp
  Downloading yt_dlp-2025.12.8-py3-none-any.whl.metadata (180 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 180.3/180.3 kB 4.5 MB/s eta 0:00:00
Downloading yt_dlp-2025.12.8-py3-none-any.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 50.4 MB/s eta 0:00:00
Installing collected packages: yt_dlp
Successfully installed yt_dlp-2025.12.8
