In [None]:
# --------------------------------------------------------
# Task 3: Whisper Transcription Bot for NLP Talks (~3 min)
# --------------------------------------------------------
# Deliverables:
#   - yt_audio/   (downloaded audio WAVs)
#   - yt_videos/  (downloaded mp4 videos)
#   - yt_frames/  (sampled video frames for OCR)
#   - talks_transcripts.jsonl (final transcripts with OCR)
# --------------------------------------------------------

import os
import json
import yt_dlp
import whisper
import cv2
import pytesseract
from datetime import timedelta

# ------------ Config ------------
# Source clips to download, transcribe and OCR.
VIDEO_LIST = [
    "https://www.youtube.com/shorts/EIjaeSubj7s",
    "https://www.youtube.com/shorts/tNaRvmRrPKY",
    "https://www.youtube.com/shorts/eLXOuk4WAvc",
    "https://www.youtube.com/shorts/cIjoYkOzY9w",
    "https://www.youtube.com/shorts/Y_G1bCrjm0w",
    "https://www.youtube.com/shorts/BQRloEZ5s0A",
    "https://www.youtube.com/shorts/-UzkxFp9gWc",
    "https://www.youtube.com/shorts/IdgebF3cH8I",
    "https://www.youtube.com/shorts/brBz8Phzc4Y",
    "https://www.youtube.com/shorts/wxSDV29kJsw",
]
AUDIO_DIR = "yt_audio"    # extracted WAV files
VIDEO_DIR = "yt_videos"   # downloaded video files
FRAME_DIR = "yt_frames"   # frames sampled for OCR
OUTPUT_FILE = "talks_transcripts.jsonl"
FFMPEG_PATH = r"C:\Program Files\ffmpeg\bin"  # <-- adjust if needed

# Ensure ffmpeg in PATH for Whisper.
# os.pathsep is used instead of a hard-coded ";" so the entry is joined with
# the separator of the platform the script actually runs on, and the entry is
# only prepended once so re-running the cell does not keep growing PATH.
_current_path = os.environ.get("PATH", "")
if FFMPEG_PATH not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        FFMPEG_PATH + os.pathsep + _current_path if _current_path else FFMPEG_PATH
    )

# Create directories
for _directory in (AUDIO_DIR, VIDEO_DIR, FRAME_DIR):
    os.makedirs(_directory, exist_ok=True)

# ------------ Step 1: Download Audio & Video ------------
def download_audio_video(url):
    """Download one clip as a video file and as a separate WAV audio file.

    Two yt-dlp runs are performed: the first fetches the video (with audio)
    into VIDEO_DIR, the second fetches the best audio stream into AUDIO_DIR
    and converts it to WAV through the FFmpegExtractAudio postprocessor.

    Args:
        url: URL of the clip to download.

    Returns:
        A ``(video_id, video_path, audio_path)`` tuple. ``video_path`` is the
        file yt-dlp reports having written when that information is present,
        otherwise the expected ``<VIDEO_DIR>/<id>.mp4`` location.
    """
    video_opts = {
        # "bestvideo+bestaudio" is required to get picture data at all; the
        # previous "bestaudio+bestaudio" selector only ever fetched an
        # audio-only stream, leaving nothing for the OCR step to read.
        "format": "bestvideo+bestaudio/best",
        "outtmpl": f"{VIDEO_DIR}/%(id)s.%(ext)s",
        "merge_output_format": "mp4",
        "ffmpeg_location": FFMPEG_PATH,
    }
    with yt_dlp.YoutubeDL(video_opts) as ydl:
        info = ydl.extract_info(url, download=True)

    video_id = info.get("id")
    video_path = os.path.join(VIDEO_DIR, f"{video_id}.mp4")
    audio_path = os.path.join(AUDIO_DIR, f"{video_id}.wav")

    # Prefer the path yt-dlp actually wrote: when the single-file "best"
    # fallback is selected no merge happens and the extension may not be mp4.
    for download in info.get("requested_downloads") or []:
        written_path = download.get("filepath")
        if written_path:
            video_path = written_path
            break

    # Extract audio separately
    audio_opts = {
        "format": "bestaudio/best",
        "outtmpl": f"{AUDIO_DIR}/%(id)s.%(ext)s",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
        "ffmpeg_location": FFMPEG_PATH,
    }
    with yt_dlp.YoutubeDL(audio_opts) as ydl_audio:
        ydl_audio.download([url])

    return video_id, video_path, audio_path

# ------------ Step 2: Whisper Transcription ------------
# Loaded Whisper models keyed by model name, so each is loaded once per process.
_WHISPER_MODELS = {}


def _get_whisper_model(name="small"):
    """Return the Whisper model called *name*, loading it on first use only."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.

    Returns:
        A list of dicts, one per Whisper segment, each with:
          - ``"timestamp"``: segment start truncated to whole seconds and
            formatted by ``str(timedelta(...))`` (e.g. ``"0:00:07"``).
          - ``"speaker_text"``: the segment text with surrounding whitespace
            stripped.
    """
    # Reuse the cached model instead of reloading it for every file.
    model = _get_whisper_model("small")  # use "base" if you want faster
    result = model.transcribe(audio_path)
    return [
        {
            "timestamp": str(timedelta(seconds=int(seg["start"]))),
            "speaker_text": seg["text"].strip(),
        }
        for seg in result["segments"]
    ]

# ------------ Step 3: OCR on Video Frames ------------
def extract_frames_for_ocr(video_path, video_id, interval=10):
    """Sample frames from a video at a fixed time interval and OCR them.

    Every sampled frame is written to FRAME_DIR as ``<video_id>_<n>.jpg`` and
    passed to Tesseract (English). Frames whose OCR result is empty are still
    saved to disk but are left out of the returned mapping.

    Args:
        video_path: Path of the video file opened with ``cv2.VideoCapture``.
        video_id: Identifier used as the prefix of the saved frame files.
        interval: Seconds between sampled frames; must be positive.

    Returns:
        Dict mapping ``str(timedelta(seconds=...))`` timestamps (whole
        seconds) to the non-empty OCR text found at that point.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds")

    frames_ocr = {}
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        has_fps = bool(fps) and fps > 0
        frame_count, frame_id = 0, 0
        next_sample_sec = 0.0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Derive the frame's time from its index when the container
            # reports a usable frame rate; otherwise fall back to the decoder
            # position. This avoids the modulo-by-zero the frame-count check
            # hit when FPS was reported as 0.
            if has_fps:
                position_sec = frame_count / fps
            else:
                position_sec = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
            frame_count += 1

            # Time-based sampling: take the first frame at or after each
            # multiple of `interval`, which also works for non-integer frame
            # rates (e.g. 29.97) where an exact frame/second match is rare.
            if position_sec + 1e-6 < next_sample_sec:
                continue
            next_sample_sec += interval

            current_time_sec = int(position_sec)
            frame_file = os.path.join(FRAME_DIR, f"{video_id}_{frame_id}.jpg")
            cv2.imwrite(frame_file, frame)
            text = pytesseract.image_to_string(frame, lang="eng").strip()
            if text:
                frames_ocr[str(timedelta(seconds=current_time_sec))] = text
            frame_id += 1
    finally:
        # Release the capture even if writing the frame or OCR raises.
        cap.release()
    return frames_ocr

# ------------ Step 4: Merge & Save JSONL ------------
def _timestamp_to_seconds(stamp):
    """Convert a ``str(timedelta)`` value such as "0:01:05" to whole seconds.

    Also accepts the "N day(s), H:MM:SS" form that timedelta produces for
    durations of 24 hours or more.
    """
    days = 0
    if ", " in stamp:
        day_part, stamp = stamp.split(", ", 1)
        days = int(day_part.split()[0])
    hours, minutes, seconds = (int(part) for part in stamp.split(":"))
    return ((days * 24 + hours) * 60 + minutes) * 60 + seconds


# For every video: download, transcribe, OCR, then write one JSON line per
# transcript segment, attaching the most recent OCR text at or before it.
with open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
    for url in VIDEO_LIST:
        print(f"Processing: {url}")
        video_id, video_path, audio_path = download_audio_video(url)

        # Transcribe speech
        transcript_segments = transcribe_audio(audio_path)

        # OCR text from slides
        ocr_frames = extract_frames_for_ocr(video_path, video_id, interval=10)

        # Compare timestamps numerically: as strings, "10:00:00" sorts before
        # "9:59:59", so lexicographic comparison picks the wrong frame once
        # the hour field grows to two digits.
        ocr_by_second = sorted(
            (_timestamp_to_seconds(stamp), text)
            for stamp, text in ocr_frames.items()
        )

        for seg in transcript_segments:
            seg_second = _timestamp_to_seconds(seg["timestamp"])

            # Latest OCR sample that is not after the segment start, if any.
            ocr_text = ""
            for frame_second, frame_text in ocr_by_second:
                if frame_second > seg_second:
                    break
                ocr_text = frame_text

            record = {
                "video_id": video_id,
                "timestamp": seg["timestamp"],
                "speaker_text": seg["speaker_text"],
                "ocr_text": ocr_text,
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ Completed. Transcripts saved in {OUTPUT_FILE}")


Processing: https://www.youtube.com/shorts/EIjaeSubj7s
[youtube] Extracting URL: https://www.youtube.com/shorts/EIjaeSubj7s
[youtube] EIjaeSubj7s: Downloading webpage
[youtube] EIjaeSubj7s: Downloading tv client config
[youtube] EIjaeSubj7s: Downloading tv player API JSON
[youtube] EIjaeSubj7s: Downloading ios player API JSON
[youtube] EIjaeSubj7s: Downloading m3u8 information
[info] EIjaeSubj7s: Downloading 1 format(s): 251
[download] Destination: yt_videos\EIjaeSubj7s.webm
[download] 100% of  671.89KiB in 00:00:00 at 2.90MiB/s   
[youtube] Extracting URL: https://www.youtube.com/shorts/EIjaeSubj7s
[youtube] EIjaeSubj7s: Downloading webpage
[youtube] EIjaeSubj7s: Downloading tv client config
[youtube] EIjaeSubj7s: Downloading tv player API JSON
[youtube] EIjaeSubj7s: Downloading ios player API JSON
[youtube] EIjaeSubj7s: Downloading m3u8 information
[info] EIjaeSubj7s: Downloading 1 format(s): 251
[download] Destination: yt_audio\EIjaeSubj7s.webm
[download] 100% of  671.89KiB in 00:00



Processing: https://www.youtube.com/shorts/tNaRvmRrPKY
[youtube] Extracting URL: https://www.youtube.com/shorts/tNaRvmRrPKY
[youtube] tNaRvmRrPKY: Downloading webpage
[youtube] tNaRvmRrPKY: Downloading tv client config
[youtube] tNaRvmRrPKY: Downloading tv player API JSON
[youtube] tNaRvmRrPKY: Downloading ios player API JSON
[youtube] tNaRvmRrPKY: Downloading m3u8 information
[info] tNaRvmRrPKY: Downloading 1 format(s): 251
[download] Destination: yt_videos\tNaRvmRrPKY.webm
[download] 100% of  807.40KiB in 00:00:00 at 3.40MiB/s   
[youtube] Extracting URL: https://www.youtube.com/shorts/tNaRvmRrPKY
[youtube] tNaRvmRrPKY: Downloading webpage
[youtube] tNaRvmRrPKY: Downloading tv client config
[youtube] tNaRvmRrPKY: Downloading tv player API JSON
[youtube] tNaRvmRrPKY: Downloading ios player API JSON
[youtube] tNaRvmRrPKY: Downloading m3u8 information
[info] tNaRvmRrPKY: Downloading 1 format(s): 251
[download] Destination: yt_audio\tNaRvmRrPKY.webm
[download] 100% of  807.40KiB in 00:00



Processing: https://www.youtube.com/shorts/eLXOuk4WAvc
[youtube] Extracting URL: https://www.youtube.com/shorts/eLXOuk4WAvc
[youtube] eLXOuk4WAvc: Downloading webpage
[youtube] eLXOuk4WAvc: Downloading tv client config
[youtube] eLXOuk4WAvc: Downloading tv player API JSON
[youtube] eLXOuk4WAvc: Downloading ios player API JSON
[youtube] eLXOuk4WAvc: Downloading m3u8 information
[info] eLXOuk4WAvc: Downloading 1 format(s): 251
[download] yt_videos\eLXOuk4WAvc.webm has already been downloaded
[download] 100% of  811.55KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/eLXOuk4WAvc
[youtube] eLXOuk4WAvc: Downloading webpage
[youtube] eLXOuk4WAvc: Downloading tv client config
[youtube] eLXOuk4WAvc: Downloading tv player API JSON
[youtube] eLXOuk4WAvc: Downloading ios player API JSON
[youtube] eLXOuk4WAvc: Downloading m3u8 information
[info] eLXOuk4WAvc: Downloading 1 format(s): 251
[download] Destination: yt_audio\eLXOuk4WAvc.webm
[download] 100% of  811.55KiB in 00:00:00 at 3.55Mi



Processing: https://www.youtube.com/shorts/cIjoYkOzY9w
[youtube] Extracting URL: https://www.youtube.com/shorts/cIjoYkOzY9w
[youtube] cIjoYkOzY9w: Downloading webpage
[youtube] cIjoYkOzY9w: Downloading tv client config
[youtube] cIjoYkOzY9w: Downloading tv player API JSON
[youtube] cIjoYkOzY9w: Downloading ios player API JSON
[youtube] cIjoYkOzY9w: Downloading m3u8 information
[info] cIjoYkOzY9w: Downloading 1 format(s): 251
[download] yt_videos\cIjoYkOzY9w.webm has already been downloaded
[download] 100% of  699.87KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/cIjoYkOzY9w
[youtube] cIjoYkOzY9w: Downloading webpage
[youtube] cIjoYkOzY9w: Downloading tv client config
[youtube] cIjoYkOzY9w: Downloading tv player API JSON
[youtube] cIjoYkOzY9w: Downloading ios player API JSON




[youtube] cIjoYkOzY9w: Downloading m3u8 information
[info] Testing format 234
[info] cIjoYkOzY9w: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 7
[download] Destination: yt_audio\cIjoYkOzY9w.mp4
[download] 100% of  598.28KiB in 00:00:01 at 413.20KiB/s               
[ExtractAudio] Destination: yt_audio\cIjoYkOzY9w.wav
Deleting original file yt_audio\cIjoYkOzY9w.mp4 (pass -k to keep)




Processing: https://www.youtube.com/shorts/Y_G1bCrjm0w
[youtube] Extracting URL: https://www.youtube.com/shorts/Y_G1bCrjm0w
[youtube] Y_G1bCrjm0w: Downloading webpage
[youtube] Y_G1bCrjm0w: Downloading tv client config
[youtube] Y_G1bCrjm0w: Downloading tv player API JSON
[youtube] Y_G1bCrjm0w: Downloading ios player API JSON
[youtube] Y_G1bCrjm0w: Downloading m3u8 information
[info] Y_G1bCrjm0w: Downloading 1 format(s): 251
[download] yt_videos\Y_G1bCrjm0w.webm has already been downloaded
[download] 100% of  380.50KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/Y_G1bCrjm0w
[youtube] Y_G1bCrjm0w: Downloading webpage
[youtube] Y_G1bCrjm0w: Downloading tv client config
[youtube] Y_G1bCrjm0w: Downloading tv player API JSON
[youtube] Y_G1bCrjm0w: Downloading ios player API JSON
[youtube] Y_G1bCrjm0w: Downloading m3u8 information
[info] Y_G1bCrjm0w: Downloading 1 format(s): 251
[download] Destination: yt_audio\Y_G1bCrjm0w.webm
[download] 100% of  380.50KiB in 00:00:00 at 1.58Mi



Processing: https://www.youtube.com/shorts/BQRloEZ5s0A
[youtube] Extracting URL: https://www.youtube.com/shorts/BQRloEZ5s0A
[youtube] BQRloEZ5s0A: Downloading webpage
[youtube] BQRloEZ5s0A: Downloading tv client config
[youtube] BQRloEZ5s0A: Downloading tv player API JSON
[youtube] BQRloEZ5s0A: Downloading ios player API JSON
[youtube] BQRloEZ5s0A: Downloading m3u8 information
[info] BQRloEZ5s0A: Downloading 1 format(s): 251
[download] yt_videos\BQRloEZ5s0A.webm has already been downloaded
[download] 100% of  740.38KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/BQRloEZ5s0A
[youtube] BQRloEZ5s0A: Downloading webpage
[youtube] BQRloEZ5s0A: Downloading tv client config
[youtube] BQRloEZ5s0A: Downloading tv player API JSON
[youtube] BQRloEZ5s0A: Downloading ios player API JSON
[youtube] BQRloEZ5s0A: Downloading m3u8 information
[info] BQRloEZ5s0A: Downloading 1 format(s): 251
[download] Destination: yt_audio\BQRloEZ5s0A.webm
[download] 100% of  740.38KiB in 00:00:00 at 3.28Mi



Processing: https://www.youtube.com/shorts/-UzkxFp9gWc
[youtube] Extracting URL: https://www.youtube.com/shorts/-UzkxFp9gWc
[youtube] -UzkxFp9gWc: Downloading webpage
[youtube] -UzkxFp9gWc: Downloading tv client config
[youtube] -UzkxFp9gWc: Downloading tv player API JSON
[youtube] -UzkxFp9gWc: Downloading ios player API JSON
[youtube] -UzkxFp9gWc: Downloading m3u8 information
[info] -UzkxFp9gWc: Downloading 1 format(s): 251
[download] yt_videos\-UzkxFp9gWc.webm has already been downloaded
[download] 100% of  793.97KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/-UzkxFp9gWc
[youtube] -UzkxFp9gWc: Downloading webpage
[youtube] -UzkxFp9gWc: Downloading tv client config
[youtube] -UzkxFp9gWc: Downloading tv player API JSON
[youtube] -UzkxFp9gWc: Downloading ios player API JSON
[youtube] -UzkxFp9gWc: Downloading m3u8 information
[info] -UzkxFp9gWc: Downloading 1 format(s): 251
[download] Destination: yt_audio\-UzkxFp9gWc.webm
[download] 100% of  793.97KiB in 00:00:00 at 3.54Mi



Processing: https://www.youtube.com/shorts/IdgebF3cH8I
[youtube] Extracting URL: https://www.youtube.com/shorts/IdgebF3cH8I
[youtube] IdgebF3cH8I: Downloading webpage
[youtube] IdgebF3cH8I: Downloading tv client config
[youtube] IdgebF3cH8I: Downloading tv player API JSON
[youtube] IdgebF3cH8I: Downloading ios player API JSON
[youtube] IdgebF3cH8I: Downloading m3u8 information
[info] IdgebF3cH8I: Downloading 1 format(s): 251
[download] yt_videos\IdgebF3cH8I.webm has already been downloaded
[download] 100% of  326.81KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/IdgebF3cH8I
[youtube] IdgebF3cH8I: Downloading webpage
[youtube] IdgebF3cH8I: Downloading tv client config
[youtube] IdgebF3cH8I: Downloading tv player API JSON
[youtube] IdgebF3cH8I: Downloading ios player API JSON
[youtube] IdgebF3cH8I: Downloading m3u8 information
[info] IdgebF3cH8I: Downloading 1 format(s): 251
[download] Destination: yt_audio\IdgebF3cH8I.webm
[download] 100% of  326.81KiB in 00:00:00 at 1.90Mi



Processing: https://www.youtube.com/shorts/brBz8Phzc4Y
[youtube] Extracting URL: https://www.youtube.com/shorts/brBz8Phzc4Y
[youtube] brBz8Phzc4Y: Downloading webpage
[youtube] brBz8Phzc4Y: Downloading tv client config
[youtube] brBz8Phzc4Y: Downloading tv player API JSON
[youtube] brBz8Phzc4Y: Downloading ios player API JSON
[youtube] brBz8Phzc4Y: Downloading m3u8 information
[info] brBz8Phzc4Y: Downloading 1 format(s): 251-8
[download] yt_videos\brBz8Phzc4Y.webm has already been downloaded
[download] 100% of  797.46KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/brBz8Phzc4Y
[youtube] brBz8Phzc4Y: Downloading webpage
[youtube] brBz8Phzc4Y: Downloading tv client config
[youtube] brBz8Phzc4Y: Downloading tv player API JSON
[youtube] brBz8Phzc4Y: Downloading ios player API JSON
[youtube] brBz8Phzc4Y: Downloading m3u8 information
[info] brBz8Phzc4Y: Downloading 1 format(s): 251-8
[download] Destination: yt_audio\brBz8Phzc4Y.webm
[download] 100% of  797.46KiB in 00:00:00 at 3.



Processing: https://www.youtube.com/shorts/wxSDV29kJsw
[youtube] Extracting URL: https://www.youtube.com/shorts/wxSDV29kJsw
[youtube] wxSDV29kJsw: Downloading webpage
[youtube] wxSDV29kJsw: Downloading tv client config
[youtube] wxSDV29kJsw: Downloading tv player API JSON
[youtube] wxSDV29kJsw: Downloading ios player API JSON
[youtube] wxSDV29kJsw: Downloading m3u8 information
[info] wxSDV29kJsw: Downloading 1 format(s): 251
[download] yt_videos\wxSDV29kJsw.webm has already been downloaded
[download] 100% of  280.90KiB
[youtube] Extracting URL: https://www.youtube.com/shorts/wxSDV29kJsw
[youtube] wxSDV29kJsw: Downloading webpage
[youtube] wxSDV29kJsw: Downloading tv client config
[youtube] wxSDV29kJsw: Downloading tv player API JSON
[youtube] wxSDV29kJsw: Downloading ios player API JSON
[youtube] wxSDV29kJsw: Downloading m3u8 information
[info] wxSDV29kJsw: Downloading 1 format(s): 251
[download] Destination: yt_audio\wxSDV29kJsw.webm
[download] 100% of  280.90KiB in 00:00:00 at 1.86Mi

