# Install and import libraries

In [1]:
! pip install git+https://github.com/openai/whisper.git
! pip install yt-dlp

import sys
import warnings
import whisper
from pathlib import Path
import yt_dlp
import subprocess
import torch
import shutil
import numpy as np

# Prefer the first CUDA GPU, but fall back to the CPU so the reported device
# is truthful on machines without a usable GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using device:', device, file=sys.stderr)

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-4ge6hd6o
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-4ge6hd6o
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

Using device: cuda:0


# Model selection

In [2]:
# Name of the Whisper checkpoint to load.
model = "medium"

# Load the checkpoint; the recorded output of this cell shows the weights
# being downloaded when it ran.
whisper_model = whisper.load_model(name=model)

100%|█████████████████████████████████████| 1.42G/1.42G [00:58<00:00, 25.9MiB/s]


In [4]:
# Read the video URLs from the text file: one URL per line, with surrounding
# whitespace removed and blank lines dropped.

with open("../selected_videos/video_urls.txt", "r") as f:
    urls = list(filter(None, map(str.strip, f)))

In [5]:
# download_dir receives the downloaded audio, output_dir the Whisper transcripts.
download_dir, output_dir = (
    Path("../data/audio"),
    Path("../data/whisper_transcriptions"),
)

# Create both folders, including missing parents; existing folders are fine.
for folder in (download_dir, output_dir):
    folder.mkdir(parents=True, exist_ok=True)

In [6]:
# Download settings handed to yt-dlp:
#
# format: ordered preference list - an m4a stream first, then the best
#         audio-only stream, then the best format overall.
# outtmpl: every download is written to the same fixed name inside
#          download_dir (temp_audio.<ext>), not to a per-video name.
# postprocessors: after downloading, FFmpeg extracts the audio and converts
#                 it to WAV.

ydl_opts = dict(
    format='m4a/bestaudio/best',
    outtmpl=str(download_dir / 'temp_audio.%(ext)s'),
    postprocessors=[
        dict(key='FFmpegExtractAudio', preferredcodec='wav'),
    ],
)

In [7]:
# Download the audio of every URL, convert it to WAV, and remember where each
# file was saved so the transcription step can find it later.
#
# Result: video_path_local_list, a list of (index, Path) tuples with index
# starting at 1 and Path pointing to download_dir / "audio_<index>.wav".

video_path_local_list = []

# yt-dlp writes every download to this fixed name (see 'outtmpl' in ydl_opts);
# the file is moved to its numbered name before the next download starts.
temp_path = download_dir / "temp_audio.wav"

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    for i, url in enumerate(urls, start=1):
        print(f"⬇️ Downloading video {i}: {url}")
        # A single call downloads the audio AND returns the metadata.
        # Calling download() followed by extract_info(download=False) ran the
        # whole extraction a second time for every video.
        video_info = ydl.extract_info(url, download=True)

        # Rename temp_audio.wav → audio_1.wav, audio_2.wav, ...
        # replace() overwrites a file left over from an earlier run on every
        # platform; rename() raises on Windows when the target already exists.
        audio_path = download_dir / f"audio_{i}.wav"
        temp_path.replace(audio_path)
        video_path_local_list.append((i, audio_path))

        print(f"✅ Saved audio as: {audio_path}")

⬇️ Downloading video 1: https://youtu.be/JWC-cVQmEmY?si=lEXWfF0ZYRJXx0gx
[youtube] Extracting URL: https://youtu.be/JWC-cVQmEmY?si=lEXWfF0ZYRJXx0gx
[youtube] JWC-cVQmEmY: Downloading webpage
[youtube] JWC-cVQmEmY: Downloading tv client config
[youtube] JWC-cVQmEmY: Downloading player a10d7fcc-main
[youtube] JWC-cVQmEmY: Downloading tv player API JSON
[youtube] JWC-cVQmEmY: Downloading ios player API JSON
[youtube] JWC-cVQmEmY: Downloading m3u8 information
[info] JWC-cVQmEmY: Downloading 1 format(s): 140
[download] Destination: ../data/audio/temp_audio.m4a
[download] 100% of    3.69MiB in 00:00:00 at 14.17MiB/s  
[FixupM4a] Correcting container of "../data/audio/temp_audio.m4a"
[ExtractAudio] Destination: ../data/audio/temp_audio.wav
Deleting original file ../data/audio/temp_audio.m4a (pass -k to keep)
[youtube] Extracting URL: https://youtu.be/JWC-cVQmEmY?si=lEXWfF0ZYRJXx0gx
[youtube] JWC-cVQmEmY: Downloading webpage
[youtube] JWC-cVQmEmY: Downloading tv client config
[youtube] JWC-cVQ

In [8]:
# High-level transcription settings
task = "transcribe"              # task forwarded to whisper.transcribe()
language = "English"             # language forwarded to whisper.transcribe()
verbose = "Live transcription"   # must be one of the keys of verbose_lut
output_format = "txt"            # intended format of the saved transcript

In [9]:
# Decoding settings forwarded to whisper.transcribe().
# Descriptions marked NOTE(review) rely on Whisper's own documentation rather
# than on anything visible in this notebook - confirm before depending on them.

# First (lowest) sampling temperature of the fallback schedule built further down.
temperature = 0.15
# Step between successive temperatures of that schedule (which stops at 1.0).
temperature_increment_on_fallback = 0.2
# NOTE(review): per Whisper's docs, number of candidates sampled when temperature > 0.
best_of = 5
# NOTE(review): per Whisper's docs, beam search is only used at temperature 0.
# The schedule here starts at 0.15, so beam_size and patience may never take
# effect - confirm, and start at 0.0 if beam search is wanted.
beam_size = 8
# Beam-search patience (see the note on beam_size).
patience = 1.0
# A negative value is replaced by None when the args dict is built, so this
# particular value is never passed on to Whisper.
length_penalty = -0.05
# NOTE(review): per Whisper's docs, "-1" selects the default suppression set
# (most special characters except common punctuation); it does not mean
# "suppress nothing" - confirm.
suppress_tokens = "-1"
# An empty string is turned into None when the args dict is built (no prompt).
initial_prompt = ""
# Feed the previously decoded text into the next window as context.
condition_on_previous_text = True
# Request half-precision inference.
fp16 = True
# NOTE(review): per Whisper's docs, a decoding whose gzip compression ratio is
# ABOVE this value is treated as failed (triggering a temperature fallback).
compression_ratio_threshold = 2.4
# NOTE(review): per Whisper's docs, a decoding whose average log-probability is
# BELOW this value is treated as failed.
logprob_threshold = -1.0
# NOTE(review): per Whisper's docs, a segment is treated as silence when its
# no-speech probability is ABOVE this value and the logprob check also failed.
no_speech_threshold = 0.6

In [10]:
# Maps the human-readable verbosity choice to the value passed as `verbose`.
verbose_lut = dict([
    ('Live transcription', True),
    ('Progress bar', False),
    ('None', None),
])

In [11]:
# Collect every transcription setting in one dictionary.
# Two values are normalised on the way in: a negative length_penalty and an
# empty initial_prompt are both stored as None.

args = {
    "language": language,
    "verbose": verbose_lut[verbose],
    "task": task,
    "temperature": temperature,
    "temperature_increment_on_fallback": temperature_increment_on_fallback,
    "best_of": best_of,
    "beam_size": beam_size,
    "patience": patience,
    "length_penalty": length_penalty if length_penalty >= 0.0 else None,
    "suppress_tokens": suppress_tokens,
    "initial_prompt": initial_prompt or None,
    "condition_on_previous_text": condition_on_previous_text,
    "fp16": fp16,
    "compression_ratio_threshold": compression_ratio_threshold,
    "logprob_threshold": logprob_threshold,
    "no_speech_threshold": no_speech_threshold,
}

In [12]:
# Build the temperature fallback schedule: start at the configured temperature
# and step up by the configured increment until 1.0 (inclusive, hence the
# small epsilon).
#
# Both keys are removed from `args` because the schedule is handed to
# transcribe() separately as `temperature=`. The membership check keeps this
# cell safe to re-run: once the keys are gone, `temperature` already holds the
# schedule, whereas an unconditional pop raised KeyError on a second run.
if "temperature" in args:
    temperature_start = args.pop("temperature")
    temperature_step = args.pop("temperature_increment_on_fallback")

    if isinstance(temperature_start, tuple):
        # The settings dictionary was rebuilt after this cell had already run,
        # so it picked up the finished schedule; keep it as it is.
        temperature = temperature_start
    elif temperature_step:
        temperature = tuple(np.arange(
            temperature_start,
            1.0 + 1e-6,
            temperature_step
        ))
    else:
        # No increment configured: np.arange cannot step by zero, so use the
        # single starting temperature without any fallback.
        temperature = (temperature_start,)

In [13]:
# Transcribe every downloaded audio file with Whisper and save each transcript
# in output_dir under the base name whisper_<index>.

# The writer does not depend on the file being processed, so it is built once.
# It follows the configured `output_format` instead of a hard-coded 'txt', so
# changing that setting changes the files that are actually written.
transcript_writer = whisper.utils.get_writer(
    output_format=output_format,
    output_dir=str(output_dir),
)

# Formatting options handed to the writer for every transcript.
writer_options = dict(
    highlight_words=False,
    max_line_count=None,
    max_line_width=None,
)

for i, audio_path in video_path_local_list:
    print(f"📝 Transcribing: {audio_path.name}")

    video_transcription = whisper.transcribe(
        whisper_model,
        str(audio_path),
        temperature=temperature,
        **args,
    )

    # Base name of the transcript; the writer adds the extension itself.
    transcript_name = f"whisper_{i}"
    transcript_writer(
        video_transcription,
        transcript_name,
        options=writer_options,
    )

    transcript_path = output_dir / f"{transcript_name}.{output_format}"
    print(f"✅ Transcription saved as: {transcript_path}")

📝 Transcribing: audio_1.wav
[00:00.000 --> 00:11.840]  Can you tell us your name?
[00:11.840 --> 00:13.480]  Mike Caputo.
[00:13.480 --> 00:14.640]  And Mike when was your stroke?
[00:14.640 --> 00:22.560]  I was seven years ago.
[00:22.560 --> 00:25.360]  And what did you used to do?
[00:25.360 --> 00:41.520]  Well, worked on a desk, seven sales and worldwide and very good.
[00:41.520 --> 00:44.200]  And who are you looking at over there?
[00:44.200 --> 00:46.600]  That's my wife.
[00:46.600 --> 00:49.480]  And why is she helping you to talk?
[00:49.480 --> 00:54.520]  She's a speech.
[00:54.520 --> 00:56.240]  So you have trouble with your speech?
[00:56.240 --> 00:57.240]  Yeah, yeah.
[00:57.240 --> 00:59.240]  And what's that called?
[00:59.240 --> 01:00.240]  Phasia.
[01:00.240 --> 01:01.600]  All right.
[01:01.600 --> 01:04.240]  And so why don't you work now?
[01:04.240 --> 01:09.680]  I, I, well, I do.
[01:09.680 --> 01:11.240]  And what do you do now?
[01:11.240 --> 01:13.320]