# Audio to Text

In [None]:
# Install the notebook's Python dependencies into the environment of the running kernel.
# NOTE: ffmpeg-python only wraps the ffmpeg command-line tool; the ffmpeg binary itself
# must be installed separately and be available on PATH.
%pip install yt-dlp numpy opencv-python Pillow matplotlib moviepy transformers pydub torch openai-whisper ffmpeg-python tqdm

## (Optional) Download YouTube video

In [None]:
import yt_dlp

# URL of the YouTube video to fetch -- swap in your own link here.
video_url = 'https://www.youtube.com/watch?v=eUwR_CbezmQ'

# Settings handed to yt-dlp when the downloader is constructed below.
ydl_opts = dict(
    format='bestvideo+bestaudio/best',   # ask for the best available quality
    outtmpl='downloaded_video.%(ext)s',  # template for the saved file's name
    quiet=False,                         # keep progress output visible
)

# Perform the download; the downloader object only lives for this block.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])

## Import all libs

In [1]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import rcParams

from moviepy.config import check
from moviepy.audio.io.AudioFileClip import AudioFileClip
from pydub import AudioSegment
from pydub.playback import play
from transformers import pipeline
import re
import whisper
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Audio to text Transcriber (main logic)

In [3]:
from tqdm import tqdm
import whisper
import time

def convert_audio_to_text(audio_path, output_txt_path="transcription.txt", words_per_chunk=10, model_name="base"):
    """Transcribe audio with Whisper and save the text as fixed-size word chunks.

    The transcript is split on whitespace and written to ``output_txt_path``
    with ``words_per_chunk`` words per line (the last line may be shorter).
    Progress and timing information is printed along the way.

    Args:
        audio_path: Audio input passed unchanged to ``model.transcribe``.
        output_txt_path: Destination text file; written as UTF-8 and
            overwritten if it already exists.
        words_per_chunk: Number of words per output line. Must be >= 1.
        model_name: Name of the Whisper model to load, e.g. "tiny", "base",
            "small", "medium", or "large". Defaults to "base".

    Raises:
        ValueError: If ``words_per_chunk`` is less than 1.
    """
    # Validate up front: a zero step would only fail inside range() after the
    # (slow) model load and transcription, and a negative step would silently
    # produce an empty output file.
    if words_per_chunk < 1:
        raise ValueError(f"words_per_chunk must be >= 1, got {words_per_chunk!r}")

    print("🔄 Loading Whisper model...")
    start_load = time.time()
    model = whisper.load_model(model_name)
    print(f"✅ Model loaded in {time.time() - start_load:.2f} seconds.\n")

    print("🎧 Transcribing audio... (this might take a while)")
    start_transcribe = time.time()
    result = model.transcribe(audio_path)
    transcribe_time = time.time() - start_transcribe
    print(f"✅ Transcription completed in {transcribe_time:.2f} seconds.\n")

    full_text = result["text"]

    print("✂️ Splitting text into chunks...")
    words = full_text.split()
    chunks = [
        " ".join(words[i:i + words_per_chunk])
        for i in range(0, len(words), words_per_chunk)
    ]
    print(f"✅ Text split into {len(chunks)} chunks.\n")

    print(f"💾 Writing to file: {output_txt_path}")
    with open(output_txt_path, "w", encoding="utf-8") as f:
        for chunk in tqdm(chunks, desc="Writing chunks", unit="chunk"):
            # Chunks are built from split()/join(), so they carry no
            # surrounding whitespace and can be written as-is.
            f.write(chunk + "\n")

    print(f"\n✅ Transcription saved to {output_txt_path} with ~{words_per_chunk}-word chunks.")


### Extract audio from video

In [9]:
import ffmpeg
import os

def extract_audio_ffmpeg(video_path):
    """Extract the audio of a video file into a WAV file using ffmpeg.

    The output is written to ``<video_path>.wav`` with one audio channel
    (``ac=1``) at a 16000 Hz sample rate (``ar='16000'``); an existing output
    file is overwritten.

    Args:
        video_path: Path of the input video file.

    Returns:
        The path of the WAV file that was written.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        ffmpeg.Error: If the ffmpeg invocation fails; its stderr output is
            printed before the exception is re-raised.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    audio_output_path = f"{video_path}.wav"
    try:
        (
            ffmpeg
            .input(video_path)
            .output(audio_output_path, ac=1, ar='16000')
            # Capture stderr so that ffmpeg.Error carries the diagnostics;
            # without it `e.stderr` is None and cannot be decoded.
            .run(overwrite_output=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # Guard against a missing stderr so the original ffmpeg error is
        # never masked by an AttributeError raised inside this handler.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else "(no stderr captured)"
        print("FFmpeg error:", stderr_text)
        raise

    print(f"Audio extracted to: {audio_output_path}")
    return audio_output_path

In [None]:
# Video whose audio should be pulled out; the WAV path returned here feeds the transcription step.
source_video_path = 'file.webm'
extracted_audio_path = extract_audio_ffmpeg(source_video_path)

In [None]:
# Save the transcript under the audio file's name with ".txt" appended.
transcript_path = "{}.txt".format(extracted_audio_path)
convert_audio_to_text(extracted_audio_path, transcript_path)