# YouGPTube 🦾

## TL;DR 👇

* Summarize any YouTube video using whisper and chatGPT

## How it works 🤔

![yougptube](https://user-images.githubusercontent.com/18450628/229377710-95fb8645-3d71-47d0-b3ba-0fd05941b083.png)

Here are the main steps:

1) Extract the audio using youtube-dl
2) Process the audio into smaller chunks
3) Each chunk is transcribed using whisper, OpenAI's powerful speech2text model
4) Each transcription is summarized using ChatGPT

## Imports and dependencies️ ⚙️

In [39]:
import os
import shutil

import librosa
import soundfile as sf
import youtube_dl
from youtube_dl.utils import DownloadError
import google.generativeai as genai
from dotenv import load_dotenv

# Configure Gemini (Google Generative AI)
GOOGLE_API_KEY = "AIzaSyBasyDzmOWkXp7PZ8N-PedYZoqFwmOW5tY"
genai.configure(api_key=GOOGLE_API_KEY)

# Load environment variables
load_dotenv()

# Ensure the Google API key is set
assert GOOGLE_API_KEY is not None, "Please set your Google API key in the environment variable GOOGLE_API_KEY"


## Utility functions 🔋

In [40]:
def find_audio_files(path, extension=".mp3"):
    """Recursively find all files with extension in path."""
    audio_files = []
    for root, dirs, files in os.walk(path):
        for f in files:
            if f.endswith(extension):
                audio_files.append(os.path.join(root, f))

    return audio_files

In [41]:
import yt_dlp

def youtube_to_mp3(youtube_url: str, output_dir: str) -> str:
    """Download the audio from a YouTube video, save it to output_dir as an .mp3 file.

    Returns the filename of the saved audio file.
    """
    # Configuration for youtube-dl
    ydl_config = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "outtmpl": os.path.join(output_dir, "%(title)s.%(ext)s"),
        "verbose": True,
    }

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Downloading video from {youtube_url}")

    try:
        # Use yt-dlp instead of youtube-dl
        with yt_dlp.YoutubeDL(ydl_config) as ydl:
            ydl.download([youtube_url])
    except DownloadError as e:
        print(f"Initial download failed due to: {e}. Retrying...")
        # Use yt-dlp instead of youtube-dl
        with yt_dlp.YoutubeDL(ydl_config) as ydl:
            ydl.download([youtube_url])

    audio_files = find_audio_files(output_dir)
    if not audio_files:
        raise FileNotFoundError("No audio file was found in the output directory.")

    return audio_files[0]

## Download youtube audio 🔈

## Chunk the audio 🍪

Chunking is necessary in the case where we have very long audio files, since both whisper and ChatGPT have limits of how much audio/text you can process in one go.
It is not necessary for shorter videos.

In [42]:
def chunk_audio(filename, segment_length: int, output_dir, output_format=".mp3"):
    """Segment the audio into chunks of specified length (in seconds) and save them.

    Args:
        filename (str): Path to the audio file.
        segment_length (int): Length of each segment in seconds.
        output_dir (str): Directory to save the audio chunks.
        output_format (str): File format for the output audio chunks (default: .mp3).

    Returns:
        list: Sorted list of paths to the chunked audio files.
    """
    if segment_length <= 0:
        raise ValueError("Segment length must be greater than 0.")

    if not os.path.exists(filename):
        raise FileNotFoundError(f"Audio file not found: {filename}")

    print(f"Chunking audio into {segment_length}-second segments...")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Load the audio file
    audio, sr = librosa.load(filename, sr=44100)

    # Calculate duration and number of segments
    duration = librosa.get_duration(y=audio, sr=sr)
    num_segments = int(duration / segment_length) + 1

    print(f"Total duration: {duration:.2f} seconds. Creating {num_segments} chunks...")

    # Segment the audio and save
    for i in range(num_segments):
        start = int(i * segment_length * sr)
        end = int(min((i + 1) * segment_length * sr, len(audio)))
        segment = audio[start:end]
        segment_file = os.path.join(output_dir, f"segment_{i}{output_format}")
        sf.write(segment_file, segment, sr)
        print(f"Saved segment {i + 1}/{num_segments} to {segment_file}")

    chunked_audio_files = find_audio_files(output_dir, extension=output_format)
    return sorted(chunked_audio_files)


## Speech2text 🗣

Here we use OpenAI's whisper model to transcribe audio files to text.

In [43]:
import whisper

def transcribe_audio(audio_files: list, output_file=None, model_size="base") -> list:
    """Transcribe audio files into text using the local Whisper model.

    Args:
        audio_files (list): List of paths to audio files.
        output_file (str): Optional file to save all transcripts.
        model_size (str): The size of the Whisper model to use (e.g., "tiny", "base", "small", "medium", "large").

    Returns:
        list: A list of transcripts for each audio file.
    """
    print("Loading Whisper model...")
    model = whisper.load_model(model_size)

    transcripts = []
    for audio_file in audio_files:
        print(f"Processing file: {audio_file}")
        # Perform transcription
        result = model.transcribe(audio_file)
        transcript = result["text"]
        transcripts.append(transcript)

    if output_file is not None:
        # Save all transcripts to a .txt file
        with open(output_file, "w") as file:
            for transcript in transcripts:
                file.write(transcript + "\n")

    return transcripts


## Summarize 📝

Here we ask chatGPT to take the raw transcripts and transcribe them for us to short bullet points.

In [58]:
# from google.generativeai import genai

def summarize(chunks, system_prompt, model='google-gemini', output_file=None):
    summaries = []

    # Combine chunks into one text input
    full_input = system_prompt + "\n".join(chunks)

    # Generate summary using Gemini model (or similar API call)
    try:
        response = genai.generate_text(prompt=full_input, max_output_tokens=150)
        summary = response.result  # This will be the generated summary
        summaries.append(summary)
    except Exception as e:
        print(f"Error during summary generation: {e}")

    # Optionally write to output file
    if output_file:
        with open(output_file, 'w') as f:
            f.write("\n".join(summaries))

    return summaries


## Putting it all together 🍱

In [59]:
def summarize_youtube_video(youtube_url, outputs_dir):
    raw_audio_dir = f"{outputs_dir}/raw_audio/"
    chunks_dir = f"{outputs_dir}/chunks"
    transcripts_file = f"{outputs_dir}/transcripts.txt"
    summary_file = f"{outputs_dir}/summary.txt"
    segment_length = 10 * 60  # chunk to 10 minute segments

    if os.path.exists(outputs_dir):
        # delete the outputs_dir folder and start from scratch
        shutil.rmtree(outputs_dir)
        os.mkdir(outputs_dir)

    # Download the video using youtube-dl
    audio_filename = youtube_to_mp3(youtube_url, output_dir=raw_audio_dir)

    # Chunk each audio file into shorter segments
    chunked_audio_files = chunk_audio(
        audio_filename, segment_length=segment_length, output_dir=chunks_dir
    )

    # Transcribe each chunked audio file using Whisper speech-to-text
    transcriptions = transcribe_audio(chunked_audio_files, transcripts_file)

    # Summarize each transcription using Google Gemini
    system_prompt = """
    You are a helpful assistant that summarizes YouTube videos.
    You are provided chunks of raw audio that were transcribed from the video's audio.
    Summarize the current chunk into succinct and clear bullet points of its contents.
    """
    summaries = summarize(
        transcriptions, system_prompt=system_prompt, output_file=summary_file
    )

    system_prompt_tldr = """
    You are a helpful assistant that summarizes YouTube videos.
    Someone has already summarized the video into key points.
    Summarize the key points into one or two sentences that capture the essence of the video.
    """
    # Combine all summaries into a long summary
    long_summary = "\n".join(summaries)

    # Summarize the long summary to get the final short summary
    short_summary = summarize(
        [long_summary], system_prompt=system_prompt_tldr, output_file=summary_file
    )[0]

    return long_summary, short_summary


In [60]:
# API Key for Google Gemini
GOOGLE_API_KEY = "AIzaSyBasyDzmOWkXp7PZ8N-PedYZoqFwmOW5tY"
genai.configure(api_key=GOOGLE_API_KEY)

# Define other necessary functions (find_audio_files, youtube_to_mp3, chunk_audio, transcribe_audio, summarize) before this
# ...

def summarize_youtube_video(youtube_url, outputs_dir):
    raw_audio_dir = f"{outputs_dir}/raw_audio/"
    chunks_dir = f"{outputs_dir}/chunks"
    transcripts_file = f"{outputs_dir}/transcripts.txt"
    summary_file = f"{outputs_dir}/summary.txt"
    segment_length = 10 * 60  # chunk to 10 minute segments

    if os.path.exists(outputs_dir):
        shutil.rmtree(outputs_dir)  # Delete old files/folder if exists
        os.mkdir(outputs_dir)

    # Download the video using yt-dlp
    audio_filename = youtube_to_mp3(youtube_url, output_dir=raw_audio_dir)

    # Chunk the audio file
    chunked_audio_files = chunk_audio(audio_filename, segment_length=segment_length, output_dir=chunks_dir)

    # Transcribe audio files to text
    transcriptions = transcribe_audio(chunked_audio_files, transcripts_file)

    # Summarize transcriptions
    system_prompt = """
    You are a helpful assistant that summarizes youtube videos.
    You are provided chunks of raw audio that were transcribed from the video's audio.
    Summarize the current chunk to succinct and clear bullet points of its contents.
    """
    summaries = summarize(transcriptions, system_prompt=system_prompt, output_file=summary_file)

    # Create TL;DR summary
    system_prompt_tldr = """
    You are a helpful assistant that summarizes youtube videos.
    Someone has already summarized the video to key points.
    Summarize the key points to one or two sentences that capture the essence of the video.
    """
    long_summary = "\n".join(summaries)
    short_summary = summarize([long_summary], system_prompt=system_prompt_tldr, output_file=summary_file)[0]

    return long_summary, short_summary


# Example use
youtube_url = "https://www.youtube.com/watch?v=g1pb2aK2we4"
outputs_dir = "outputs/"

long_summary, short_summary = summarize_youtube_video(youtube_url, outputs_dir)

# Print the summaries
print("Summaries:")
print("=" * 80)
print("Long summary:")
print("=" * 80)
print(long_summary)
print()

print("=" * 80)
print("Video - TL;DR")
print("=" * 80)
print(short_summary)

[debug] Encodings: locale UTF-8, fs utf-8, pref UTF-8, out UTF-8 (No ANSI), error UTF-8 (No ANSI), screen UTF-8 (No ANSI)
[debug] yt-dlp version stable@2024.12.23 from yt-dlp/yt-dlp [65cf46cdd] (pip) API
[debug] params: {'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}], 'outtmpl': 'outputs//raw_audio/%(title)s.%(ext)s', 'verbose': True, 'compat_opts': set(), 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Sec-Fetch-Mode': 'navigate'}}
[debug] Python 3.10.12 (CPython x86_64 64bit) - Linux-6.1.85+-x86_64-with-glibc2.35 (OpenSSL 3.0.2 15 Mar 2022, glibc 2.35)
[debug] exe versions: ffmpeg 4.4.2 (setts), ffprobe 4.4.2
[debug] Optional libraries: certifi-2024.12.14, requests-2.32.3, secretstorage-3.

Downloading video from https://www.youtube.com/watch?v=g1pb2aK2we4
[youtube] Extracting URL: https://www.youtube.com/watch?v=g1pb2aK2we4
[youtube] g1pb2aK2we4: Downloading webpage
[youtube] g1pb2aK2we4: Downloading ios player API JSON
[youtube] g1pb2aK2we4: Downloading mweb player API JSON


[debug] [youtube] g1pb2aK2we4: ios client https formats require a PO Token which was not provided. They will be skipped as they may yield HTTP Error 403. You can manually pass a PO Token for this client with --extractor-args "youtube:po_token=ios+XXX. For more information, refer to  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#po-token-guide . To enable these broken formats anyway, pass --extractor-args "youtube:formats=missing_pot"
[debug] Loading youtube-nsig.03dbdfab from cache
[debug] [youtube] Decrypted nsig pdXWIpxBD6Gj8NuX5 => -BVKB_bikV8Edw
[debug] Loading youtube-nsig.03dbdfab from cache
[debug] [youtube] Decrypted nsig uNU0SXiYdo8c662Y3 => sl-Fq8v367TI9w


[youtube] g1pb2aK2we4: Downloading m3u8 information


[debug] Sort order given by extractor: quality, res, fps, hdr:12, source, vcodec, channels, acodec, lang, proto
[debug] Formats sorted by: hasvid, ie_pref, quality, res, fps, hdr:12(7), source, vcodec, channels, acodec, lang, proto, size, br, asr, vext, aext, hasaud, id


[info] g1pb2aK2we4: Downloading 1 format(s): 251


[debug] Invoking http downloader on "https://rr2---sn-npoe7ns6.googlevideo.com/videoplayback?expire=1735923703&ei=l8N3Z7mOO-PVssUP29X_4AM&ip=34.142.152.38&id=o-ANdAOr3qZrNXv2cF06mosrweZA6gSVaV8rUO898K83FS&itag=251&source=youtube&requiressl=yes&xpc=EgVo2aDSNQ%3D%3D&met=1735902103%2C&mh=JU&mm=31%2C26&mn=sn-npoe7ns6%2Csn-30a7rnek&ms=au%2Conr&mv=m&mvi=2&pl=20&rms=au%2Cau&initcwndbps=1821250&bui=AfMhrI88spFZRTdmrJlTA5jyTA1xGSMEOXjJI4ykzmmwCVi8P9RUYoG1K2_H12gvTdl2mTMf-LB34H8V&vprv=1&svpuc=1&mime=audio%2Fwebm&ns=dZelpeMmashvD8rznYN-s4gQ&rqh=1&gir=yes&clen=5840382&dur=302.621&lmt=1727819059976703&mt=1735901818&fvip=5&keepalive=yes&fexp=51326932%2C51331020%2C51335594%2C51371294&c=MWEB&sefc=1&txp=4532434&n=sl-Fq8v367TI9w&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cxpc%2Cbui%2Cvprv%2Csvpuc%2Cmime%2Cns%2Crqh%2Cgir%2Cclen%2Cdur%2Clmt&sig=AJfQdSswRgIhALvbP69rC40tw2Zhy_hX08WB7yi9vt9XZPgocYRsqMsTAiEA0XW9sFFfh3goPO6ggBNg3DUt4Cv1r2W1gCOMkEo5JOY%3D&lsparams=met%2Cmh%2Cmm%2Cmn%2Cms%2Cmv%2

[download] Destination: outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.webm
[download] 100% of    5.57MiB in 00:00:00 at 23.65MiB/s  


[debug] ffmpeg command line: ffprobe -show_streams 'file:outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.webm'


[ExtractAudio] Destination: outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.mp3


[debug] ffmpeg command line: ffmpeg -y -loglevel repeat+info -i 'file:outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.webm' -vn -acodec libmp3lame -b:a 192.0k -movflags +faststart 'file:outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.mp3'


Deleting original file outputs//raw_audio/How stretching actually changes your muscles - Malachy McHugh.webm (pass -k to keep)
Chunking audio into 600-second segments...
Total duration: 302.60 seconds. Creating 1 chunks...
Saved segment 1/1 to outputs//chunks/segment_0.mp3
Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)


Processing file: outputs//chunks/segment_0.mp3
Error during summary generation: module 'google.generativeai' has no attribute 'generate_text'
Error during summary generation: module 'google.generativeai' has no attribute 'generate_text'


IndexError: list index out of range

In [None]:
youtube_url = "https://www.youtube.com/watch?v=Yf1o0TQzry8"
outputs_dir = "outputs/"

long_summary, short_summary = summarize_youtube_video(youtube_url, outputs_dir)

print("Summaries:")
print("=" * 80)
print("Long summary:")
print("=" * 80)
print(long_summary)
print()

print("=" * 80)
print("Video - TL;DR")
print("=" * 80)
print(short_summary)