<a href="https://colab.research.google.com/github/jessica1105ctrl/Youtube-Summarizer/blob/main/YoutubeSummarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ================== 📦 1. Install once per runtime ==================
!pip install --quiet youtube-transcript-api transformers

# ================== 🧑‍💻 2. Imports & model load ==================
import re
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline

# Build the summarization pipeline once at module level; summarize_long_text()
# calls this same object for every chunk, so the model is not reloaded per call.
# NOTE(review): device_map="auto" presumably relies on the `accelerate` package
# being installed in the runtime — confirm, since the pip line above does not
# install it explicitly.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",   # swap for a lighter model if needed
    device_map="auto"                  # GPU in Colab, CPU elsewhere
)

# ================== 🔍 3. Helper functions ==================
def extract_video_id(url:str) -> str|None:
    """
    Works for long and short YouTube URLs.
    """
    match = re.search(r"(?:v=|youtu\.be/)([\w\-]{11})", url)
    return match.group(1) if match else None


def fetch_transcript_text(video_id:str, lang_priority:list[str]|None=None) -> str:
    """
    Return the transcript of *video_id* as one space-joined string.

    Uses the v1.x instance API of youtube-transcript-api:
       ytt_api = YouTubeTranscriptApi()
       ytt_api.list(video_id).find_transcript(langs).fetch().to_raw_data()

    Args:
        video_id: The YouTube video ID to look up.
        lang_priority: Language codes to try, in order of preference.
            Defaults to ``["en", "en-US"]`` when omitted or None.

    Raises:
        RuntimeError: If listing, selecting or fetching the transcript fails
            for any reason. The original exception is attached as
            ``__cause__`` so the underlying failure remains inspectable.
    """
    # Build the default per call: a list literal in the signature would be a
    # single object shared (and mutable) across every invocation.
    languages = ["en", "en-US"] if lang_priority is None else lang_priority

    ytt_api = YouTubeTranscriptApi()
    try:
        # pick the first matching language transcript
        transcript_obj = ytt_api.list(video_id).find_transcript(languages)
        raw_segments   = transcript_obj.fetch().to_raw_data()
    except Exception as e:
        # Collapse library-specific errors into one type for callers, but keep
        # the chain so tracebacks still show the root cause.
        raise RuntimeError(f"Transcript fetch failed: {e}") from e

    return " ".join(seg["text"] for seg in raw_segments)


def summarize_long_text(text:str,
                        chunk_chars:int=4000,
                        max_len:int=150,
                        min_len:int=30) -> str:
    """
    Summarise *text* of arbitrary length with the module-level ``summarizer``.

    The text is cut into consecutive slices of ``chunk_chars`` characters and
    each slice is summarised on its own. If more than one slice was needed,
    the partial summaries are joined with spaces and summarised once more.

    Args:
        text: The text to summarise.
        chunk_chars: Maximum number of characters per slice; must be positive.
        max_len: ``max_length`` forwarded to the summarizer.
        min_len: ``min_length`` forwarded to the summarizer.

    Returns:
        The summary string, or ``""`` when *text* is empty or whitespace-only
        (the model is not invoked in that case).

    Raises:
        ValueError: If ``chunk_chars`` is not positive.
    """
    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be a positive integer, got {chunk_chars}")

    # Nothing to summarise: avoid feeding an empty prompt to the model.
    if not text.strip():
        return ""

    def _summarize(piece:str) -> str:
        # truncation=True asks the pipeline to clip inputs that exceed the
        # model's maximum input length instead of failing on them; inputs
        # within the limit are unaffected.
        return summarizer(
            piece,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]

    summaries = [
        _summarize(text[start : start + chunk_chars])
        for start in range(0, len(text), chunk_chars)
    ]

    # A single chunk is already the final answer.
    if len(summaries) == 1:
        return summaries[0]

    # If we produced multiple partial summaries, compress them once more.
    return _summarize(" ".join(summaries))


def summarize_youtube(url:str) -> str:
    """
    Turn a YouTube link into a summary string.

    Never raises for the two expected failure modes; instead a message
    prefixed with "❌" is returned when the link contains no recognisable
    video ID or when fetch_transcript_text() raises RuntimeError.
    """
    if not (vid := extract_video_id(url)):
        return "❌ Could not extract a valid YouTube video ID from that link."

    try:
        transcript = fetch_transcript_text(vid)
    except RuntimeError as failure:
        return f"❌ {failure}"
    else:
        # Only reached when the transcript was fetched successfully.
        return summarize_long_text(transcript)

# ================== 🚀 4. Example usage ==================
if __name__ == "__main__":
    # Prompt for the link, print the header, then run the (slow) summarisation
    # so the user sees feedback before the model starts working.
    link = input("Paste a YouTube link: ").strip()
    print("\n📝 Summary:\n")
    print(summarize_youtube(link))

Device set to use cpu


Paste a YouTube link: https://www.youtube.com/watch?v=6uUblznfrsk

📝 Summary:

Mahendra and Mahendra was founded by Two Brothers around the time of Independence. Winston Churchill did not expect India to survive or to mold itself into a viable country. Three decades later here we are a multi-billion dollar group still chugging along.
