## Imports 

In [8]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import re

## Transcription

In [9]:
#Extracts video ID from different YouTube URL formats.

def extract_video_id(url):
    """Return the 11-character YouTube video ID found in ``url``, or ``None``.

    Recognised forms:
      * a ``v=`` query parameter (``.../watch?v=<id>``)
      * ``youtu.be/<id>`` short links
      * ``/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>`` paths

    Args:
        url: Text to search. Anything that is not a string yields ``None``
            instead of raising.

    Returns:
        The video ID string, or ``None`` when no supported pattern is present.
    """
    # re.search raises TypeError on non-string input; report "no ID" instead.
    if not isinstance(url, str):
        return None

    # A known prefix followed by exactly 11 ID characters (letters, digits, '_' or '-').
    match = re.search(
        r"(?:v=|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})",
        url,
    )
    return match.group(1) if match else None

In [10]:
#Fetch transcript

def get_transcript(video_id, languages=("en",)):
    """Fetch a video's transcript and return it as one space-joined string.

    Failures are never raised to the caller: every error path returns a
    message that starts with ``"Error: "``.

    Args:
        video_id: The YouTube video ID. A falsy value (``None`` or ``""``)
            is rejected up front without calling the API.
        languages: Language codes forwarded to ``YouTubeTranscriptApi.fetch``.
            The default ``("en",)`` is intended to match the library's own
            default, so existing callers see no change.

    Returns:
        The text of all transcript snippets joined with single spaces, or an
        ``"Error: ..."`` message describing what went wrong.
    """
    # A missing ID would otherwise surface as an opaque library error.
    if not video_id:
        return "Error: No video ID provided."

    try:
        api = YouTubeTranscriptApi()
        # fetch() yields snippet objects; each one exposes its text via `.text`.
        transcript = api.fetch(video_id, languages=languages)
        return " ".join(snippet.text for snippet in transcript)

    except TranscriptsDisabled:
        return "Error: Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "Error: No transcript found for this video."
    except Exception as e:
        # Deliberate catch-all: any other failure is reported as an error string.
        return f"Error: {str(e)}"

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face checkpoint used for summarization.
model_name = "google/flan-t5-base"

# Tokenizer: turns text into the token IDs the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Seq2seq model: load the weights, then place them on the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

Loading weights: 100%|██████████| 282/282 [00:00<00:00, 492.99it/s, Materializing param=shared.weight]                                                       


In [None]:
def summarize_chunk(text_chunk, max_new_tokens=120, num_beams=4, max_input_tokens=1024):
    """Summarize one chunk of text with the module-level seq2seq model.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Args:
        text_chunk: The text to summarize. ``None`` or empty/whitespace-only
            text returns ``""`` without calling the model.
        max_new_tokens: Upper bound on the number of generated summary tokens.
        num_beams: Beam-search width passed to ``model.generate``.
        max_input_tokens: The prompt is truncated to this many tokens by the
            tokenizer.

    Returns:
        The decoded summary string, with special tokens stripped.
    """
    # Nothing to summarize: avoid prompting the model with the bare instruction.
    if text_chunk is None or not str(text_chunk).strip():
        return ""

    # Instruction-style prompt placed before the text.
    prompt = f"Summarize the following text clearly:\n{text_chunk}"

    # Tokenize (truncating over-long input) and move the tensors to the model's device.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)

    # Beam-search decoding of the summary.
    summary_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        length_penalty=1.0,
        early_stopping=True,
    )

    # generate() returns one sequence per input; only one prompt was passed in.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)