## Imports 

In [8]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import re

## Transcription

In [9]:
# Extract the 11-character video ID from a YouTube URL.

def extract_video_id(url):
    """Return the 11-character video ID embedded in a YouTube URL, or None.

    Recognised forms:
      * watch URLs carrying a ``v=`` query parameter
      * ``youtu.be/<id>`` short links
      * ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>`` paths

    Args:
        url: The URL text to inspect. Anything that is not a string yields None.

    Returns:
        The video ID as a string, or None when no supported pattern is present.
    """
    if not isinstance(url, str):
        return None

    # "/embed/videoseries?list=..." is the playlist embed form; "videoseries"
    # happens to be exactly 11 ID-legal characters, so it is excluded explicitly.
    match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/live/|/embed/(?!videoseries))([a-zA-Z0-9_-]{11})",
        url,
    )
    return match.group(1) if match else None

In [10]:
# Fetch the transcript for a video ID and return it as one string.

def get_transcript(video_id):
    """Fetch the transcript for ``video_id`` and flatten it into one string.

    Returns:
        The ``text`` of every fetched transcript entry, joined by single
        spaces. Failures are not raised: each is reported as a string that
        begins with "Error: ".
    """
    try:
        entries = YouTubeTranscriptApi().fetch(video_id)
        # Collapse the individual entries into one continuous piece of text.
        return " ".join(entry.text for entry in entries)
    except TranscriptsDisabled:
        message = "Error: Transcripts are disabled for this video."
    except NoTranscriptFound:
        message = "Error: No transcript found for this video."
    except Exception as exc:
        # Any other failure is reported with the exception's own message.
        message = f"Error: {exc!s}"
    return message

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "google/flan-t5-base"

# Tokenizer for the checkpoint (turns prompt text into model inputs).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Seq2seq model for the same checkpoint, placed on the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

Loading weights: 100%|██████████| 282/282 [00:00<00:00, 492.99it/s, Materializing param=shared.weight]


In [None]:
def summarize_chunk(text_chunk, max_input_tokens=1024, max_new_tokens=120, num_beams=4):
    """Summarize one chunk of text with the module-level tokenizer and model.

    Args:
        text_chunk: The text to summarize.
        max_input_tokens: Token limit passed to the tokenizer; a longer prompt
            is truncated. The instruction comes first in the prompt, so it is
            the tail of the chunk that gets cut.
        max_new_tokens: Upper bound on the number of generated summary tokens.
        num_beams: Number of beams used by beam search during generation.

    Returns:
        The decoded summary as a string, with special tokens removed.
    """
    # The instruction is prepended so the model knows which task to perform.
    prompt = f"Summarize the following text clearly:\n{text_chunk}"

    # Encode the prompt as PyTorch tensors on the same device as the model.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)

    summary_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        length_penalty=1.0,  # same value the notebook has always used
        early_stopping=True,
    )

    # generate() returns one sequence per input; exactly one prompt was encoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [17]:
def _pack_units(units, limit):
    """Greedily join units with single spaces into strings of at most ``limit`` characters.

    Every unit is expected to be no longer than ``limit`` already.
    """
    packed, current = [], ""
    for unit in units:
        candidate = f"{current} {unit}" if current else unit
        if len(candidate) <= limit:
            current = candidate
        else:
            packed.append(current)
            current = unit
    if current:
        packed.append(current)
    return packed


def _split_oversized(sentence, limit):
    """Break a sentence longer than ``limit`` into pieces that fit, at word boundaries where possible."""
    words = []
    for word in sentence.split():
        # A single word longer than the limit can only be cut mid-word.
        words.extend(word[start:start + limit] for start in range(0, len(word), limit))
    return _pack_units(words, limit)


def chunk_text(text, chunk_size=1200):
    """Split text into chunks of at most ``chunk_size`` characters.

    Sentences (delimited by ". ") are kept whole whenever they fit. A sentence
    longer than ``chunk_size`` -- common for captions without punctuation -- is
    broken at word boundaries instead of being emitted as one oversized chunk.

    Args:
        text: The text to split. None, empty or whitespace-only text gives [].
        chunk_size: Maximum length of each chunk, in characters.

    Returns:
        A list of non-empty strings, each no longer than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not text or not text.strip():
        return []

    # str.split consumes the ". " separator, so the period is given back to
    # every piece except the last, which keeps whatever ending the text had.
    *leading, last = text.strip().split(". ")
    sentences = [piece.strip() + "." for piece in leading if piece.strip()]
    if last.strip():
        sentences.append(last.strip())

    units = []
    for sentence in sentences:
        if len(sentence) <= chunk_size:
            units.append(sentence)
        else:
            units.extend(_split_oversized(sentence, chunk_size))

    return _pack_units(units, chunk_size)

In [18]:
def generate_video_notes(video_url):
    """Print bullet-point notes for the YouTube video at ``video_url``.

    The video ID is extracted from the URL, the transcript is fetched and
    split into chunks, and each chunk is summarized into one bullet. Progress
    messages and the final notes are written with print().

    Args:
        video_url: URL of the video to process.

    Returns:
        None. The function returns early, after printing the reason, when the
        URL is not recognised, the transcript cannot be fetched, or the
        transcript is blank.
    """
    print(f"\n🎬 Processing video: {video_url}")

    video_id = extract_video_id(video_url)
    if not video_id:
        print("Invalid YouTube URL.")
        return

    print("🎧 Fetching transcript...")
    transcript = get_transcript(video_id)

    # get_transcript reports failures in-band as "Error: <reason>". Matching
    # the whole prefix keeps a genuine transcript that merely begins with the
    # word "Error" from being mistaken for a failure.
    if transcript.startswith("Error: "):
        print(transcript)
        return

    if not transcript.strip():
        print("Transcript is empty; nothing to summarize.")
        return

    print("🔪 Chunking transcript...")
    # Drop blank chunks so the model is never asked to summarize nothing.
    chunks = [chunk for chunk in chunk_text(transcript) if chunk.strip()]
    print(f"   -> {len(chunks)} chunks created.")

    print("🧠 Generating AI notes...")
    notes = []

    # One bullet per chunk, in transcript order.
    for position, chunk in enumerate(chunks, start=1):
        print(f"   Summarizing chunk {position}/{len(chunks)}...")
        notes.append(f"- {summarize_chunk(chunk)}")

    print("\n" + "="*50)
    print("📝 AI GENERATED NOTES")
    print("="*50)
    print("\n".join(notes))


# Entry point: prompt for a video URL and print the generated notes for it.
# input() waits for the user, so this cell needs an interactive session.
if __name__ == "__main__":
    url = input("Paste YouTube URL: ")
    generate_video_notes(url)


🎬 Processing video: https://youtu.be/yIYKR4sgzI8?si=psJL0SNbw8rU8014
🎧 Fetching transcript...
🔪 Chunking transcript...
   -> 6 chunks created.
🧠 Generating AI notes...
   Summarizing chunk 1/6...
   Summarizing chunk 2/6...
   Summarizing chunk 3/6...
   Summarizing chunk 4/6...
   Summarizing chunk 5/6...
   Summarizing chunk 6/6...

📝 AI GENERATED NOTES
- Stat Quest Hello, I'm Josh Stormer and welcome to Stat Quest today we're going to talk about logistic regression. This is a technique that can be used for traditional statistics
- We talked a little bit about linear regression in Another stat quest, We had some data Weight and size then, we fit a line to it and With that line, we could do a lot
- Know that it refers to different types of mice. Know that logistic regression is similar to linear regression except logistic regression fits an s-shaped logistic function
- Regression can work with continuous data, like weight and age and discrete data like genotype and ast