In [None]:
# YouTube Transcript Processing with LLMs

## Install Dependencies

!uv add youtube-transcript-api

In [None]:
## Import Libraries

import re
from openai import OpenAI
import requests
import json
from minsearch import Index, VectorSearch
import pickle 
from youtube_transcript_api import YouTubeTranscriptApi
from typing import Any, Dict, List, Optional

client = OpenAI()

In [None]:
## Load Pre-downloaded YouTube Transcript

# ID of the video processed in this notebook; it also determines the name of
# the cache file opened below.
video_id = 'ph1PxZIkz1o'

# Load the cached transcript (pre-fetched to avoid API calls)
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only open a .bin file that you created yourself or otherwise trust.
# NOTE(review): the path is relative to the current working directory and the
# file must already exist; otherwise open() raises FileNotFoundError.
with open(f'{video_id}.bin', 'rb') as f_in:
    # Later cells iterate over this object and read `.start` and `.text` from
    # each entry.
    transcript = pickle.load(f_in)

In [None]:
## Format Transcript as Subtitles

def format_timestamp(seconds: float) -> str:
    """Format an offset in seconds as a compact clock string.

    The fractional part is discarded (truncated toward zero). Offsets of one
    hour or more are rendered as ``H:MM:SS``; shorter ones as ``M:SS``. Only
    the leading field is left unpadded, so 65 seconds becomes ``"1:05"``.

    Negative offsets are rendered with a leading ``-`` (for example
    ``"-0:05"``) instead of wrapping around into unrelated minute and second
    values.

    Args:
        seconds: Offset in seconds.

    Returns:
        The formatted timestamp.
    """
    total_seconds = int(seconds)

    # Split off the sign before dividing: divmod floors, so applying it to a
    # negative value borrows from the hours field and yields "59:55" for -5.
    sign = "-" if total_seconds < 0 else ""
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{sign}{hours}:{minutes:02}:{secs:02}"
    return f"{sign}{minutes}:{secs:02}"
    
def make_subtitles(transcript) -> str:
    """Render a transcript as one ``<timestamp> <text>`` line per entry.

    Each entry is read through its ``start`` and ``text`` attributes. The
    start offset is formatted with ``format_timestamp`` and any newline inside
    an entry's text is replaced by a space, so every entry occupies exactly one
    output line.

    Args:
        transcript: Iterable of entries exposing ``start`` and ``text``.

    Returns:
        The rendered lines joined with newlines (empty string for no entries).
    """
    return '\n'.join(
        format_timestamp(snippet.start) + ' ' + snippet.text.replace('\n', ' ')
        for snippet in transcript
    )

In [None]:
# Preview the formatted subtitles
subtitles = make_subtitles(transcript)
print(subtitles[:500])

In [None]:
## Define Instructions for Summarization

# System prompt for the free-text summarization call made through `llm`.
# It asks the model to wrap its whole answer in <OUTPUT>...</OUTPUT>; a later
# cell removes that wrapper with strip_matching_outer_html_tags.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

Output format: 

<OUTPUT>
Summary

timestamp chapter 
timestamp chapter
...
timestamp chapter
</OUTPUT>
"""

In [None]:
## Helper Function for LLM Calls

def llm(
    user_prompt: str,
    *,
    client: OpenAI,
    instructions: Optional[str] = None,
    model: str = "gpt-4o-mini",
) -> str:
    """Send a single-turn prompt through the OpenAI Responses API.

    Args:
        user_prompt: Text placed in the ``user`` message.
        client: Client whose ``responses.create`` method performs the request.
        instructions: Optional system prompt. When falsy (``None`` or an empty
            string) no ``system`` message is sent.
        model: Model name forwarded to the API.

    Returns:
        The ``output_text`` attribute of the response.
    """
    system_turn = [{"role": "system", "content": instructions}] if instructions else []
    conversation = [*system_turn, {"role": "user", "content": user_prompt}]

    reply = client.responses.create(model=model, input=conversation)
    return reply.output_text

In [None]:
## Generate Summary (Unstructured Output)

answer = llm(subtitles, client=client, instructions=instructions)
print(answer)

In [None]:
# Helper to strip outer XML tags
def strip_matching_outer_html_tags(text: str) -> str:
    """Remove one enclosing ``<tag ...>...</tag>`` pair from ``text`` if present.

    The text (ignoring surrounding whitespace) must start with an opening tag
    and end with a closing tag of the same name; the opening tag may carry
    attributes. When it does, the content between the two tags is returned
    with surrounding whitespace removed. Otherwise the input is returned with
    only its surrounding whitespace removed.

    The check is purely textual and does not verify nesting, so
    ``"<a>x</a><a>y</a>"`` is treated as wrapped and yields ``"x</a><a>y"``.

    Args:
        text: Text that may be wrapped in a single pair of tags.

    Returns:
        The unwrapped (or unchanged) text, stripped of outer whitespace.
    """
    wrapped = re.compile(r"^\s*<(\w+)[^>]*>\s*(.*?)\s*</\1>\s*$", re.DOTALL).match(text)
    inner = text if wrapped is None else wrapped.group(2)
    return inner.strip()

answer = strip_matching_outer_html_tags(answer)

In [None]:
## Define Pydantic Models for Structured Outputs

from pydantic import BaseModel

# One chapter marker of the structured summary.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model's docstring into its JSON schema description, which would change the
# schema generated for the structured-output request.
class Chapter(BaseModel):
    # Start time of the chapter as text; its format is not validated here.
    timestamp: str
    # Chapter heading.
    title: str

# Target schema for the structured summarization calls in this notebook; it is
# the type passed as `text_format` to `client.responses.parse`.
# Kept docstring-free so the generated JSON schema is not altered.
class YTSummaryResponse(BaseModel):
    # Free-text summary of the video.
    summary: str
    # Chapter markers, in whatever order the model returns them.
    chapters: list[Chapter]

In [None]:
## Generate Structured Summary with Pydantic

# NOTE: this rebinds the module-level `instructions` used by the free-text
# summary earlier in the notebook; re-running that earlier call after this cell
# would send this prompt instead. Unlike the earlier prompt it has no <OUTPUT>
# format section; the response shape is given by `text_format` below.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

More chapters is better than fewer chapters. Have a chapter at least every 3-5 minutes
""".strip()

# Same system/user message layout that the `llm` helper builds.
messages = [
    {"role": "system", "content": instructions}, 
    {"role": "user", "content": subtitles}
]

# `responses.parse` receives the pydantic model through `text_format`; the
# parsed instance is read from `response` in the following cell.
response = client.responses.parse(
    model='gpt-4o-mini',
    input=messages,
    text_format=YTSummaryResponse
)

In [None]:
## Display the Structured Results

# `output_parsed` is the SDK accessor for the parsed `text_format` instance.
# It looks up the message text among the output items, so it does not rely on
# the message being the first item the way `response.output[0].content[0]`
# does.
summary = response.output_parsed

print(summary.summary)
print()
for c in summary.chapters:
    print(c.timestamp, c.title)

In [None]:
## Create Reusable Structured LLM Function

def llm_structured(instructions, user_prompt, output_format, client, model="gpt-4o-mini"):
    """Call the OpenAI Responses API and return the reply parsed into a model.

    Args:
        instructions: System prompt, sent as the ``system`` message.
        user_prompt: Content of the ``user`` message.
        output_format: Type passed as ``text_format`` to
            ``client.responses.parse`` (this notebook passes a pydantic model).
        client: Object exposing ``responses.parse``, e.g. an ``OpenAI`` client.
        model: Model name forwarded to the API.

    Returns:
        The parsed ``output_format`` instance taken from the response.

    Raises:
        ValueError: If the response carries no parsed output (for example when
            the model declines to answer).
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt}
    ]

    response = client.responses.parse(
        model=model,
        input=messages,
        text_format=output_format
    )

    # Use the SDK accessor instead of output[0].content[0]: the first output
    # item is not guaranteed to be the message (some models emit other items,
    # such as reasoning, before it), and a refusal has no `.parsed` field.
    parsed = response.output_parsed
    if parsed is None:
        raise ValueError("The response did not contain parsed structured output")
    return parsed

In [None]:
# Use the helper function
# This issues a new API request with the current `instructions` and the
# transcript text, and rebinds `summary` to the freshly parsed result.
summary = llm_structured(
    instructions=instructions,
    user_prompt=subtitles,
    client=client,
    output_format=YTSummaryResponse
)

In [None]:
# View the structured result
summary