In [68]:
from openai import OpenAI

# Notebook-level client used by the `llm` and `llm_structured` helpers.
# No credentials are passed explicitly in this call.
openai_client = OpenAI()

In [69]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a single user prompt through ``openai_client`` and return the reply text.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional content for a leading system message. When it
            is falsy (``None`` or an empty string) no system message is sent.
        model: Model name forwarded to ``responses.create``.

    Returns:
        The ``output_text`` attribute of the response object.
    """
    system_part = (
        [{"role": "system", "content": instructions}] if instructions else []
    )
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [71]:
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Fetch the transcript for a single video id via YouTubeTranscriptApi.
# NOTE(review): this cell has no execution count in the saved notebook, and a
# later cell rebinds `transcript` from a local pickle file instead.
video_id = 'ph1PxZIkz1o'
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id)

In [75]:
import pickle

In [76]:
# Rebind `transcript` from a pickle file named after `video_id`.
# NOTE(review): nothing in the visible cells writes this file, so it has to
# exist on disk already. pickle.load can run arbitrary code while
# deserialising, so only load files you produced yourself.
with open(f'{video_id}.bin', 'rb') as f_in:
    transcript = pickle.load(f_in)

In [77]:
# Preview the first ten transcript entries (each has text, start and duration).
transcript[:10]

[FetchedTranscriptSnippet(text='So hi everyone. Uh today we are going to', start=0.0, duration=5.04),
 FetchedTranscriptSnippet(text='talk about our upcoming course. The', start=2.96, duration=3.52),
 FetchedTranscriptSnippet(text='upcoming course is called machine', start=5.04, duration=5.92),
 FetchedTranscriptSnippet(text='learning zoom camp. And um this is', start=6.48, duration=5.92),
 FetchedTranscriptSnippet(text='already I put the link in the', start=10.96, duration=3.599),
 FetchedTranscriptSnippet(text="description. So if you're watching um", start=12.4, duration=4.719),
 FetchedTranscriptSnippet(text="this video in recording or you're", start=14.559, duration=4.88),
 FetchedTranscriptSnippet(text='watching it live, you go here in the', start=17.119, duration=4.561),
 FetchedTranscriptSnippet(text='description after under this video and', start=19.439, duration=5.6),
 FetchedTranscriptSnippet(text='then you see a link course. uh click on', start=21.68, duration=6.24)]

In [83]:
def format_timestamp(seconds: float) -> str:
    """Render a number of seconds as a clock-style label.

    The fractional part is dropped via ``int()``. Values of one hour or more
    are rendered as ``H:MM:SS``; anything shorter is rendered as ``M:SS``.
    """
    whole = int(seconds)
    mins_total, ss = divmod(whole, 60)
    hh, mm = divmod(mins_total, 60)

    if hh <= 0:
        return f"{mm}:{ss:02d}"
    return f"{hh}:{mm:02d}:{ss:02d}"

def make_subtitles(transcript) -> str:
    """Render transcript entries as newline-separated "timestamp text" lines.

    Each entry contributes one line: its ``start`` formatted by
    ``format_timestamp``, a space, then its ``text`` with embedded newlines
    replaced by spaces.
    """
    rendered = [
        format_timestamp(snippet.start) + ' ' + snippet.text.replace('\n', ' ')
        for snippet in transcript
    ]
    return '\n'.join(rendered)

In [84]:
# Flatten the whole transcript into one timestamped text block; this string is
# what gets sent to the model as the user prompt in later cells.
subtitles = make_subtitles(transcript)

In [85]:
# Show only the first 500 characters to check the format.
print(subtitles[:500])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub


In [91]:
# System prompt shared by the plain-text `llm` call and the structured
# `llm_structured` call in later cells.
# NOTE(review): the "Output format" section describes a free-text layout; when
# this prompt is reused with a structured output type, confirm that section is
# still wanted.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

Output format: 

<OUTPUT>
Summary

timestamp chapter 
timestamp chapter
...
timestamp chapter
</OUTPUT>

Don't include <OUTPUT> in the output
"""

In [92]:
# Free-text variant: the full subtitles are the user prompt.
# NOTE(review): `answer` is not displayed in any of the visible cells.
answer = llm(subtitles, instructions=instructions)

In [94]:
from pydantic import BaseModel

In [95]:
# One chapter entry of the structured summary. Documented with plain comments
# only, so the model definition itself stays exactly as written.
class Chapter(BaseModel):
    # Time label for the chapter, kept as text (the sample output shows e.g. "0:55").
    timestamp: str
    # Chapter heading.
    title: str

# Output type passed to `llm_structured` for the video summary: one summary
# string plus a list of Chapter entries.
class YTSummaryResponse(BaseModel):
    # Summary text for the video.
    summary: str
    # Chapter markers; printed one per line as "timestamp title" in a later cell.
    chapters: list[Chapter]


In [96]:
def llm_structured(instructions, user_prompt, output_type, model="gpt-4o-mini"):
    """Send a prompt through ``openai_client`` and return the parsed reply.

    Args:
        instructions: Content for the leading system message. When it is falsy
            (``None`` or an empty string) no system message is sent, which
            matches how ``llm`` treats its ``instructions`` argument.
        user_prompt: Content of the user message.
        output_type: Type forwarded as ``text_format`` to ``responses.parse``
            (the notebook passes a pydantic model class).
        model: Model name forwarded to the API call.

    Returns:
        The ``output_parsed`` attribute of the response object.
    """
    messages = []

    # Skip the system message rather than sending one with None/empty content.
    if instructions:
        messages.append({"role": "system", "content": instructions})

    messages.append({"role": "user", "content": user_prompt})

    response = openai_client.responses.parse(
        model=model,
        input=messages,
        text_format=output_type,
    )

    return response.output_parsed

In [98]:
# Structured variant of the summary request: same prompt and subtitles, but the
# reply is parsed into a YTSummaryResponse.
summary = llm_structured(
    instructions=instructions,
    user_prompt=subtitles,
    output_type=YTSummaryResponse
)

In [102]:
# Print the summary text, a blank line, then one "timestamp title" line per chapter.
print(summary.summary)
print()
for c in summary.chapters:
    print(c.timestamp, c.title)

The video discusses the upcoming 'Machine Learning Zoom Camp', a course designed primarily for aspiring machine learning engineers and data scientists. It provides details about course structure, prerequisites, and the skills covered. Key topics include the course's focus on machine learning engineering rather than data science, updates to existing content, and the emphasis on practical skills necessary for job readiness. Viewers are encouraged to ask questions throughout the livestream, fostering an interactive environment. Additionally, the series will feature three projects required for getting a certificate. Information on the use of external resources and tools, like GitHub Codespaces, and the significance of hands-on learning through projects are heavily emphasized.

0:00 Introduction to the course
0:55 Course registration details
1:56 Course updates and content structure
3:40 Job placement opportunities
5:15 Overview of computer vision content
6:06 Prerequisites for the course
1

## RAG

In [48]:
# Look at the first 1000 characters of the subtitles before chunking.
print(subtitles[:1000])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub page.
0:34 This GitHub page is the main entry point
0:36 to our course and um yeah I think it's
0:41 more or less self-explanatory. If you
0:43 want to sign up this is the button you
0:45 click and the actual course starts in on
0:48 September 15th. it means that it's uh
0:51 slightly less than one one month before
0:53 the course starts and the purpose of
0:55 today's um session is to just answer
0:58 your questions. So you have some
1:00 questions and uh you can ask these
1:03 questions using

In [50]:
def sliding_window(seq, size, step):
    """Slice ``seq`` into windows of up to ``size`` items taken every ``step`` items.

    Windows start at offsets 0, step, 2*step, ... and the scan stops after the
    first window that reaches the end of ``seq``, so the final window may be
    shorter than ``size``. An empty ``seq`` yields an empty list.

    Raises:
        ValueError: if ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)

    def _windows():
        for offset in range(0, total, step):
            upper = offset + size
            yield seq[offset:upper]
            # This window already touches the end; later ones would only repeat its tail.
            if upper >= total:
                return

    return list(_windows())

In [55]:
# Small ten-entry sample of the transcript.
# NOTE(review): the name `chunk` is rebound by the loop that builds `chunks`
# in a later cell.
chunk = transcript[:10]

In [57]:
def join_lines(transcript) -> str:
    """Concatenate the ``text`` of every entry into one space-separated string.

    Newlines inside an entry's text are replaced by spaces first.
    """
    return ' '.join(
        snippet.text.replace('\n', ' ') for snippet in transcript
    )


def format_chunk(chunk):
    """Summarise a run of transcript entries as a dict with 'start', 'end' and 'text'.

    'start' is the formatted start time of the first entry, 'end' is the
    formatted start time of the last entry (its duration is not added), and
    'text' is the joined text of all entries. Indexing an empty chunk raises
    IndexError.
    """
    first, last = chunk[0], chunk[-1]

    return dict(
        start=format_timestamp(first.start),
        end=format_timestamp(last.start),
        text=join_lines(chunk),
    )

In [61]:
# Build the retrieval chunks: windows of 60 transcript entries taken every 30
# entries, so consecutive windows share half of their entries.
# NOTE(review): the loop variable rebinds the notebook-level `chunk` assigned
# in an earlier cell; 60 and 30 are unexplained magic numbers.
chunks = []

for chunk in sliding_window(transcript, 60, 30):
    processed = format_chunk(chunk)
    chunks.append(processed)

Created 46 chunks


In [62]:
# Report how many chunks were built.
print(f"Created {len(chunks)} chunks")

Created 46 chunks


In [64]:
from minsearch import Index

# Index the chunk dicts with minsearch; only the 'text' key is declared as a
# text field, while 'start' and 'end' still come back with each search result.
index = Index(text_fields=["text"])
index.fit(chunks)

<minsearch.minsearch.Index at 0x7b5028f730b0>

In [67]:
# Retrieve the five best-matching chunks for a sample question.
results = index.search('Can I find a job after the course?', num_results=5)

In [66]:
# Display the retrieved chunks (dicts with 'start', 'end' and 'text').
results

[{'start': '52:34',
  'end': '55:07',
  'text': "project I submitted was a fake course project. So there was nothing that's why I didn't get any points. Uh the reason I got uh nine uh is uh cuz I evaluated other peers. So that's why um like for each evalation I get three points. But this is how it's done. So the we evaluate projects by doing peer review and peer review is mandatory to complete the project. So if you submit a project but you don't do peer reviewing you fail the project and if you fail a project you fail the course. Right? So this very important to do peer reviews. Uh will the course make one job ready? Yes. If you put effort in the the the course and if you make a good project, if you also follow our recommendations to learn in public, this will definitely make you job ready. Uh what's the next path to follow after the completing the course? Uh to step into advanced stuff, find a job. That's the best way. Um cuz you can do courses forever, but I think you need to work o