In [1]:
from rich import print
from dotenv import load_dotenv
import os
from youtube_transcript_api import YouTubeTranscriptApi
from typing import List, Dict, Any, Generator
from langchain.schema import Document
import googleapiclient.discovery


load_dotenv()
%load_ext rich

In [2]:
youtube = googleapiclient.discovery.build(
    "youtube", "v3", developerKey=os.getenv("YOUTUBE_API_KEY")
)

In [3]:
search_result = (
    youtube.search()
    .list(
        part="snippet",
        q="how to make a website",
        maxResults=15,
        type="video",
        videoCaption="closedCaption",
    )
    .execute()
)

In [9]:
search_result


[1m{[0m
    [32m'kind'[0m: [32m'youtube#searchListResponse'[0m,
    [32m'etag'[0m: [32m'nbXPlK3wRdR2RRmb0bHnm9GlByc'[0m,
    [32m'nextPageToken'[0m: [32m'CA8QAA'[0m,
    [32m'regionCode'[0m: [32m'CA'[0m,
    [32m'pageInfo'[0m: [1m{[0m[32m'totalResults'[0m: [1;36m1000000[0m, [32m'resultsPerPage'[0m: [1;36m15[0m[1m}[0m,
    [32m'items'[0m: [1m[[0m
        [1m{[0m
            [32m'kind'[0m: [32m'youtube#searchResult'[0m,
            [32m'etag'[0m: [32m'Kcl7CHFEZPjkijGPOj8ZTFwyw8o'[0m,
            [32m'id'[0m: [1m{[0m[32m'kind'[0m: [32m'youtube#video'[0m, [32m'videoId'[0m: [32m'YWA-xbsJrVg'[0m[1m}[0m,
            [32m'snippet'[0m: [1m{[0m
                [32m'publishedAt'[0m: [32m'2018-01-03T15:32:33Z'[0m,
                [32m'channelId'[0m: [32m'UCpWT_QfKk7BJIpn709YgsYA'[0m,
                [32m'title'[0m: [32m'How to Make a Website in 10 mins - Simple &amp; Easy'[0m,
                [32m'description'[0m: [32m

In [4]:
class YouTubeVideo:
    def __init__(
        self,
        video_id,
        title,
        description,
        published_at,
        channel_title,
        chunk_time_limit=120,
    ):
        self.video_id = video_id
        self.title = title
        self.description = description
        self.published_at = published_at
        self.channel_title = channel_title
        self.chunk_time_limit = chunk_time_limit

    def __repr__(self):
        return f"{self.title} by {self.channel_title} - {self.video_id}"

    def _get_transcript(self):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
            for t in transcript_list:
                if t.language_code in ["en", "en-US"]:
                    return t.fetch()
                elif any(
                    lang["language_code"] == "en" for lang in t.translation_languages
                ):
                    return t.translate("en").fetch()
        except Exception as e:
            print(f"Error fetching transcript: {e} for video {self.video_id}")
        return None

    def _make_chunk_document(
        self, chunk_pieces: List[Dict], chunk_start_seconds: int
    ) -> Document:
        """Create Document from chunk of transcript pieces."""
        m, s = divmod(chunk_start_seconds, 60)
        h, m = divmod(m, 60)
        return Document(
            page_content=" ".join(
                map(lambda chunk_piece: chunk_piece["text"].strip(" "), chunk_pieces)
            ),
            metadata={
                "start_seconds": chunk_start_seconds,
                "start_timestamp": f"{h:02d}:{m:02d}:{s:02d}",
                "source": f"https://www.youtube.com/watch?v={self.video_id}&t={chunk_start_seconds}s",
                "title": self.title,
                "video_id": self.video_id,
            },
        )

    def _get_transcript_chunks(
        self, transcript_pieces: List[Dict]
    ) -> Generator[Document, None, None]:
        chunk_pieces: List[Dict[str, Any]] = []
        chunk_start_seconds = 0
        chunk_time_limit = self.chunk_time_limit
        for transcript_piece in transcript_pieces:
            piece_end = transcript_piece["start"] + transcript_piece["duration"]
            if piece_end > chunk_time_limit:
                if chunk_pieces:
                    yield self._make_chunk_document(chunk_pieces, chunk_start_seconds)
                chunk_pieces = []
                chunk_start_seconds = chunk_time_limit
                chunk_time_limit += self.chunk_time_limit

            chunk_pieces.append(transcript_piece)

        if len(chunk_pieces) > 0:
            yield self._make_chunk_document(chunk_pieces, chunk_start_seconds)

    def load(self):
        try:
            transcripts = self._get_transcript()
            if transcripts:
                return list(self._get_transcript_chunks(transcripts))
        except Exception as e:
            print(f"Error: {e} for video {self.video_id}")

        return []

In [5]:
items = search_result["items"]
yt_videos = [
    YouTubeVideo(
        video_id=item["id"]["videoId"],
        title=item["snippet"]["title"],
        description=item["snippet"]["description"],
        published_at=item["snippet"]["publishedAt"],
        channel_title=item["snippet"]["channelTitle"],
        chunk_time_limit=30,
    )
    for item in items
]

In [7]:
transcripts = [v.load() for v in yt_videos]
transcripts



[1m[[0m
    [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m
                [32m'start_seconds'[0m: [1;36m0[0m,
                [32m'start_timestamp'[0m: [32m'00:00:00'[0m,
                [32m'source'[0m: [32m'https://www.youtube.com/watch?[0m[32mv[0m[32m=[0m[32mYWA[0m[32m-xbsJrVg&[0m[32mt[0m[32m=[0m[32m0s[0m[32m'[0m,
                [32m'title'[0m: [32m'How to Make a Website in 10 mins - Simple &amp; Easy'[0m,
                [32m'video_id'[0m: [32m'YWA-xbsJrVg'[0m
            [1m}[0m,
            [33mpage_content[0m=[32m"Hi[0m[32m guys, I'm Subhang from WebsiteLearners.com and in this video I'm\ngoing to show you how you can quickly make a\nwebsite. [0m[32m([0m[32min just 10 mins[0m[32m)[0m[32m Now after watching this video,\nyou will be able to make any kind of website just like\nthis, by using drag & drop. So, don't miss this video out and watch it till the end, to learn how to do it. Okay

## LangGraph

In [None]:
from langchain_openai import ChatOpenAI

