diff --git a/docs/docs/integrations/document_loaders/youtube_transcript.ipynb b/docs/docs/integrations/document_loaders/youtube_transcript.ipynb index d26dd1eb8ee757..77ed5a4016395f 100644 --- a/docs/docs/integrations/document_loaders/youtube_transcript.ipynb +++ b/docs/docs/integrations/document_loaders/youtube_transcript.ipynb @@ -15,47 +15,45 @@ }, { "cell_type": "code", - "execution_count": null, "id": "427d5745", "metadata": {}, + "source": "from langchain_community.document_loaders import YoutubeLoader", "outputs": [], - "source": [ - "from langchain_community.document_loaders import YoutubeLoader" - ] + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "34a25b57", "metadata": { "scrolled": true }, - "outputs": [], "source": [ "%pip install --upgrade --quiet youtube-transcript-api" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "bc8b308a", "metadata": {}, - "outputs": [], "source": [ "loader = YoutubeLoader.from_youtube_url(\n", " \"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=False\n", ")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "d073dd36", "metadata": {}, - "outputs": [], "source": [ "loader.load()" - ] + ], + "outputs": [], + "execution_count": null }, { "attachments": {}, @@ -68,26 +66,26 @@ }, { "cell_type": "code", - "execution_count": null, "id": "ba28af69", "metadata": {}, - "outputs": [], "source": [ "%pip install --upgrade --quiet pytube" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "9b8ea390", "metadata": {}, - "outputs": [], "source": [ "loader = YoutubeLoader.from_youtube_url(\n", " \"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True\n", ")\n", "loader.load()" - ] + ], + "outputs": [], + "execution_count": null }, { "attachments": {}, @@ -104,10 +102,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "08510625", "metadata": {}, - "outputs": [], "source": [ "loader = YoutubeLoader.from_youtube_url(\n", " \"https://www.youtube.com/watch?v=QsYGlZkevEg\",\n", @@ -116,7 +112,41 @@ " translation=\"en\",\n", ")\n", "loader.load()" - ] + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Get transcripts as timestamped chunks\n", + "\n", + "Get one or more `Document` objects, each containing a chunk of the video transcript. The length of the chunks, in seconds, may be specified. Each chunk's metadata includes a URL of the video on YouTube, which will start the video at the beginning of the specific chunk.\n", + "\n", + "`transcript_format` param: One of the `langchain_community.document_loaders.youtube.TranscriptFormat` values. In this case, `TranscriptFormat.CHUNKS`.\n", + "\n", + "`chunk_size_seconds` param: An integer number of video seconds to be represented by each chunk of transcript data. Default is 120 seconds." + ], + "id": "69f4e399a9764d73" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from langchain_community.document_loaders.youtube import TranscriptFormat\n", + "\n", + "loader = YoutubeLoader.from_youtube_url(\n", + " \"https://www.youtube.com/watch?v=TKCMw0utiak\",\n", + " add_video_info=True,\n", + " transcript_format=TranscriptFormat.CHUNKS,\n", + " chunk_size_seconds=30,\n", + ")\n", + "print(\"\\n\\n\".join(map(repr, loader.load())))" + ], + "id": "540bbf19182f38bc", + "outputs": [], + "execution_count": null }, { "attachments": {}, @@ -142,10 +172,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "c345bc43", "metadata": {}, - "outputs": [], "source": [ "# Init the GoogleApiClient\n", "from pathlib import Path\n", @@ -170,7 +198,9 @@ "\n", "# returns a list of Documents\n", "youtube_loader_channel.load()" - ] + ], + "outputs": [], + "execution_count": null } ], "metadata": { diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py index c1cea0b2dc3208..8f19471040cc2c 100644 --- a/libs/community/langchain_community/document_loaders/youtube.py +++ b/libs/community/langchain_community/document_loaders/youtube.py @@ -4,7 +4,7 @@ import logging from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import Any, Dict, Generator, List, Optional, Sequence, Union from urllib.parse import parse_qs, urlparse from langchain_core.documents import Document @@ -99,8 +99,8 @@ def _load_credentials(self) -> Any: return creds -ALLOWED_SCHEMAS = {"http", "https"} -ALLOWED_NETLOCK = { +ALLOWED_SCHEMES = {"http", "https"} +ALLOWED_NETLOCS = { "youtu.be", "m.youtube.com", "youtube.com", @@ -111,13 +111,13 @@ def _load_credentials(self) -> Any: def _parse_video_id(url: str) -> Optional[str]: - """Parse a youtube url and return the video id if valid, otherwise None.""" + """Parse a YouTube URL and return the video ID if valid, otherwise None.""" parsed_url = urlparse(url) - if parsed_url.scheme not in ALLOWED_SCHEMAS: + if parsed_url.scheme not in ALLOWED_SCHEMES: return None - if parsed_url.netloc not in ALLOWED_NETLOCK: + if parsed_url.netloc not in ALLOWED_NETLOCS: return None path = parsed_url.path @@ -141,14 +141,15 @@ def _parse_video_id(url: str) -> Optional[str]: class TranscriptFormat(Enum): - """Transcript format.""" + """Output formats of transcripts from `YoutubeLoader`.""" TEXT = "text" LINES = "lines" + CHUNKS = "chunks" class YoutubeLoader(BaseLoader): - """Load `YouTube` transcripts.""" + """Load `YouTube` video transcripts.""" def __init__( self, @@ -158,9 +159,11 @@ def __init__( translation: Optional[str] = None, transcript_format: TranscriptFormat = TranscriptFormat.TEXT, continue_on_failure: bool = False, + chunk_size_seconds: int = 120, ): """Initialize with YouTube video ID.""" self.video_id = video_id + self._metadata = {"source": video_id} self.add_video_info = add_video_info self.language = language if isinstance(language, str): @@ -170,25 +173,69 @@ def __init__( self.translation = translation self.transcript_format = transcript_format self.continue_on_failure = continue_on_failure + self.chunk_size_seconds = chunk_size_seconds @staticmethod def extract_video_id(youtube_url: str) -> str: - """Extract video id from common YT urls.""" + """Extract video ID from common YouTube URLs.""" video_id = _parse_video_id(youtube_url) if not video_id: raise ValueError( - f"Could not determine the video ID for the URL {youtube_url}" + f'Could not determine the video ID for the URL "{youtube_url}".' ) return video_id @classmethod def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: - """Given youtube URL, load video.""" + """Given a YouTube URL, construct a loader. + See `YoutubeLoader()` constructor for a list of keyword arguments. + """ video_id = cls.extract_video_id(youtube_url) return cls(video_id, **kwargs) + def _make_chunk_document( + self, chunk_pieces: List[Dict], chunk_start_seconds: int + ) -> Document: + """Create Document from chunk of transcript pieces.""" + m, s = divmod(chunk_start_seconds, 60) + h, m = divmod(m, 60) + return Document( + page_content=" ".join( + map(lambda chunk_piece: chunk_piece["text"].strip(" "), chunk_pieces) + ), + metadata={ + **self._metadata, + "start_seconds": chunk_start_seconds, + "start_timestamp": f"{h:02d}:{m:02d}:{s:02d}", + "source": + # replace video ID with URL to start time + f"https://www.youtube.com/watch?v={self.video_id}" + f"&t={chunk_start_seconds}s", + }, + ) + + def _get_transcript_chunks( + self, transcript_pieces: List[Dict] + ) -> Generator[Document, None, None]: + chunk_pieces: List[Dict[str, Any]] = [] + chunk_start_seconds = 0 + chunk_time_limit = self.chunk_size_seconds + for transcript_piece in transcript_pieces: + piece_end = transcript_piece["start"] + transcript_piece["duration"] + if piece_end > chunk_time_limit: + if chunk_pieces: + yield self._make_chunk_document(chunk_pieces, chunk_start_seconds) + chunk_pieces = [] + chunk_start_seconds = chunk_time_limit + chunk_time_limit += self.chunk_size_seconds + + chunk_pieces.append(transcript_piece) + + if len(chunk_pieces) > 0: + yield self._make_chunk_document(chunk_pieces, chunk_start_seconds) + def load(self) -> List[Document]: - """Load documents.""" + """Load YouTube transcripts into `Document` objects.""" try: from youtube_transcript_api import ( NoTranscriptFound, @@ -197,17 +244,15 @@ def load(self) -> List[Document]: ) except ImportError: raise ImportError( - "Could not import youtube_transcript_api python package. " + 'Could not import "youtube_transcript_api" Python package. ' "Please install it with `pip install youtube-transcript-api`." ) - metadata = {"source": self.video_id} - if self.add_video_info: # Get more video meta info # Such as title, description, thumbnail url, publish_date video_info = self._get_video_info() - metadata.update(video_info) + self._metadata.update(video_info) try: transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id) @@ -222,31 +267,45 @@ def load(self) -> List[Document]: if self.translation is not None: transcript = transcript.translate(self.translation) - transcript_pieces = transcript.fetch() + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() if self.transcript_format == TranscriptFormat.TEXT: - transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces]) - return [Document(page_content=transcript, metadata=metadata)] + transcript = " ".join( + map( + lambda transcript_piece: transcript_piece["text"].strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript, metadata=self._metadata)] elif self.transcript_format == TranscriptFormat.LINES: - return [ - Document( - page_content=t["text"].strip(" "), - metadata=dict((key, t[key]) for key in t if key != "text"), + return list( + map( + lambda transcript_piece: Document( + page_content=transcript_piece["text"].strip(" "), + metadata={ + filter( + lambda item: item[0] != "text", transcript_piece.items() + ) + }, + ), + transcript_pieces, ) - for t in transcript_pieces - ] + ) + elif self.transcript_format == TranscriptFormat.CHUNKS: + return list(self._get_transcript_chunks(transcript_pieces)) + else: raise ValueError("Unknown transcript format.") - def _get_video_info(self) -> dict: + def _get_video_info(self) -> Dict: """Get important video information. - Components are: + Components include: - title - description - - thumbnail url, + - thumbnail URL, - publish_date - - channel_author + - channel author - and more. """ try: @@ -254,7 +313,7 @@ def _get_video_info(self) -> dict: except ImportError: raise ImportError( - "Could not import pytube python package. " + 'Could not import "pytube" Python package. ' "Please install it with `pip install pytube`." ) yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}") diff --git a/libs/community/tests/unit_tests/document_loaders/test_youtube.py b/libs/community/tests/unit_tests/document_loaders/test_youtube.py index 942012a1db97d0..10c154f65cb1f7 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_youtube.py +++ b/libs/community/tests/unit_tests/document_loaders/test_youtube.py @@ -1,6 +1,8 @@ import pytest +from langchain_core.documents import Document from langchain_community.document_loaders import YoutubeLoader +from langchain_community.document_loaders.youtube import TranscriptFormat @pytest.mark.parametrize( @@ -25,3 +27,192 @@ def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None: """Test that the video id is extracted from a youtube url""" assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id + + +def test__get_transcript_chunks() -> None: + test_transcript_pieces = [ + {"text": "♪ Hail to the victors valiant ♪", "start": 3.719, "duration": 5.0}, + {"text": "♪ Hail to the conquering heroes ♪", "start": 8.733, "duration": 5.0}, + {"text": "♪ Hail, hail to Michigan ♪", "start": 14.541, "duration": 5.0}, + {"text": "♪ The leaders and best ♪", "start": 19.785, "duration": 5.0}, + {"text": "♪ Hail to the victors valiant ♪", "start": 25.661, "duration": 4.763}, + {"text": "♪ Hail to the conquering heroes ♪", "start": 30.424, "duration": 5.0}, + {"text": "♪ Hail, hail to Michigan ♪", "start": 36.37, "duration": 4.91}, + {"text": "♪ The champions of the west ♪", "start": 41.28, "duration": 2.232}, + {"text": "♪ Hail to the victors valiant ♪", "start": 43.512, "duration": 4.069}, + { + "text": "♪ Hail to the conquering heroes ♪", + "start": 47.581, + "duration": 4.487, + }, + {"text": "♪ Hail, hail to Michigan ♪", "start": 52.068, "duration": 4.173}, + {"text": "♪ The leaders and best ♪", "start": 56.241, "duration": 4.542}, + {"text": "♪ Hail to victors valiant ♪", "start": 60.783, "duration": 3.944}, + { + "text": "♪ Hail to the conquering heroes ♪", + "start": 64.727, + "duration": 4.117, + }, + {"text": "♪ Hail, hail to Michigan ♪", "start": 68.844, "duration": 3.969}, + {"text": "♪ The champions of the west ♪", "start": 72.813, "duration": 4.232}, + {"text": "(choir clapping rhythmically)", "start": 77.045, "duration": 3.186}, + {"text": "- Go blue!", "start": 80.231, "duration": 0.841}, + {"text": "(choir clapping rhythmically)", "start": 81.072, "duration": 3.149}, + {"text": "Go blue!", "start": 84.221, "duration": 0.919}, + {"text": "♪ It's great to be ♪", "start": 85.14, "duration": 1.887}, + { + "text": "♪ A Michigan Wolverine ♪\n- Go blue!", + "start": 87.027, + "duration": 2.07, + }, + {"text": "♪ It's great to be ♪", "start": 89.097, "duration": 1.922}, + { + "text": "♪ A Michigan Wolverine ♪\n- Go blue!", + "start": 91.019, + "duration": 2.137, + }, + { + "text": "♪ It's great to be ♪\n(choir scatting)", + "start": 93.156, + "duration": 1.92, + }, + { + "text": "♪ a Michigan Wolverine ♪\n(choir scatting)", + "start": 95.076, + "duration": 2.118, + }, + { + "text": "♪ It's great to be ♪\n(choir scatting)", + "start": 97.194, + "duration": 1.85, + }, + { + "text": "♪ A Michigan ♪\n(choir scatting)", + "start": 99.044, + "duration": 1.003, + }, + {"text": "- Let's go blue!", "start": 100.047, "duration": 1.295}, + { + "text": "♪ Hail to the victors valiant ♪", + "start": 101.342, + "duration": 1.831, + }, + { + "text": "♪ Hail to the conquering heroes ♪", + "start": 103.173, + "duration": 2.21, + }, + {"text": "♪ Hail, hail to Michigan ♪", "start": 105.383, "duration": 1.964}, + {"text": "♪ The leaders and best ♪", "start": 107.347, "duration": 2.21}, + { + "text": "♪ Hail to the victors valiant ♪", + "start": 109.557, + "duration": 1.643, + }, + { + "text": "♪ Hail to the conquering heroes ♪", + "start": 111.2, + "duration": 2.129, + }, + {"text": "♪ Hail, hail to Michigan ♪", "start": 113.329, "duration": 2.091}, + {"text": "♪ The champions of the west ♪", "start": 115.42, "duration": 2.254}, + { + "text": "♪ Hail to the victors valiant ♪", + "start": 117.674, + "duration": 4.039, + }, + { + "text": "♪ Hail to the conquering heroes ♪", + "start": 121.713, + "duration": 4.103, + }, + { + "text": "♪ Hail to the blue, hail to the blue ♪", + "start": 125.816, + "duration": 1.978, + }, + { + "text": "♪ Hail to the blue, hail to the blue ♪", + "start": 127.794, + "duration": 2.095, + }, + { + "text": "♪ Hail to the blue, hail to the blue ♪", + "start": 129.889, + "duration": 1.932, + }, + { + "text": "♪ Hail to the blue, hail to the blue ♪", + "start": 131.821, + "duration": 2.091, + }, + { + "text": "♪ Hail to the blue, hail to the blue ♪", + "start": 133.912, + "duration": 2.109, + }, + {"text": "♪ Hail to the blue, hail ♪", "start": 136.021, "duration": 3.643}, + {"text": "♪ To Michigan ♪", "start": 139.664, "duration": 4.105}, + {"text": "♪ The champions of the west ♪", "start": 143.769, "duration": 3.667}, + {"text": "♪ Go blue ♪", "start": 154.122, "duration": 2.167}, + ] + test_transcript_chunks = [ + Document( + page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The leaders and best ♪", # noqa: E501 + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=0s", + "start_seconds": 0, + "start_timestamp": "00:00:00", + }, + ), + Document( + page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪ ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪", # noqa: E501 + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=30s", + "start_seconds": 30, + "start_timestamp": "00:00:30", + }, + ), + Document( + page_content="♪ The leaders and best ♪ ♪ Hail to victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪ (choir clapping rhythmically) - Go blue! (choir clapping rhythmically) Go blue! ♪ It's great to be ♪ ♪ A Michigan Wolverine ♪\n- Go blue!", # noqa: E501 + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=60s", + "start_seconds": 60, + "start_timestamp": "00:01:00", + }, + ), + Document( + page_content="♪ It's great to be ♪ ♪ A Michigan Wolverine ♪\n- Go blue! ♪ It's great to be ♪\n(choir scatting) ♪ a Michigan Wolverine ♪\n(choir scatting) ♪ It's great to be ♪\n(choir scatting) ♪ A Michigan ♪\n(choir scatting) - Let's go blue! ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The leaders and best ♪ ♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail, hail to Michigan ♪ ♪ The champions of the west ♪", # noqa: E501 + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=90s", + "start_seconds": 90, + "start_timestamp": "00:01:30", + }, + ), + Document( + page_content="♪ Hail to the victors valiant ♪ ♪ Hail to the conquering heroes ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail to the blue ♪ ♪ Hail to the blue, hail ♪ ♪ To Michigan ♪ ♪ The champions of the west ♪", # noqa: E501 + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=120s", + "start_seconds": 120, + "start_timestamp": "00:02:00", + }, + ), + Document( + page_content="♪ Go blue ♪", + metadata={ + "source": "https://www.youtube.com/watch?v=TKCMw0utiak&t=150s", + "start_seconds": 150, + "start_timestamp": "00:02:30", + }, + ), + ] + + ytl = YoutubeLoader( + "TKCMw0utiak", + transcript_format=TranscriptFormat.CHUNKS, + chunk_size_seconds=30, + ) + assert ( + list(ytl._get_transcript_chunks(test_transcript_pieces)) + == test_transcript_chunks + )