langchain/document_loaders/youtube.py

"""Loader that loads YouTube transcript."""
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional

from pydantic import root_validator
from pydantic.dataclasses import dataclass

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


@dataclass
class GoogleApiClient:
    """A Generic Google Api Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"


    Example:
        .. code-block:: python

            from langchain.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    """

    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    service_account_path: Path = Path.home() / ".credentials" / "credentials.json"
    token_path: Path = Path.home() / ".credentials" / "token.json"

    def __post_init__(self) -> None:
        self.creds = self._load_credentials()

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that either folder_id or document_ids is set, but not both."""

        if not values.get("credentials_path") and not values.get(
            "service_account_path"
        ):
            raise ValueError("Must specify either channel_name or video_ids")
        return values

    def _load_credentials(self) -> Any:
        """Load credentials."""
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2 import service_account
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError:
            raise ImportError(
                "You must run"
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib"
                "youtube-transcript-api`"
                "to use the Google Drive loader"
            )

        creds = None
        if self.service_account_path.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_path)
            )
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds


class YoutubeLoader(BaseLoader):
    """Loader that loads Youtube transcripts."""

    def __init__(
        self, video_id: str, add_video_info: bool = False, language: str = "en"
    ):
        """Initialize with YouTube video ID."""
        self.video_id = video_id
        self.add_video_info = add_video_info
        self.language = language

    @classmethod
    def from_youtube_channel(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Given a channel name, load all videos."""
        video_id = youtube_url.split("youtube.com/watch?v=")[-1]
        return cls(video_id, **kwargs)

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError:
            raise ImportError(
                "Could not import youtube_transcript_api python package. "
                "Please it install it with `pip install youtube-transcript-api`."
            )

        metadata = {"source": self.video_id}

        if self.add_video_info:
            # Get more video meta info
            # Such as title, description, thumbnail url, publish_date
            video_info = self._get_video_info()
            metadata.update(video_info)

        transcript_pieces = YouTubeTranscriptApi.get_transcript(
            self.video_id, languages=[self.language]
        )
        transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])

        return [Document(page_content=transcript, metadata=metadata)]

    def _get_video_info(self) -> dict:
        """Get important video information.

        Components are:
            - title
            - description
            - thumbnail url,
            - publish_date
            - channel_author
            - and more.
        """
        try:
            from pytube import YouTube

        except ImportError:
            raise ImportError(
                "Could not import pytube python package. "
                "Please it install it with `pip install pytube`."
            )
        yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
        video_info = {
            "title": yt.title,
            "description": yt.description,
            "view_count": yt.views,
            "thumbnail_url": yt.thumbnail_url,
            "publish_date": yt.publish_date,
            "length": yt.length,
            "author": yt.author,
        }
        return video_info


@dataclass
class GoogleApiYoutubeLoader(BaseLoader):
    """Loader that loads all Videos from a Channel

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additonali you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"


    Example:
        .. code-block:: python

            from langchain.document_loaders import GoogleApiClient
            from langchain.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    """

    google_api_client: GoogleApiClient
    channel_name: Optional[str] = None
    video_ids: Optional[List[str]] = None
    add_video_info: bool = True
    captions_language: str = "en"

    def __post_init__(self) -> None:
        self.youtube_client = self._build_youtube_client(self.google_api_client.creds)

    def _build_youtube_client(self, creds: Any) -> Any:
        try:
            from googleapiclient.discovery import build
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError:
            raise ImportError(
                "You must run"
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib"
                "youtube-transcript-api`"
                "to use the Google Drive loader"
            )

        return build("youtube", "v3", credentials=creds)

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that either folder_id or document_ids is set, but not both."""
        if not values.get("channel_name") and not values.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
        return values

    def _get_transcripe_for_video_id(self, video_id: str) -> str:
        from youtube_transcript_api import YouTubeTranscriptApi

        transcript_pieces = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([t["text"].strip(" ") for t in transcript_pieces])

    def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document:
        captions = self._get_transcripe_for_video_id(video_id)
        video_response = (
            self.youtube_client.videos()
            .list(
                part="id,snippet",
                id=video_id,
            )
            .execute()
        )
        return Document(
            page_content=captions,
            metadata=video_response.get("items")[0],
        )

    def _get_channel_id(self, channel_name: str) -> str:
        request = self.youtube_client.search().list(
            part="id",
            q=channel_name,
            type="channel",
            maxResults=1,  # we only need one result since channel names are unique
        )
        response = request.execute()
        channel_id = response["items"][0]["id"]["channelId"]
        return channel_id

    def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
        channel_id = self._get_channel_id(channel)
        request = self.youtube_client.search().list(
            part="id,snippet",
            channelId=channel_id,
            maxResults=50,  # adjust this value to retrieve more or fewer videos
        )
        video_ids = []
        while request is not None:
            response = request.execute()

            # Add each video ID to the list
            for item in response["items"]:
                if not item["id"].get("videoId"):
                    continue
                meta_data = {"videoId": item["id"]["videoId"]}
                if self.add_video_info:
                    item["snippet"].pop("thumbnails")
                    meta_data.update(item["snippet"])
                video_ids.append(
                    Document(
                        page_content=self._get_transcripe_for_video_id(
                            item["id"]["videoId"]
                        ),
                        metadata=meta_data,
                    )
                )
            request = self.youtube_client.search().list_next(request, response)

        return video_ids

    def load(self) -> List[Document]:
        """Load documents."""
        document_list = []
        if self.channel_name:
            document_list.extend(self._get_document_for_channel(self.channel_name))
        elif self.video_ids:
            document_list.extend(
                [
                    self._get_document_for_video_id(video_id)
                    for video_id in self.video_ids
                ]
            )
        else:
            raise ValueError("Must specify either channel_name or video_ids")
        return document_list