Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load specific file types from Google Drive (issue #4878) #4926

Merged
merged 7 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 24 additions & 0 deletions docs/modules/indexes/document_loaders/examples/google_drive.ipynb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {},
Expand Down Expand Up @@ -75,6 +76,29 @@
"source": [
"docs = loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2721ba8a",
"metadata": {},
"source": [
"When you pass a `folder_id` by default all files of type document, sheet and pdf are loaded. You can modify this behaviour by passing a `file_types` argument "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ff83b4c",
"metadata": {},
"outputs": [],
"source": [
"loader = GoogleDriveLoader(\n",
" folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\",\n",
" file_types=[\"document\", \"sheet\"]\n",
" recursive=False\n",
")"
]
}
],
"metadata": {
Expand Down
52 changes: 45 additions & 7 deletions langchain/document_loaders/googledrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# https://cloud.google.com/iam/docs/service-accounts-create

from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Sequence, Union

from pydantic import BaseModel, root_validator, validator

Expand All @@ -30,11 +30,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
document_ids: Optional[List[str]] = None
file_ids: Optional[List[str]] = None
recursive: bool = False
file_types: Optional[Sequence[str]] = None

@root_validator
def validate_folder_id_or_document_ids(
cls, values: Dict[str, Any]
) -> Dict[str, Any]:
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Validate that either folder_id or document_ids is set, but not both."""
if values.get("folder_id") and (
values.get("document_ids") or values.get("file_ids")
Expand All @@ -49,6 +48,35 @@ def validate_folder_id_or_document_ids(
and not values.get("file_ids")
):
raise ValueError("Must specify either folder_id, document_ids, or file_ids")

file_types = values.get("file_types")
if file_types:
if values.get("document_ids") or values.get("file_ids"):
raise ValueError(
"file_types can only be given when folder_id is given,"
" (not when document_ids or file_ids are given)."
)
type_mapping = {
"document": "application/vnd.google-apps.document",
"sheet": "application/vnd.google-apps.spreadsheet",
"pdf": "application/pdf",
}
allowed_types = list(type_mapping.keys()) + list(type_mapping.values())
short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()])
full_names = ", ".join([f"'{x}'" for x in type_mapping.values()])
for file_type in file_types:
if file_type not in allowed_types:
raise ValueError(
f"Given file type {file_type} is not supported. "
f"Supported values are: {short_names}; and "
f"their full-form names: {full_names}"
)

# replace short-form file types by full-form file types
def full_form(x: str) -> str:
return type_mapping[x] if x in type_mapping else x

values["file_types"] = [full_form(file_type) for file_type in file_types]
return values

@validator("credentials_path")
Expand Down Expand Up @@ -171,15 +199,23 @@ def _load_document_from_id(self, id: str) -> Document:
}
return Document(page_content=text, metadata=metadata)

def _load_documents_from_folder(self, folder_id: str) -> List[Document]:
def _load_documents_from_folder(
self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
) -> List[Document]:
"""Load documents from a folder."""
from googleapiclient.discovery import build

creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
files = self._fetch_files_recursive(service, folder_id)
# If file types filter is provided, we'll filter by the file type.
if file_types:
_files = [f for f in files if f["mimeType"] in file_types] # type: ignore
else:
_files = files

returns = []
for file in files:
for file in _files:
if file["mimeType"] == "application/vnd.google-apps.document":
returns.append(self._load_document_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
Expand Down Expand Up @@ -271,7 +307,9 @@ def _load_file_from_ids(self) -> List[Document]:
def load(self) -> List[Document]:
"""Load documents."""
if self.folder_id:
return self._load_documents_from_folder(self.folder_id)
return self._load_documents_from_folder(
self.folder_id, file_types=self.file_types
)
elif self.document_ids:
return self._load_documents_from_ids()
else:
Expand Down