Skip to content

Commit

Permalink
To convert Gdoc and GSlide,
Browse files Browse the repository at this point in the history
manage headers, tables, links and bullets
  • Loading branch information
pprados committed Aug 28, 2023
1 parent 9d1c6cf commit b67063c
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 70 deletions.
114 changes: 103 additions & 11 deletions docs/extras/integrations/document_loaders/google_drive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,45 +35,137 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "6e40071c-3a65-4e26-b497-3e2be0bd86b9",
"id": "afc9c859-042e-4c2a-b31e-8c18eb00586a",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'langchain_googledrive'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#pip install langchain-googledrive\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_googledrive\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GoogleDriveLoader\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_googledrive'"
]
}
],
"source": [
"#!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib"
"#pip install langchain-googledrive\n",
"from langchain_googledrive.document_loaders import GoogleDriveLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6e40071c-3a65-4e26-b497-3e2be0bd86b9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting google-api-python-client\n",
" Using cached google_api_python_client-2.97.0-py2.py3-none-any.whl (12.0 MB)\n",
"Collecting google-auth-httplib2\n",
" Using cached google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)\n",
"Collecting google-auth-oauthlib\n",
" Using cached google_auth_oauthlib-1.0.0-py2.py3-none-any.whl (18 kB)\n",
"Collecting uritemplate<5,>=3.0.1\n",
" Using cached uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)\n",
"Requirement already satisfied: httplib2<1.dev0,>=0.15.0 in /usr/lib/python3/dist-packages (from google-api-python-client) (0.20.2)\n",
"Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5\n",
" Using cached google_api_core-2.11.1-py3-none-any.whl (120 kB)\n",
"Collecting google-auth<3.0.0.dev0,>=1.19.0\n",
" Using cached google_auth-2.22.0-py2.py3-none-any.whl (181 kB)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from google-auth-httplib2) (1.16.0)\n",
"Collecting requests-oauthlib>=0.7.0\n",
" Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n",
"Collecting googleapis-common-protos<2.0.dev0,>=1.56.2\n",
" Using cached googleapis_common_protos-1.60.0-py2.py3-none-any.whl (227 kB)\n",
"Requirement already satisfied: requests<3.0.0.dev0,>=2.18.0 in /home/pprados/.pythonapp/lib/python3.10/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.31.0)\n",
"Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5\n",
" Downloading protobuf-4.24.2-cp37-abi3-manylinux2014_x86_64.whl (311 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.4/311.4 KB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: urllib3<2.0 in /usr/lib/python3/dist-packages (from google-auth<3.0.0.dev0,>=1.19.0->google-api-python-client) (1.26.5)\n",
"Collecting cachetools<6.0,>=2.0.0\n",
" Using cached cachetools-5.3.1-py3-none-any.whl (9.3 kB)\n",
"Collecting pyasn1-modules>=0.2.1\n",
" Using cached pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n",
"Collecting rsa<5,>=3.1.4\n",
" Using cached rsa-4.9-py3-none-any.whl (34 kB)\n",
"Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/lib/python3/dist-packages (from httplib2<1.dev0,>=0.15.0->google-api-python-client) (2.4.7)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/lib/python3/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib) (3.2.0)\n",
"Collecting pyasn1<0.6.0,>=0.4.6\n",
" Using cached pyasn1-0.5.0-py2.py3-none-any.whl (83 kB)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2020.6.20)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/pprados/.pythonapp/lib/python3.10/site-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.2.0)\n",
"Installing collected packages: uritemplate, pyasn1, protobuf, cachetools, rsa, requests-oauthlib, pyasn1-modules, googleapis-common-protos, google-auth, google-auth-oauthlib, google-auth-httplib2, google-api-core, google-api-python-client\n",
"Successfully installed cachetools-5.3.1 google-api-core-2.11.1 google-api-python-client-2.97.0 google-auth-2.22.0 google-auth-httplib2-0.1.0 google-auth-oauthlib-1.0.0 googleapis-common-protos-1.60.0 protobuf-4.24.2 pyasn1-0.5.0 pyasn1-modules-0.3.0 requests-oauthlib-1.3.1 rsa-4.9 uritemplate-4.1.1\n"
]
}
],
"source": [
"!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9bcb6cb1",
"metadata": {},
"outputs": [],
"source": [
"folder_id='root'\n",
"#folder_id='1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5'\n",
"#folder_id='18A21b37hPISOQtStQ_irQLYS3hlVEsBH'"
"#folder_id='1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "878928a6-a5ae-4f74-b351-64e3b01733fe",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'langchain_googledrive'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_googledrive\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GoogleDriveLoader\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_googledrive'"
]
}
],
"source": [
"from langchain.document_loaders import GoogleDriveLoader"
"from langchain_googledrive.document_loaders import GoogleDriveLoader"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "2216c83f-68e4-4d2f-8ea2-5878fb18bbe7",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'GoogleDriveLoader' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loader \u001b[38;5;241m=\u001b[39m \u001b[43mGoogleDriveLoader\u001b[49m(\n\u001b[1;32m 2\u001b[0m folder_id\u001b[38;5;241m=\u001b[39mfolder_id,\n\u001b[1;32m 3\u001b[0m recursive\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m num_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, \u001b[38;5;66;03m# Maximum number of file to load\u001b[39;00m\n\u001b[1;32m 5\u001b[0m )\n",
"\u001b[0;31mNameError\u001b[0m: name 'GoogleDriveLoader' is not defined"
]
}
],
"source": [
"loader = GoogleDriveLoader(\n",
" folder_id=folder_id,\n",
Expand Down Expand Up @@ -522,7 +614,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0rc1"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down
6 changes: 3 additions & 3 deletions docs/extras/integrations/providers/google_drive.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib

## Document Loader

See a [usage example and authorizing instructions](/docs/integrations/document_loaders/google_drive.html).
See a [usage example and authorizing instructions](/docs/extras/integrations/document_loaders/google_drive.html).


```python
Expand All @@ -23,15 +23,15 @@ from langchain.document_loaders import GoogleDriveLoader

## Retriever

See a [usage example and authorizing instructions](/docs/modules/data_connection/retrievers/integrations/google_drive.html).
See a [usage example and authorizing instructions](/docs/extras/modules/data_connection/retrievers/integrations/google_drive.html).

```python
from langchain.retrievers import GoogleDriveRetriever
```

## Tools

See a [usage example and authorizing instructions](/docs/modules/agents/tools/integrations/google_drive.html).
See a [usage example and authorizing instructions](/docs/extras/modules/agents/tools/integrations/google_drive.html).

```python
from langchain.tools import GoogleDriveSearchTool
Expand Down
9 changes: 8 additions & 1 deletion libs/langchain/langchain/utilities/google_drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1419,7 +1419,14 @@ def visitor(result: List[str], node: Any, parent: str) -> List[str]:
result.append("- " if "bullet" in node["paragraphMarker"] else "")
return result
if "paragraph" in node:
result.append("- " if "bullet" in node["paragraph"] else "")
prefix=""
named_style_type= node['paragraph']["paragraphStyle"]["namedStyleType"]
level=re.match("HEADING_([1-9])",named_style_type)
if level:
prefix = f"{'#' * int(level[1])} "
if "bullet" in node["paragraph"]:
prefix +="- "
result.append(prefix)
if "table" in node:
col_size = [0 for _ in range(node["table"]["columns"])]
rows = [[] for _ in range(node["table"]["rows"])]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,22 @@
import pytest
from pytest_mock import MockerFixture

#from langchain_googledrive.document_loaders.google_drive import GoogleDriveLoader
from langchain.document_loaders.google_drive import GoogleDriveLoader

from tests.unit_tests.llms.fake_llm import FakeLLM
from tests.unit_tests.utilities.test_google_drive import (
gdrive_docs,
google_workspace_installed,
patch_google_workspace,
)

try:
import unstructured
unstructured_installed= True
except ImportError:
unstructured_installed= False


@pytest.fixture
def google_workspace(mocker: MockerFixture) -> MagicMock:
Expand Down Expand Up @@ -178,11 +186,11 @@ def test_deprecated_file_types(google_workspace: MagicMock) -> None:
file_types=[
"document",
"sheet",
"pdf",
# "pdf",
"application/vnd.google-apps.document",
"application/vnd.google-apps.presentation",
"application/vnd.google-apps.spreadsheet",
"application/pdf",
# "application/pdf",
],
)
docs = loader.load()
Expand Down Expand Up @@ -220,10 +228,9 @@ def test_deprecated_service_account_key(google_workspace: MagicMock) -> None:

# Test older ipynb script
@unittest.skipIf(not google_workspace_installed, "Google api not installed")
@unittest.skipIf(not unstructured_installed, "Unstructured api not installed")
def test_old_ipynb(google_workspace: MagicMock) -> None:
# Step 1
from langchain.document_loaders import GoogleDriveLoader

loader = GoogleDriveLoader(
folder_id="999",
recursive=False,
Expand All @@ -237,7 +244,7 @@ def test_old_ipynb(google_workspace: MagicMock) -> None:
loader.load()

# Step 3
from langchain.document_loaders import GoogleDriveLoader, UnstructuredFileIOLoader
from langchain.document_loaders import UnstructuredFileIOLoader

file_id = "1"
loader = GoogleDriveLoader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
from pytest_mock import MockerFixture

#from langchain_googledrive.retrievers.google_drive import GoogleDriveRetriever
from langchain.retrievers.google_drive import GoogleDriveRetriever
from tests.unit_tests.utilities.test_google_drive import (
_text_text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

from langchain import PromptTemplate
from langchain.schema import Document
#from langchain_googledrive.utilities import GoogleDriveAPIWrapper
from langchain.utilities import GoogleDriveAPIWrapper
#from langchain_googledrive.utilities.google_drive import TYPE_CONV_MAPPING, GoogleDriveUtilities
from langchain.utilities.google_drive import TYPE_CONV_MAPPING, GoogleDriveUtilities

try:
Expand Down
50 changes: 1 addition & 49 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ repository = "https://www.github.com/langchain-ai/langchain"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
pytest = "^7.4.0"

[tool.poetry.group.docs.dependencies]
langchain = { path = "libs/langchain/", develop = true }
Expand Down

0 comments on commit b67063c

Please sign in to comment.