From 9d1bd1859666fea1515f0ab06dee0af8f7fb3bbe Mon Sep 17 00:00:00 2001
From: Yaohui Wang
Date: Wed, 28 Jun 2023 14:08:05 +0800
Subject: [PATCH] feat (documents): add LarkSuite document loader (#6420)

### Summary

This PR adds a LarkSuite (FeiShu) document loader.

> [LarkSuite](https://www.larksuite.com/) is an enterprise collaboration platform developed by ByteDance.

### Tests

- an integration test case is added
- an example notebook showing usage is added. [Notebook preview](https://github.com/yaohui-wyh/langchain/blob/master/docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb)

### Who can review?

- PTAL @eyurtsev @hwchase17

---------

Co-authored-by: Yaohui Wang
---
 .../integrations/larksuite.ipynb              | 103 ++++++++++++++++++
 langchain/document_loaders/__init__.py        |   2 +
 langchain/document_loaders/larksuite.py       |  46 ++++++++
 .../document_loaders/test_larksuite.py        |  14 +++
 4 files changed, 165 insertions(+)
 create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb
 create mode 100644 langchain/document_loaders/larksuite.py
 create mode 100644 tests/integration_tests/document_loaders/test_larksuite.py

diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb
new file mode 100644
index 00000000000000..03042a91402afd
--- /dev/null
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb
@@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "33205b12",
+   "metadata": {},
+   "source": [
+    "# LarkSuite (FeiShu)\n",
+    "\n",
+    ">[LarkSuite](https://www.larksuite.com/) is an enterprise collaboration platform developed by ByteDance.\n",
+    "\n",
+    "This notebook covers how to load data from the `LarkSuite` REST API into a format that can be ingested into LangChain, along with example usage for text summarization.\n",
+    "\n",
+    "The LarkSuite API requires an access token (tenant_access_token or user_access_token); check out the [LarkSuite open platform documentation](https://open.larksuite.com/document) for API details."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "90b69c94",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-06-19T10:05:03.645161Z",
+     "start_time": "2023-06-19T10:04:49.541968Z"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from getpass import getpass\n",
+    "from langchain.document_loaders.larksuite import LarkSuiteDocLoader\n",
+    "\n",
+    "DOMAIN = input(\"larksuite domain\")\n",
+    "ACCESS_TOKEN = getpass(\"larksuite tenant_access_token or user_access_token\")\n",
+    "DOCUMENT_ID = input(\"larksuite document id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "13deb0f5",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-06-19T10:05:36.016495Z",
+     "start_time": "2023-06-19T10:05:35.360884Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(page_content='Test Doc\\nThis is a Test Doc\\n\\n1\\n2\\n3\\n\\n', metadata={'document_id': 'V76kdbd2HoBbYJxdiNNccajunPf', 'revision_id': 11, 'title': 'Test Doc'})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pprint import pprint\n",
+    "\n",
+    "larksuite_loader = LarkSuiteDocLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)\n",
+    "docs = larksuite_loader.load()\n",
+    "\n",
+    "pprint(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ccc1e2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# see https://python.langchain.com/docs/use_cases/summarization for more details\n",
+    "from langchain.chains.summarize import load_summarize_chain\n",
+    "# `llm` must be an instantiated LangChain LLM, e.g. llm = OpenAI(temperature=0)\n",
+    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
+    "chain.run(docs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index cf3e243a78c964..a32397ab96d6c3 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -63,6 +63,7 @@
 from langchain.document_loaders.iugu import IuguLoader
 from langchain.document_loaders.joplin import JoplinLoader
 from langchain.document_loaders.json_loader import JSONLoader
+from langchain.document_loaders.larksuite import LarkSuiteDocLoader
 from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
 from langchain.document_loaders.mastodon import MastodonTootsLoader
 from langchain.document_loaders.max_compute import MaxComputeLoader
@@ -204,6 +205,7 @@
     "IuguLoader",
     "JSONLoader",
     "JoplinLoader",
+    "LarkSuiteDocLoader",
     "MWDumpLoader",
     "MastodonTootsLoader",
     "MathpixPDFLoader",
diff --git a/langchain/document_loaders/larksuite.py b/langchain/document_loaders/larksuite.py
new file mode 100644
index 00000000000000..b7263e44703fac
--- /dev/null
+++ b/langchain/document_loaders/larksuite.py
@@ -0,0 +1,46 @@
+"""Loader that loads LarkSuite (FeiShu) documents via the LarkSuite open API."""
+import json
+import urllib.request
+from typing import Any, Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class LarkSuiteDocLoader(BaseLoader):
+    """Loader that loads a LarkSuite (FeiShu) document."""
+
+    def __init__(self, domain: str, access_token: str, document_id: str):
+        """Initialize with domain, access_token (tenant / user), and document_id."""
+        self.domain = domain
+        self.access_token = access_token
+        self.document_id = document_id
+
+    def _get_larksuite_api_json_data(self, api_url: str) -> Any:
+        """Get LarkSuite (FeiShu) API response json data."""
+        headers = {"Authorization": f"Bearer {self.access_token}"}
+        request = urllib.request.Request(api_url, headers=headers)
+        with urllib.request.urlopen(request) as response:
+            json_data = json.loads(response.read().decode())
+            return json_data
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load LarkSuite (FeiShu) document."""
+        api_url_prefix = f"{self.domain}/open-apis/docx/v1/documents"
+        metadata_json = self._get_larksuite_api_json_data(
+            f"{api_url_prefix}/{self.document_id}"
+        )
+        raw_content_json = self._get_larksuite_api_json_data(
+            f"{api_url_prefix}/{self.document_id}/raw_content"
+        )
+        text = raw_content_json["data"]["content"]
+        metadata = {
+            "document_id": self.document_id,
+            "revision_id": metadata_json["data"]["document"]["revision_id"],
+            "title": metadata_json["data"]["document"]["title"],
+        }
+        yield Document(page_content=text, metadata=metadata)
+
+    def load(self) -> List[Document]:
+        """Load LarkSuite (FeiShu) document."""
+        return list(self.lazy_load())
diff --git a/tests/integration_tests/document_loaders/test_larksuite.py b/tests/integration_tests/document_loaders/test_larksuite.py
new file mode 100644
index 00000000000000..147d8ee8018bdb
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_larksuite.py
@@ -0,0 +1,14 @@
+from langchain.document_loaders.larksuite import LarkSuiteDocLoader
+
+DOMAIN = ""  # fill in DOMAIN, ACCESS_TOKEN, and DOCUMENT_ID before running
+ACCESS_TOKEN = ""
+DOCUMENT_ID = ""
+
+
+def test_larksuite_doc_loader() -> None:
+    """Test LarkSuite (FeiShu) document loader."""
+    loader = LarkSuiteDocLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)
+    docs = loader.load()
+
+    assert len(docs) == 1
+    assert docs[0].page_content is not None
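
The integration test above requires real LarkSuite credentials. As a complement, here is a minimal offline sketch (not part of the patch) of how the loader could be exercised without network access: it stubs `urllib.request.urlopen`, which `LarkSuiteDocLoader` uses for both REST calls, and returns payloads shaped like the fields `lazy_load` reads (`data.content`, `data.document.revision_id`, `data.document.title`). The domain, token, document id, and payload values below are placeholders, not real LarkSuite data.

```python
# Offline sketch: fake the two LarkSuite endpoints the loader calls so it can be
# exercised without credentials. Payload shapes mirror what lazy_load() reads;
# every concrete value is a placeholder.
import json
from unittest import mock

from langchain.document_loaders.larksuite import LarkSuiteDocLoader


class _FakeResponse:
    """Mimics the context-manager object returned by urllib.request.urlopen()."""

    def __init__(self, payload: dict) -> None:
        self._body = json.dumps(payload).encode()

    def read(self) -> bytes:
        return self._body

    def __enter__(self) -> "_FakeResponse":
        return self

    def __exit__(self, *exc: object) -> None:
        pass


def _fake_urlopen(request, *args, **kwargs):
    # The loader hits <domain>/open-apis/docx/v1/documents/<id> for metadata
    # and .../raw_content for the document text; route on the URL suffix.
    if request.full_url.endswith("/raw_content"):
        return _FakeResponse({"data": {"content": "Test Doc\nThis is a Test Doc\n"}})
    return _FakeResponse(
        {"data": {"document": {"revision_id": 1, "title": "Test Doc"}}}
    )


with mock.patch("urllib.request.urlopen", _fake_urlopen):
    loader = LarkSuiteDocLoader(
        domain="https://open.larksuite.com",  # placeholder domain
        access_token="fake-token",  # placeholder token
        document_id="fake-document-id",  # placeholder document id
    )
    docs = loader.load()

assert len(docs) == 1
assert docs[0].metadata["title"] == "Test Doc"
```

A test along these lines could sit beside the integration test so the parsing logic stays covered in CI even when no tenant_access_token is available.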