Harrison/unstructured page number (langchain-ai#6464)

Co-authored-by: Reza Sanaie <reza@sanaie.ca>
kacperlukawski · Jun 29, 2023 · 165c600 · 165c600
1 parent c3c7d6d
commit 165c600
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 4 deletions.
diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb
@@ -226,13 +226,17 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "8de9ef16",
    "metadata": {},
    "source": [
     "## PDF Example\n",
     "\n",
-    "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
+    "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. The available modes of operation are:\n",
+    "- `single`: the text from all elements is combined into a single document (default)\n",
+    "- `elements`: each element is kept as an individual document\n",
+    "- `paged`: the text from the elements on each page is combined into one document per page"
    ]
   },
   {

diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py
@@ -1,7 +1,7 @@
 """Loader that uses unstructured to load files."""
 import collections
 from abc import ABC, abstractmethod
-from typing import IO, Any, List, Sequence, Union
+from typing import IO, Any, Dict, List, Sequence, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -45,7 +45,7 @@ def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
                 "unstructured package not found, please install it with "
                 "`pip install unstructured`"
             )
-        _valid_modes = {"single", "elements"}
+        _valid_modes = {"single", "elements", "paged"}
         if mode not in _valid_modes:
             raise ValueError(
                 f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
@@ -80,6 +80,31 @@ def load(self) -> List[Document]:
                 if hasattr(element, "category"):
                     metadata["category"] = element.category
                 docs.append(Document(page_content=str(element), metadata=metadata))
+        elif self.mode == "paged":
+            text_dict: Dict[int, str] = {}
+            meta_dict: Dict[int, Dict] = {}
+
+            for idx, element in enumerate(elements):
+                metadata = self._get_metadata()
+                if hasattr(element, "metadata"):
+                    metadata.update(element.metadata.to_dict())
+                page_number = metadata.get("page_number", 1)
+
+                # Check if this page_number already exists in text_dict
+                if page_number not in text_dict:
+                    # If not, create new entry with initial text and metadata
+                    text_dict[page_number] = str(element) + "\n\n"
+                    meta_dict[page_number] = metadata
+                else:
+                    # If exists, append to text and update the metadata
+                    text_dict[page_number] += str(element) + "\n\n"
+                    meta_dict[page_number].update(metadata)
+
+            # Convert the dict to a list of Document objects
+            docs = [
+                Document(page_content=text_dict[key], metadata=meta_dict[key])
+                for key in text_dict.keys()
+            ]
         elif self.mode == "single":
             metadata = self._get_metadata()
             text = "\n\n".join([str(el) for el in elements])

diff --git a/tests/integration_tests/document_loaders/test_pdf.py b/tests/integration_tests/document_loaders/test_pdf.py
@@ -11,7 +11,25 @@
 )
 
 
-def test_unstructured_pdf_loader() -> None:
+def test_unstructured_pdf_loader_elements_mode() -> None:
+    """Test unstructured loader in `elements` mode."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = UnstructuredPDFLoader(str(file_path), mode="elements")
+    docs = loader.load()
+
+    assert len(docs) == 2
+
+
+def test_unstructured_pdf_loader_paged_mode() -> None:
+    """Test unstructured loader in `paged` mode."""
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = UnstructuredPDFLoader(str(file_path), mode="paged")
+    docs = loader.load()
+
+    assert len(docs) == 16
+
+
+def test_unstructured_pdf_loader_default_mode() -> None:
     """Test unstructured loader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
     loader = UnstructuredPDFLoader(str(file_path))