Skip to content

Commit

Permalink
Harrison/unstructured page number (langchain-ai#6464)
Browse files Browse the repository at this point in the history
Co-authored-by: Reza Sanaie <reza@sanaie.ca>
  • Loading branch information
2 people authored and kacperlukawski committed Jun 29, 2023
1 parent c3c7d6d commit 165c600
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,17 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8de9ef16",
"metadata": {},
"source": [
"## PDF Example\n",
"\n",
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. The available modes of operation are:\n",
"- `single`: the text from all elements is combined into one document (default)\n",
"- `elements`: individual elements are maintained\n",
"- `paged`: the text of the elements is combined separately for each page"
]
},
{
Expand Down
29 changes: 27 additions & 2 deletions langchain/document_loaders/unstructured.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from typing import IO, Any, List, Sequence, Union
from typing import IO, Any, Dict, List, Sequence, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
Expand Down Expand Up @@ -45,7 +45,7 @@ def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
_valid_modes = {"single", "elements"}
_valid_modes = {"single", "elements", "paged"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
Expand Down Expand Up @@ -80,6 +80,31 @@ def load(self) -> List[Document]:
if hasattr(element, "category"):
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "paged":
text_dict: Dict[int, str] = {}
meta_dict: Dict[int, Dict] = {}

for idx, element in enumerate(elements):
metadata = self._get_metadata()
if hasattr(element, "metadata"):
metadata.update(element.metadata.to_dict())
page_number = metadata.get("page_number", 1)

# Check if this page_number already exists in text_dict
if page_number not in text_dict:
# If not, create new entry with initial text and metadata
text_dict[page_number] = str(element) + "\n\n"
meta_dict[page_number] = metadata
else:
# If exists, append to text and update the metadata
text_dict[page_number] += str(element) + "\n\n"
meta_dict[page_number].update(metadata)

# Convert the dict to a list of Document objects
docs = [
Document(page_content=text_dict[key], metadata=meta_dict[key])
for key in text_dict.keys()
]
elif self.mode == "single":
metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements])
Expand Down
20 changes: 19 additions & 1 deletion tests/integration_tests/document_loaders/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,25 @@
)


def test_unstructured_pdf_loader() -> None:
def test_unstructured_pdf_loader_elements_mode() -> None:
    """Load the example hello.pdf in ``elements`` mode and check the count.

    The loader is expected to return exactly two documents for this file.
    """
    examples_root = Path(__file__).parent.parent
    pdf_path = examples_root / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="elements").load()

    assert len(documents) == 2


def test_unstructured_pdf_loader_paged_mode() -> None:
    """Load the example layout-parser paper in ``paged`` mode and check the count.

    The loader is expected to return exactly sixteen documents for this file.
    """
    examples_root = Path(__file__).parent.parent
    pdf_path = examples_root / "examples/layout-parser-paper.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="paged").load()

    assert len(documents) == 16


def test_unstructured_pdf_loader_default_mode() -> None:
"""Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(str(file_path))
Expand Down

0 comments on commit 165c600

Please sign in to comment.