Skip to content

Commit

Permalink
Add bucket and object key to metadata in S3 loader (#9317)
Browse files Browse the repository at this point in the history
- Description: this PR adds `s3_object_key` and `s3_bucket` to the document
metadata when loading an S3 file. This is particularly useful when using
`S3DirectoryLoader` to remove files from the directory once they have been
processed (recovering the object keys from the metadata `source` field
seems brittle). Note: in the diff shown below, the loader's `_get_metadata`
returns only a `source` entry of the form `s3://<bucket>/<key>`.
  - Dependencies: N/A
  - Tag maintainer: ?
  - Twitter handle: _cbornet

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
  • Loading branch information
cbornet and eyurtsev committed Aug 30, 2023
1 parent 24c0b01 commit 9870bfb
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 6,
Expand Down
3 changes: 2 additions & 1 deletion docs/extras/integrations/document_loaders/aws_s3_file.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 9,
Expand Down Expand Up @@ -96,3 +96,4 @@
"nbformat": 4,
"nbformat_minor": 5
}

19 changes: 11 additions & 8 deletions libs/langchain/langchain/document_loaders/s3_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
import tempfile
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import UnstructuredBaseLoader


class S3FileLoader(BaseLoader):
class S3FileLoader(UnstructuredBaseLoader):
"""Load from `Amazon AWS S3` file."""

def __init__(self, bucket: str, key: str):
Expand All @@ -17,11 +15,14 @@ def __init__(self, bucket: str, key: str):
bucket: The name of the S3 bucket.
key: The key of the S3 object.
"""
super().__init__()
self.bucket = bucket
self.key = key

def load(self) -> List[Document]:
"""Load documents."""
def _get_elements(self) -> List:
"""Get elements."""
from unstructured.partition.auto import partition

try:
import boto3
except ImportError:
Expand All @@ -34,5 +35,7 @@ def load(self) -> List[Document]:
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
loader = UnstructuredFileLoader(file_path)
return loader.load()
return partition(filename=file_path)

def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}

0 comments on commit 9870bfb

Please sign in to comment.