Skip to content

Commit

Permalink
Use default load() implementation in doc loaders
Browse files: browse the repository at this point in the history
  • Loading branch information
cbornet committed Mar 1, 2024
1 parent 1deb8ca commit 707b603
Show file tree
Hide file tree
Showing 49 changed files with 22 additions and 253 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from pathlib import Path
from typing import Iterator, List
from typing import Iterator

from langchain_core.documents import Document

Expand Down Expand Up @@ -74,6 +74,3 @@ def lazy_load(self) -> Iterator[Document]:
}

yield Document(page_content=text, metadata=metadata)

def load(self) -> List[Document]:
return list(self.lazy_load())
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Iterator, List, Mapping, Optional
from typing import Any, Callable, Iterator, Mapping, Optional

from langchain_core.documents import Document
from langchain_core.utils.utils import guard_import
Expand Down Expand Up @@ -53,9 +53,6 @@ def _handle_record(
self._stream_name = stream_name
self._state = state

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
return self._integration._load_data(
stream_name=self._stream_name, state=self._state
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, List
from typing import Iterator

from langchain_core.documents import Document

Expand Down Expand Up @@ -34,7 +34,3 @@ def lazy_load(self) -> Iterator[Document]:
"table_id": self.table_id,
},
)

def load(self) -> List[Document]:
"""Load Documents from table."""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,3 @@ def lazy_load(self) -> Iterator[Document]:
)

yield Document(page_content=page_content, metadata=metadata)

def load(self) -> List[Document]:
"""Load all records from FeatureLayer."""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,6 @@ def __init__(
self.nb_prefetched = nb_prefetched
self.extraction_function = extraction_function

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
for doc in self.collection.paginated_find(
filter=self.filter,
Expand Down
4 changes: 0 additions & 4 deletions libs/community/langchain_community/document_loaders/athena.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,3 @@ def lazy_load(self) -> Iterator[Document]:
}
doc = Document(page_content=page_content, metadata=metadata)
yield doc

def load(self) -> List[Document]:
"""Load data into document objects."""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, List, Optional
from typing import Iterator, Optional

from langchain_community.docstore.document import Document
from langchain_community.document_loaders.base import BaseLoader
Expand All @@ -16,10 +16,6 @@ def __init__(self, url: str, glob: Optional[str] = None):
self.glob_pattern = glob
"""Optional glob pattern to select files. Defaults to None."""

def load(self) -> List[Document]:
"""Load documents."""
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
"""A lazy loader for Documents."""
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterator, List
from typing import Any, Iterator

from langchain_core.documents import Document

Expand All @@ -18,9 +18,6 @@ def __init__(self, conf: Any, bucket: str, prefix: str = ""):
self.bucket = bucket
self.prefix = prefix

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
"""Load documents."""
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
import tempfile
from typing import Any, Iterator, List
from typing import Any, Iterator

from langchain_core.documents import Document

Expand All @@ -24,9 +24,6 @@ def __init__(self, conf: Any, bucket: str, key: str):
self.bucket = bucket
self.key = key

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
"""Load documents."""
try:
Expand Down
13 changes: 0 additions & 13 deletions libs/community/langchain_community/document_loaders/bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,3 @@ def lazy_load(self) -> Iterator[Document]:
doc = self._load_entry(entry)
if doc:
yield doc

def load(self) -> List[Document]:
"""Load bibtex file documents from the given bibtex file path.
See https://bibtexparser.readthedocs.io/en/master/
Args:
file_path: the path to the bibtex file
Returns:
a list of documents with the document.page_content in text format
"""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,3 @@ def lazy_load(self) -> Iterator[Document]:
"source": url,
},
)

def load(self) -> List[Document]:
"""Load Documents from URLs."""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
Any,
Callable,
Iterator,
List,
Optional,
Sequence,
Union,
Expand Down Expand Up @@ -106,9 +105,6 @@ def __init__(
if query_execution_profile is not _NOT_SET:
self.query_kwargs["execution_profile"] = query_execution_profile

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
for row in self.session.execute(self.query, **self.query_kwargs):
metadata = self.metadata.copy()
Expand Down
11 changes: 0 additions & 11 deletions libs/community/langchain_community/document_loaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,3 @@ def lazy_load(self) -> Iterator[Document]:
html_content = asyncio.run(self.ascrape_playwright(url))
metadata = {"source": url}
yield Document(page_content=html_content, metadata=metadata)

def load(self) -> List[Document]:
"""
Load and return all Documents from the provided URLs.
Returns:
List[Document]: A list of Document objects
containing the scraped content from each URL.
"""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@ def __init__(
self.page_content_fields = page_content_fields
self.metadata_fields = metadata_fields

def load(self) -> List[Document]:
"""Load Couchbase data into Document objects."""
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
"""Load Couchbase data into Document objects lazily."""
from datetime import timedelta
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterator, List
from typing import Any, Iterator

from langchain_core.documents import Document

Expand Down Expand Up @@ -26,10 +26,6 @@ def lazy_load(self) -> Iterator[Document]:
metadata.pop(self.page_content_column)
yield Document(page_content=text, metadata=metadata)

def load(self) -> List[Document]:
"""Load full dataframe."""
return list(self.lazy_load())


class DataFrameLoader(BaseDataFrameLoader):
"""Load `Pandas` DataFrame."""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, List, Optional
from typing import Iterator, Optional

from langchain_core.documents import Document

Expand Down Expand Up @@ -77,10 +77,6 @@ def __init__(
mode=mode,
)

def load(self) -> List[Document]:
"""Load given path as pages."""
return list(self.lazy_load())

def lazy_load(
self,
) -> Iterator[Document]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,6 @@ def lazy_load(self) -> Iterator[Document]:
for doc in result:
yield doc

def load(self) -> List[Document]:
"""Load transactions from a specific account by Etherscan."""
return list(self.lazy_load())

def getNormTx(self) -> List[Document]:
url = (
f"https://api.etherscan.io/api?module=account&action=txlist&address={self.account_address}"
Expand Down
5 changes: 1 addition & 4 deletions libs/community/langchain_community/document_loaders/fauna.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, List, Optional, Sequence
from typing import Iterator, Optional, Sequence

from langchain_core.documents import Document

Expand Down Expand Up @@ -28,9 +28,6 @@ def __init__(
self.secret = secret
self.metadata_fields = metadata_fields

def load(self) -> List[Document]:
return list(self.lazy_load())

def lazy_load(self) -> Iterator[Document]:
try:
from fauna import Page, fql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,6 @@ def lazy_load(
for blob in self.blob_loader.yield_blobs():
yield from self.blob_parser.lazy_parse(blob)

def load(self) -> List[Document]:
"""Load all documents."""
return list(self.lazy_load())

def load_and_split(
self, text_splitter: Optional[TextSplitter] = None
) -> List[Document]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterator, List
from typing import Any, Iterator

from langchain_core.documents import Document

Expand Down Expand Up @@ -67,7 +67,3 @@ def lazy_load(self) -> Iterator[Document]:

# using WKT instead of str() to help GIS system interoperability
yield Document(page_content=geom.wkt, metadata=metadata)

def load(self) -> List[Document]:
"""Load full dataframe."""
return list(self.lazy_load())
26 changes: 0 additions & 26 deletions libs/community/langchain_community/document_loaders/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,32 +127,6 @@ def lazy_load(self) -> Iterator[Document]:
else:
url = None

def load(self) -> List[Document]:
"""
Get issues of a GitHub repository.
Returns:
A list of Documents with attributes:
- page_content
- metadata
- url
- title
- creator
- created_at
- last_update_time
- closed_time
- number of comments
- state
- labels
- assignee
- assignees
- milestone
- locked
- number
- is_pull_request
"""
return list(self.lazy_load())

def parse_issue(self, issue: dict) -> Document:
"""Create Document objects from a list of GitHub issues."""
metadata = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from typing import Iterator, List, Mapping, Optional, Sequence, Union
from typing import Iterator, Mapping, Optional, Sequence, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -84,10 +84,6 @@ def lazy_load(
for row in dataset[key]
)

def load(self) -> List[Document]:
"""Load documents."""
return list(self.lazy_load())

def parse_obj(self, page_content: Union[str, object]) -> str:
if isinstance(page_content, object):
return json.dumps(page_content)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,3 @@ def lazy_load(self) -> Iterator[Document]:
page_content=readme_content,
metadata=model,
)

def load(self) -> List[Document]:
"""Load model information, including README content."""
return list(self.lazy_load())
3 changes: 0 additions & 3 deletions libs/community/langchain_community/document_loaders/joplin.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,3 @@ def _convert_date(self, date: int) -> str:

def lazy_load(self) -> Iterator[Document]:
yield from self._get_notes()

def load(self) -> List[Document]:
return list(self.lazy_load())
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import urllib.request
from typing import Any, Iterator, List
from typing import Any, Iterator

from langchain_core.documents import Document

Expand Down Expand Up @@ -46,7 +46,3 @@ def lazy_load(self) -> Iterator[Document]:
"title": metadata_json["data"]["document"]["title"],
}
yield Document(page_content=text, metadata=metadata)

def load(self) -> List[Document]:
"""Load LarkSuite (FeiShu) document."""
return list(self.lazy_load())
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Any, Iterator, List, Optional, Sequence
from typing import Any, Iterator, Optional, Sequence

from langchain_core.documents import Document

Expand Down Expand Up @@ -78,6 +78,3 @@ def lazy_load(self) -> Iterator[Document]:
else:
metadata = {k: v for k, v in row.items() if k not in page_content_data}
yield Document(page_content=page_content, metadata=metadata)

def load(self) -> List[Document]:
return list(self.lazy_load())
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Union
from typing import Iterator, Optional, Sequence, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -87,11 +87,6 @@ def _load_single_page_from_dump(self, page) -> Document: # type: ignore[no-unty
metadata = {"source": page.title}
return Document(page_content=text, metadata=metadata)

def load(self) -> List[Document]:
"""Load from a file path."""

return [doc for doc in self.lazy_load()]

def lazy_load(
self,
) -> Iterator[Document]:
Expand Down
Loading

0 comments on commit 707b603

Please sign in to comment.