Skip to content

Commit

Permalink
recursive loader: add status check (#10891)
Browse files Browse the repository at this point in the history
  • Loading branch information
baskaryan committed Sep 21, 2023
1 parent 6e02c45 commit c1f9cc0
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
15 changes: 13 additions & 2 deletions libs/langchain/langchain/document_loaders/recursive_url_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
prevent_outside: Optional[bool] = True,
link_regex: Union[str, re.Pattern, None] = None,
headers: Optional[dict] = None,
check_response_status: bool = False,
) -> None:
"""Initialize with URL to crawl and any subdirectories to exclude.
Args:
Expand All @@ -84,6 +85,8 @@ def __init__(
prevent_outside: If True, prevent loading from urls which are not children
of the root url.
link_regex: Regex for extracting sub-links from the raw html of a web page.
check_response_status: If True, check HTTP response status and skip
URLs with error responses (400-599).
"""

self.url = url
Expand All @@ -101,6 +104,7 @@ def __init__(
self.link_regex = link_regex
self._lock = asyncio.Lock() if self.use_async else None
self.headers = headers
self.check_response_status = check_response_status

def _get_child_links_recursive(
self, url: str, visited: Set[str], *, depth: int = 0
Expand All @@ -123,8 +127,13 @@ def _get_child_links_recursive(
visited.add(url)
try:
response = requests.get(url, timeout=self.timeout, headers=self.headers)
except Exception:
logger.warning(f"Unable to load from {url}")
if self.check_response_status and 400 <= response.status_code <= 599:
raise ValueError(f"Received HTTP status {response.status_code}")
except Exception as e:
logger.warning(
f"Unable to load from {url}. Received error {e} of type "
f"{e.__class__.__name__}"
)
return
content = self.extractor(response.text)
if content:
Expand Down Expand Up @@ -193,6 +202,8 @@ async def _async_get_child_links_recursive(
try:
async with session.get(url) as response:
text = await response.text()
if self.check_response_status and 400 <= response.status <= 599:
raise ValueError(f"Received HTTP status {response.status}")
except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
logger.warning(
f"Unable to load {url}. Received error {e} of type "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ def test_async_recursive_url_loader() -> None:
use_async=True,
max_depth=3,
timeout=None,
check_response_status=True,
)
docs = loader.load()
assert len(docs) == 890
assert len(docs) == 513
assert docs[0].page_content == "placeholder"


Expand Down

0 comments on commit c1f9cc0

Please sign in to comment.