langchain-ai · baskaryan · Sep 21, 2023 · Sep 21, 2023
diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -63,6 +63,7 @@ def __init__(
         prevent_outside: Optional[bool] = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
+        check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
         Args:
@@ -84,6 +85,8 @@ def __init__(
             prevent_outside: If True, prevent loading from urls which are not children
                 of the root url.
             link_regex: Regex for extracting sub-links from the raw html of a web page.
+            check_response_status: If True, check HTTP response status and skip
+                URLs with error responses (400-599).
         """
 
         self.url = url
@@ -101,6 +104,7 @@ def __init__(
         self.link_regex = link_regex
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
+        self.check_response_status = check_response_status
 
     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -123,8 +127,13 @@ def _get_child_links_recursive(
         visited.add(url)
         try:
             response = requests.get(url, timeout=self.timeout, headers=self.headers)
-        except Exception:
-            logger.warning(f"Unable to load from {url}")
+            if self.check_response_status and 400 <= response.status_code <= 599:
+                raise ValueError(f"Received HTTP status {response.status_code}")
+        except Exception as e:
+            logger.warning(
+                f"Unable to load from {url}. Received error {e} of type "
+                f"{e.__class__.__name__}"
+            )
             return
         content = self.extractor(response.text)
         if content:
@@ -193,6 +202,8 @@ async def _async_get_child_links_recursive(
         try:
             async with session.get(url) as response:
                 text = await response.text()
+                if self.check_response_status and 400 <= response.status <= 599:
+                    raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
             logger.warning(
                 f"Unable to load {url}. Received error {e} of type "

diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py b/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py
@@ -12,9 +12,10 @@ def test_async_recursive_url_loader() -> None:
         use_async=True,
         max_depth=3,
         timeout=None,
+        check_response_status=True,
     )
     docs = loader.load()
-    assert len(docs) == 890
+    assert len(docs) == 513
     assert docs[0].page_content == "placeholder"