Add security note to recursive url loader (#11934)
Add security note to recursive loader
eyurtsev committed Oct 17, 2023
1 parent 42dcc50 commit 9ecb724
Showing 1 changed file with 32 additions and 2 deletions.
34 changes: 32 additions & 2 deletions libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:


 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+        at a given URL and then expand to crawl child links recursively.
+
+        Web crawlers should generally NOT be deployed with network access
+        to any internal servers.
+
+        Control access to who can submit crawling requests and what network access
+        the crawler has.
+
+        While crawling, the crawler may encounter malicious URLs that would lead to a
+        server-side request forgery (SSRF) attack.
+
+        To mitigate risks, the crawler by default will only load URLs from the same
+        domain as the start URL (controlled via prevent_outside named argument).
+
+        This will mitigate the risk of SSRF attacks, but will not eliminate it.
+
+        For example, if crawling a host which hosts several sites:
+
+        https://some_host/alice_site/
+        https://some_host/bob_site/
+
+        A malicious URL on Alice's site could cause the crawler to make a malicious
+        GET request to an endpoint on Bob's site. Both sites are hosted on the
+        same host, so such a request would not be prevented by default.
+
+        See https://python.langchain.com/docs/security
+    """

     def __init__(
         self,
@@ -60,12 +89,13 @@ def __init__(
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
+
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
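For reference, a minimal usage sketch of the loader with the same-domain restriction described in the new docstring. This illustration is not part of the commit; it is written against the signature shown in the diff, and the start URL is hypothetical.

# Minimal usage sketch, assuming the langchain package as of this commit.
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

loader = RecursiveUrlLoader(
    url="https://docs.python.org/3.9/",  # hypothetical start URL
    max_depth=2,           # how far to recurse into child links
    timeout=10,            # per-request timeout in seconds
    prevent_outside=True,  # default: only follow links on the start URL's domain
)

docs = loader.load()
print(f"Loaded {len(docs)} pages")

With prevent_outside=True (the default), links pointing off the start URL's domain are skipped, which mitigates the SSRF risk described above but, as the docstring notes, does not eliminate it for multi-site hosts.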
