Skip to content

Commit

Permalink
core[patch]: Further restrict recursive URL loader (#15559)
Browse files Browse the repository at this point in the history
Includes code from this PR: HEAD...m0kr4n3:security/fix_ssrf,
with additional fixes.

Unit tests cover the new cases.
  • Loading branch information
eyurtsev committed Jan 4, 2024
1 parent 817b84d commit bf0b3cc
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 11 deletions.
36 changes: 25 additions & 11 deletions libs/core/langchain_core/utils/html.py
Expand Up @@ -67,23 +67,37 @@ def extract_sub_links(
Returns:
List[str]: sub links
"""
base_url = base_url if base_url is not None else url
base_url_to_use = base_url if base_url is not None else url
parsed_base_url = urlparse(base_url_to_use)
all_links = find_all_links(raw_html, pattern=pattern)
absolute_paths = set()
for link in all_links:
parsed_link = urlparse(link)
# Some may be absolute links like https://to/path
if link.startswith("http"):
absolute_paths.add(link)
if parsed_link.scheme == "http" or parsed_link.scheme == "https":
absolute_path = link
# Some may have omitted the protocol like //to/path
elif link.startswith("//"):
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
absolute_path = f"{urlparse(url).scheme}:{link}"
else:
absolute_paths.add(urljoin(url, link))
res = []
absolute_path = urljoin(url, parsed_link.path)
absolute_paths.add(absolute_path)

results = []
for path in absolute_paths:
if any(path.startswith(exclude) for exclude in exclude_prefixes):
continue
if prevent_outside and not path.startswith(base_url):
if any(path.startswith(exclude_prefix) for exclude_prefix in exclude_prefixes):
continue
res.append(path)
return res

if prevent_outside:
parsed_path = urlparse(path)

if parsed_base_url.netloc != parsed_path.netloc:
continue

# Will take care of verifying rest of path after netloc
# if it's more specific
if not path.startswith(base_url_to_use):
continue

results.append(path)
return results
27 changes: 27 additions & 0 deletions libs/core/tests/unit_tests/utils/test_html.py
Expand Up @@ -156,3 +156,30 @@ def test_extract_sub_links_exclude() -> None:
)
)
assert actual == expected


def test_prevent_outside() -> None:
    """Verify that ``prevent_outside`` checks links against the full base URL.

    Of the anchors fed in below, only the one labelled OK is expected back;
    every anchor labelled BAD must be filtered out.
    """
    anchors = (
        '<a href="https://foobar.comic.com">BAD</a>',
        '<a href="https://foobar.comic:9999">BAD</a>',
        '<a href="https://foobar.com:9999">BAD</a>',
        '<a href="http://foobar.com:9999/">BAD</a>',
        '<a href="https://foobar.com/OK">OK</a>',
        # The scheme differs from the base URL's, so this one is not OK.
        '<a href="http://foobar.com/BAD">BAD</a>',
    )
    page = "".join(anchors)

    found = extract_sub_links(
        page,
        "https://foobar.com/hello/bill.html",
        base_url="https://foobar.com",
        prevent_outside=True,
    )

    # Sort before comparing, as the original test does.
    assert sorted(found) == ["https://foobar.com/OK"]

0 comments on commit bf0b3cc

Please sign in to comment.