Skip to content

Commit bf0b3cc

Browse files
authored
core[patch]: Further restrict recursive URL loader (#15559)
Includes code from this PR: HEAD...m0kr4n3:security/fix_ssrf, with additional fixes. Unit tests cover the new test cases.
1 parent 817b84d commit bf0b3cc

File tree

2 files changed

+52
-11
lines changed

2 files changed

+52
-11
lines changed

Diff for: libs/core/langchain_core/utils/html.py

+25-11
Original file line numberDiff line numberDiff line change
@@ -67,23 +67,37 @@ def extract_sub_links(
6767
Returns:
6868
List[str]: sub links
6969
"""
70-
base_url = base_url if base_url is not None else url
70+
base_url_to_use = base_url if base_url is not None else url
71+
parsed_base_url = urlparse(base_url_to_use)
7172
all_links = find_all_links(raw_html, pattern=pattern)
7273
absolute_paths = set()
7374
for link in all_links:
75+
parsed_link = urlparse(link)
7476
# Some may be absolute links like https://to/path
75-
if link.startswith("http"):
76-
absolute_paths.add(link)
77+
if parsed_link.scheme == "http" or parsed_link.scheme == "https":
78+
absolute_path = link
7779
# Some may have omitted the protocol like //to/path
7880
elif link.startswith("//"):
79-
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
81+
absolute_path = f"{urlparse(url).scheme}:{link}"
8082
else:
81-
absolute_paths.add(urljoin(url, link))
82-
res = []
83+
absolute_path = urljoin(url, parsed_link.path)
84+
absolute_paths.add(absolute_path)
85+
86+
results = []
8387
for path in absolute_paths:
84-
if any(path.startswith(exclude) for exclude in exclude_prefixes):
85-
continue
86-
if prevent_outside and not path.startswith(base_url):
88+
if any(path.startswith(exclude_prefix) for exclude_prefix in exclude_prefixes):
8789
continue
88-
res.append(path)
89-
return res
90+
91+
if prevent_outside:
92+
parsed_path = urlparse(path)
93+
94+
if parsed_base_url.netloc != parsed_path.netloc:
95+
continue
96+
97+
# Will take care of verifying rest of path after netloc
98+
# if it's more specific
99+
if not path.startswith(base_url_to_use):
100+
continue
101+
102+
results.append(path)
103+
return results

Diff for: libs/core/tests/unit_tests/utils/test_html.py

+27
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,30 @@ def test_extract_sub_links_exclude() -> None:
156156
)
157157
)
158158
assert actual == expected
159+
160+
161+
def test_prevent_outside() -> None:
162+
"""Test that prevent outside compares against full base URL."""
163+
html = (
164+
'<a href="https://foobar.comic.com">BAD</a>'
165+
'<a href="https://foobar.comic:9999">BAD</a>'
166+
'<a href="https://foobar.com:9999">BAD</a>'
167+
'<a href="http://foobar.com:9999/">BAD</a>'
168+
'<a href="https://foobar.com/OK">OK</a>'
169+
'<a href="http://foobar.com/BAD">BAD</a>' # Change in scheme is not OK here
170+
)
171+
172+
expected = sorted(
173+
[
174+
"https://foobar.com/OK",
175+
]
176+
)
177+
actual = sorted(
178+
extract_sub_links(
179+
html,
180+
"https://foobar.com/hello/bill.html",
181+
base_url="https://foobar.com",
182+
prevent_outside=True,
183+
)
184+
)
185+
assert actual == expected

0 commit comments

Comments
 (0)