diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py index 95e1c3c2f45787..bbea15f0fa4350 100644 --- a/libs/core/langchain_core/utils/html.py +++ b/libs/core/langchain_core/utils/html.py @@ -67,23 +67,37 @@ def extract_sub_links( Returns: List[str]: sub links """ - base_url = base_url if base_url is not None else url + base_url_to_use = base_url if base_url is not None else url + parsed_base_url = urlparse(base_url_to_use) all_links = find_all_links(raw_html, pattern=pattern) absolute_paths = set() for link in all_links: + parsed_link = urlparse(link) # Some may be absolute links like https://to/path - if link.startswith("http"): - absolute_paths.add(link) + if parsed_link.scheme == "http" or parsed_link.scheme == "https": + absolute_path = link # Some may have omitted the protocol like //to/path elif link.startswith("//"): - absolute_paths.add(f"{urlparse(url).scheme}:{link}") + absolute_path = f"{urlparse(url).scheme}:{link}" else: - absolute_paths.add(urljoin(url, link)) - res = [] + absolute_path = urljoin(url, parsed_link.path) + absolute_paths.add(absolute_path) + + results = [] for path in absolute_paths: - if any(path.startswith(exclude) for exclude in exclude_prefixes): - continue - if prevent_outside and not path.startswith(base_url): + if any(path.startswith(exclude_prefix) for exclude_prefix in exclude_prefixes): continue - res.append(path) - return res + + if prevent_outside: + parsed_path = urlparse(path) + + if parsed_base_url.netloc != parsed_path.netloc: + continue + + # Will take care of verifying rest of path after netloc + # if it's more specific + if not path.startswith(base_url_to_use): + continue + + results.append(path) + return results diff --git a/libs/core/tests/unit_tests/utils/test_html.py b/libs/core/tests/unit_tests/utils/test_html.py index 117d3698a74d07..a2c80f6e65484d 100644 --- a/libs/core/tests/unit_tests/utils/test_html.py +++ b/libs/core/tests/unit_tests/utils/test_html.py @@ -156,3 +156,30 @@ def test_extract_sub_links_exclude() -> None: ) ) assert actual == expected + + +def test_prevent_outside() -> None: + """Test that prevent outside compares against full base URL.""" + html = ( + 'BAD' + 'BAD' + 'BAD' + 'BAD' + 'OK' + 'BAD' # Change in scheme is not OK here + ) + + expected = sorted( + [ + "https://foobar.com/OK", + ] + ) + actual = sorted( + extract_sub_links( + html, + "https://foobar.com/hello/bill.html", + base_url="https://foobar.com", + prevent_outside=True, + ) + ) + assert actual == expected