@@ -67,23 +67,37 @@ def extract_sub_links(
6767 Returns:
6868 List[str]: sub links
6969 """
70- base_url = base_url if base_url is not None else url
70+ base_url_to_use = base_url if base_url is not None else url
71+ parsed_base_url = urlparse (base_url_to_use )
7172 all_links = find_all_links (raw_html , pattern = pattern )
7273 absolute_paths = set ()
7374 for link in all_links :
75+ parsed_link = urlparse (link )
7476 # Some may be absolute links like https://to/path
75- if link . startswith ( "http" ) :
76- absolute_paths . add ( link )
77+ if parsed_link . scheme == "http" or parsed_link . scheme == "https" :
78+ absolute_path = link
7779 # Some may have omitted the protocol like //to/path
7880 elif link .startswith ("//" ):
79- absolute_paths . add ( f"{ urlparse (url ).scheme } :{ link } " )
81+ absolute_path = f"{ urlparse (url ).scheme } :{ link } "
8082 else :
81- absolute_paths .add (urljoin (url , link ))
82- res = []
83+ absolute_path = urljoin (url , parsed_link .path )
84+ absolute_paths .add (absolute_path )
85+
86+ results = []
8387 for path in absolute_paths :
84- if any (path .startswith (exclude ) for exclude in exclude_prefixes ):
85- continue
86- if prevent_outside and not path .startswith (base_url ):
88+ if any (path .startswith (exclude_prefix ) for exclude_prefix in exclude_prefixes ):
8789 continue
88- res .append (path )
89- return res
90+
91+ if prevent_outside :
92+ parsed_path = urlparse (path )
93+
94+ if parsed_base_url .netloc != parsed_path .netloc :
95+ continue
96+
97+ # Will take care of verifying rest of path after netloc
98+ # if it's more specific
99+ if not path .startswith (base_url_to_use ):
100+ continue
101+
102+ results .append (path )
103+ return results
0 commit comments