From 9980674cf625c6fbf9adad3f1bb7bd70fd67c21f Mon Sep 17 00:00:00 2001 From: jim winstead Date: Mon, 4 Mar 2024 11:02:51 -0800 Subject: [PATCH] Fix behavior of "auto" parser so that the first parser returning links wins This is how it was actually documented to work in the code, but it had been intentionally broken to just try to work with as much as possible. With more rigorous testing of the parsers and of `add`'s behavior, we shouldn't need that. The history is explained here: https://github.com/ArchiveBox/ArchiveBox/issues/1363#issuecomment-1970294015 With this, we pass 9 of the 19 tests in tests/parser. --- archivebox/parsers/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c6f2f382f..ba5733def 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -130,10 +130,8 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, if not parsed_links: raise Exception(f'No links found using {parser_name} parser') - # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed') - if len(parsed_links) > len(most_links): - most_links = parsed_links - best_parser_name = parser_name + print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed') + break except Exception as err: # noqa # Parsers are tried one by one down the list, and the first one @@ -143,8 +141,9 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) # raise pass + timer.end() - return most_links, best_parser_name + return parsed_links, parser_name @enforce_types