Skip to content

Commit

Permalink
Amazon metadata: When filtering search engine results by title ignore…
Browse files Browse the repository at this point in the history
… words of the title that are purely punctuation
  • Loading branch information
kovidgoyal committed Jun 17, 2024
1 parent 43121af commit 44bceac
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/calibre/ebooks/metadata/sources/amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -1082,7 +1082,7 @@ def parse_language(self, pd):
class Amazon(Source):

name = 'Amazon.com'
version = (1, 3, 7)
version = (1, 3, 8)
minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon')

Expand Down Expand Up @@ -1684,13 +1684,20 @@ def filter_result(self, title, authors, identifiers, mi, log): # {{{
if not self.use_search_engine:
return True
if title is not None:
import regex
only_punctuation_pat = regex.compile(r'^\p{P}+$')

def tokenize_title(x):
    '''
    Normalize a single whitespace-separated word of a title for matching.

    The word is lower-cased with icu_lower, single and double quote
    characters are removed and trailing colons are stripped. If what is
    left matches only_punctuation_pat (i.e. consists purely of
    punctuation), an empty string is returned instead so that the word
    cannot take part in the title comparison; callers discard '' from
    the resulting token sets.
    '''
    ans = icu_lower(x).replace("'", '').replace('"', '').rstrip(':')
    # A token such as '--' or '...' carries no information about the
    # title, so map it to '' rather than letting it cause a match.
    if only_punctuation_pat.match(ans) is not None:
        ans = ''
    return ans

tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
tokens.discard('')
if tokens:
result_tokens = {tokenize_title(x) for x in mi.title.split()}
result_tokens.discard('')
if not tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'as its title does not match')
return False
Expand Down

0 comments on commit 44bceac

Please sign in to comment.