Applying suggestion from 1st review

kovidgoyal · Nov 2, 2022 · 2b64808 · 2b64808
1 parent 7633531
commit 2b64808
Show file tree

Hide file tree

Showing 42 changed files with 1,286 additions and 1,525 deletions.
diff --git a/src/calibre/gui2/store/__init__.py b/src/calibre/gui2/store/__init__.py
@@ -2,9 +2,86 @@
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+from contextlib import closing
+from time import perf_counter
+
+from lxml import html
+
+from calibre import browser as create_browser, prints
+from calibre.constants import DEBUG
+from calibre.scraper.simple import read_url
 from calibre.utils.filenames import ascii_filename
 
 
+def browser_get_url(url, timeout, browser=None, user_agent=None, headers=None, data=None, novisit=False, html_parser=None, save_html_to=None):
+    """
+    Retrieve the content at the given HTTP URL,
+    and measure the time it takes to do so in DEBUG mode.
+    Uses mechanize.Browser
+
+    :param url: a URL string.
+
+    :param timeout: a numerical timeout in seconds for the HTTP request.
+
+    :param browser: an optional existing mechanize.Browser instance.
+    If not provided, a new one will be created.
+
+    :param user_agent: optional User-Agent to use if no "browser" parameter is provided.
+
+    :param headers: optional list of HTTP headers, appended to browser.addheaders
+
+    :param data: optional request data, passed unchanged to the browser open call
+
+    :param novisit: optional boolean indicating to use mechanize "novisit" method
+    when fetching web pages.
+
+    :param html_parser: an optional function to parse the fetched content.
+    By default: lxml.html.fromstring
+
+    :param save_html_to: an optional file path where the fetched content is written (binary mode).
+
+    :return: the value returned by html_parser (a parsed HTML element/document by default)
+    """
+    start_time = perf_counter()
+    if browser is None:
+        browser = create_browser(user_agent=user_agent)
+    if headers:
+        browser.addheaders.extend(headers)
+    browser_open = browser.open_novisit if novisit else browser.open
+    with closing(browser_open(url, data=data, timeout=timeout)) as web_page:
+        html_content = web_page.read()
+    if save_html_to:
+        with open(save_html_to, 'wb') as html_file:
+            html_file.write(html_content)
+    if not html_parser:
+        html_parser = html.fromstring
+    html_parsed = html_parser(html_content)
+    if DEBUG:
+        duration = perf_counter() - start_time
+        prints(f'browser_get_url took {duration:.2f}s for URL {url}')
+    return html_parsed
+
+
+def http_get_url(storage, url, timeout):
+    """
+    Fetch the content at the given HTTP URL via calibre.scraper.simple.read_url,
+    printing the elapsed time when running in DEBUG mode.
+    Uses qt.webengine and hence the chromium network stack.
+
+    :param storage: forwarded unchanged as the first argument of read_url.
+    :param url: a URL string.
+    :param timeout: a numerical timeout in seconds for the HTTP request.
+    :return: the HTML content as a string
+    """
+    begun_at = perf_counter()
+    page_source = read_url(storage, url, timeout)
+    if not DEBUG:
+        return page_source
+    duration = perf_counter() - begun_at
+    prints(f"http_get_url took {duration:.2f}s for URL {url}")
+    return page_source
+
+
 class StorePlugin:  # {{{
 
     '''

diff --git a/src/calibre/gui2/store/amazon_base.py b/src/calibre/gui2/store/amazon_base.py
@@ -2,7 +2,6 @@
 # vim:fileencoding=utf-8
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
 
-from qt.core import QUrl
 from threading import Lock
 from time import monotonic
 
@@ -45,7 +44,7 @@ class AmazonStore:
 
     def open(self, parent=None, detail_item=None, external=False):
         store_link = get_method('get_store_link_amazon')(self, detail_item)
-        open_url(QUrl(store_link))
+        open_url(store_link)
 
     def search(self, query, max_results=10, timeout=60):
         for result in get_method('search_amazon')(self, query, max_results=max_results, timeout=timeout):

diff --git a/src/calibre/gui2/store/amazon_live.py b/src/calibre/gui2/store/amazon_live.py
@@ -6,7 +6,7 @@
 from lxml import etree, html
 from urllib.parse import urlencode
 
-from calibre.scraper.simple import read_url
+from calibre.gui2.store import http_get_url
 from calibre.gui2.store.search_result import SearchResult
 
 
@@ -26,7 +26,7 @@ def asbytes(x):
     url = self.SEARCH_BASE_URL + '?' + urlencode(uquery)
 
     counter = max_results
-    raw = read_url(self.scraper_storage, url, timeout=timeout)
+    raw = http_get_url(self.scraper_storage, url, timeout=timeout)
     if write_html_to is not None:
         with open(write_html_to, 'w') as f:
             f.write(raw)
@@ -85,7 +85,7 @@ def parse_details_amazon(self, idata, search_result):
 
 def get_details_amazon(self, search_result, timeout):
     url = self.DETAILS_URL + search_result.detail_item
-    raw = read_url(self.scraper_storage, url, timeout=timeout)
+    raw = http_get_url(self.scraper_storage, url, timeout=timeout)
     idata = html.fromstring(raw)
     return parse_details_amazon(self, idata, search_result)
 

diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py
@@ -4,8 +4,6 @@
 
 from contextlib import closing
 
-from qt.core import QUrl
-
 from calibre import (browser, guess_extension)
 from calibre.gui2 import open_url
 from calibre.utils.xml_parse import safe_xml_fromstring
@@ -88,7 +86,7 @@ def open(self, parent=None, detail_item=None, external=False):
             return
 
         if external or self.config.get('open_external', False):
-            open_url(QUrl(detail_item if detail_item else self.web_url))
+            open_url(detail_item if detail_item else self.web_url)
         else:
             d = WebStoreDialog(self.gui, self.web_url, parent, detail_item, create_browser=self.create_browser)
             d.setWindowTitle(self.name)

diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py
@@ -5,19 +5,13 @@
 
 store_version = 15  # Needed for dynamic plugin loading
 
-from contextlib import closing
 try:
     from urllib.parse import urlencode
 except ImportError:
     from urllib import urlencode
 
-from lxml import html
-
-from qt.core import QUrl
-
-from calibre import browser
 from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre.gui2.store import browser_get_url, StorePlugin
 from calibre.gui2.store.search_result import SearchResult
 
 SEARCH_BASE_URL = 'https://www.amazon.de/s/'
@@ -49,102 +43,93 @@ def asbytes(x):
         return x
     uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
     url = base_url + '?' + urlencode(uquery)
-    br = browser(user_agent=get_user_agent())
+
+    doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to)
+
+    try:
+        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
+    except IndexError:
+        return
+
+    if 's-result-list-parent-container' in results.get('class', ''):
+        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
+        format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
+        asin_xpath = '@data-asin'
+        cover_xpath =  "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
+        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
+        author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
+        price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
+                       ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
+    else:
+        return
 
     counter = max_results
-    with closing(br.open(url, timeout=timeout)) as f:
-        raw = f.read()
-        if write_html_to is not None:
-            with open(write_html_to, 'wb') as f:
-                f.write(raw)
-        doc = html.fromstring(raw)
-        try:
-            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
-        except IndexError:
-            return
-
-        if 's-result-list-parent-container' in results.get('class', ''):
-            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
-            format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
-            asin_xpath = '@data-asin'
-            cover_xpath =  "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
-            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
-            author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
-            price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
-                           ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
+    for data in doc.xpath(data_xpath):
+        if counter <= 0:
+            break
+
+        # Even though we are searching digital-text only Amazon will still
+        # put in results for non Kindle books (author pages). So we need
+        # to explicitly check if the item is a Kindle book and ignore it
+        # if it isn't.
+        format = ''.join(data.xpath(format_xpath))
+        if 'kindle' not in format.lower():
+            continue
+
+        # We must have an asin otherwise we can't easily reference the
+        # book later.
+        asin = data.xpath(asin_xpath)
+        if asin:
+            asin = asin[0]
         else:
-            return
-
-        for data in doc.xpath(data_xpath):
-            if counter <= 0:
-                break
-
-            # Even though we are searching digital-text only Amazon will still
-            # put in results for non Kindle books (author pages). Se we need
-            # to explicitly check if the item is a Kindle book and ignore it
-            # if it isn't.
-            format = ''.join(data.xpath(format_xpath))
-            if 'kindle' not in format.lower():
-                continue
-
-            # We must have an asin otherwise we can't easily reference the
-            # book later.
-            asin = data.xpath(asin_xpath)
-            if asin:
-                asin = asin[0]
-            else:
-                continue
+            continue
 
-            cover_url = ''.join(data.xpath(cover_xpath))
+        cover_url = ''.join(data.xpath(cover_xpath))
 
-            title = ''.join(data.xpath(title_xpath))
-            author = ''.join(data.xpath(author_xpath))
-            try:
-                author = author.split('by ', 1)[1].split(" (")[0]
-            except:
-                pass
+        title = ''.join(data.xpath(title_xpath))
+        author = ''.join(data.xpath(author_xpath))
+        try:
+            author = author.split('by ', 1)[1].split(" (")[0]
+        except:
+            pass
 
-            price = ''.join(data.xpath(price_xpath))
+        price = ''.join(data.xpath(price_xpath))
 
-            counter -= 1
+        counter -= 1
 
-            s = SearchResult()
-            s.cover_url = cover_url.strip()
-            s.title = title.strip()
-            s.author = author.strip()
-            s.price = price.strip()
-            s.detail_item = asin.strip()
-            s.formats = 'Kindle'
+        s = SearchResult()
+        s.cover_url = cover_url.strip()
+        s.title = title.strip()
+        s.author = author.strip()
+        s.price = price.strip()
+        s.detail_item = asin.strip()
+        s.formats = 'Kindle'
 
-            yield s
+        yield s
 
 
 class AmazonKindleStore(StorePlugin):
 
     def open(self, parent=None, detail_item=None, external=False):
         store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK
-        open_url(QUrl(store_link))
+        open_url(store_link)
 
     def search(self, query, max_results=10, timeout=60):
         for result in search_amazon(query, max_results=max_results, timeout=timeout):
             yield result
 
     def get_details(self, search_result, timeout):
-        url = DETAILS_URL
-
-        br = browser(user_agent=get_user_agent())
-        with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
-            idata = html.fromstring(nf.read())
-            if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
+        idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent())
+        if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
+                       DRM_SEARCH_TEXT + '")])'):
+            if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
+                           DRM_FREE_TEXT + '") and contains(b, "' +
                            DRM_SEARCH_TEXT + '")])'):
-                if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
-                               DRM_FREE_TEXT + '") and contains(b, "' +
-                               DRM_SEARCH_TEXT + '")])'):
-                    search_result.drm = SearchResult.DRM_UNLOCKED
-                else:
-                    search_result.drm = SearchResult.DRM_UNKNOWN
+                search_result.drm = SearchResult.DRM_UNLOCKED
             else:
-                search_result.drm = SearchResult.DRM_LOCKED
+                search_result.drm = SearchResult.DRM_UNKNOWN
+        else:
+            search_result.drm = SearchResult.DRM_LOCKED
         return True