Use a single user agent string for amazon
Amazon returns CAPTCHA pages based on user agent sniffing, so use a
common user agent (IE 11)
kovidgoyal committed Jun 5, 2016
1 parent 68cc6ae commit 19c8784
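The fix replaces the rotating random_user_agent() value with one fixed, widely seen Internet Explorer 11 string, so every request from the Amazon source presents the same ordinary-looking browser identity. A minimal standalone sketch of the idea, not calibre's own plumbing (urllib2 and the fetch() helper here are illustrative assumptions):

# Illustrative sketch: send requests with the single fixed user agent
# string from this commit instead of a randomized one.
import urllib2

FIXED_UA = 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'

def fetch(url, timeout=30):
    # urllib2 stands in for calibre's browser object; only the header
    # value mirrors the actual change in this commit.
    req = urllib2.Request(url, headers={'User-Agent': FIXED_UA})
    return urllib2.urlopen(req, timeout=timeout).read()

Because user_agent is now a constant property, repeated lookups no longer vary the browser fingerprint between requests.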
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions src/calibre/ebooks/metadata/sources/amazon.py
@@ -12,13 +12,16 @@
 from Queue import Queue, Empty
 
 
-from calibre import as_unicode, random_user_agent
+from calibre import as_unicode
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
         fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
 
+class CaptchaError(Exception):
+    pass
+
 def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
@@ -299,6 +302,8 @@ def get_details(self):
 
     def parse_details(self, raw, root):
         asin = parse_asin(root, self.log, self.url)
+        if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
+            raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
         if self.testing:
             import tempfile, uuid
             with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
@@ -764,9 +769,7 @@ def test_fields(self, mi):
 
     @property
     def user_agent(self):
-        # Pass in an index to random_user_agent() to test with a particular
-        # user agent
-        return random_user_agent()
+        return 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'
 
     def save_settings(self, *args, **kwargs):
         Source.save_settings(self, *args, **kwargs)
@@ -985,6 +988,8 @@ def title_ok(title):
                     url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url)
                 matches.append(url)
                 break
+        if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
+            raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
 
         # Keep only the top 5 matches as the matches are sorted by relevance by
         # Amazon so lower matches are not likely to be very relevant
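Both new checks key on the interstitial form that Amazon serves at /errors/validateCaptcha: on a details page with no ASIN, and on a search results page with no matches, the plugin now raises CaptchaError instead of silently returning nothing. A rough standalone equivalent of that check, using lxml directly rather than calibre's parsing helpers (check_for_captcha() is an invented name for illustration):

# Sketch of the detection pattern from this commit: parse the fetched HTML
# and raise if Amazon served its CAPTCHA interstitial instead of real content.
from lxml import html

class CaptchaError(Exception):
    pass

def check_for_captcha(raw_html):
    root = html.fromstring(raw_html)
    if root.xpath('//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError('Amazon returned a CAPTCHA page, probably because '
                           'you downloaded too many books. Wait for some time '
                           'and try again.')
    return root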
