Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #1053 from internetarchive/hotfix/amz-affiliate-links
First round of fixing cache, no bots, re-cache
  • Loading branch information
mekarpeles committed Aug 21, 2018
2 parents 5633e5d + 99ba7d2 commit ca7f0e7
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 16 deletions.
1 change: 1 addition & 0 deletions openlibrary/core/cache.py
Expand Up @@ -25,6 +25,7 @@
# Cache-timeout durations, all expressed in seconds.
HOUR = 60 * 60        # seconds in one hour
DAY = HOUR * 24       # seconds in one day
HALF_DAY = HOUR * 12  # seconds in half a day
ONE_WEEK = DAY * 7    # seconds in one week


class memcache_memoize:
Expand Down
4 changes: 3 additions & 1 deletion openlibrary/core/lending.py
Expand Up @@ -94,7 +94,9 @@ def setup(config):
config_amz_api = config.get('amazon_api')

try:
amazon_api = AmazonAPI(config_amz_api.key, config_amz_api.secret, config_amz_api.id)
amazon_api = AmazonAPI(
config_amz_api.key, config_amz_api.secret,
config_amz_api.id, MaxQPS=0.9)
except AttributeError:
amazon_api = None

Expand Down
2 changes: 1 addition & 1 deletion openlibrary/macros/AffiliateLinks.html
Expand Up @@ -15,7 +15,7 @@
<a href="$amazon" title="Look for this edition for sale at Amazon" target="_blank">Amazon</a>
</td>
<td class="price">
$if prices:
$if prices and not is_bot():
$ amazon_metadata = get_amazon_metadata(isbn)
$if amazon_metadata and 'price' in amazon_metadata and amazon_metadata['price']:
<span name="price">$(amazon_metadata['price'])</span>
Expand Down
25 changes: 25 additions & 0 deletions openlibrary/plugins/openlibrary/code.py
Expand Up @@ -771,6 +771,30 @@ def GET(self):
result = delegate.RawText(result)
return result

def is_bot(user_agent=None):
    r"""Check whether a User-Agent string belongs to a known crawler/bot.

    Bot names generated on ol-www1 within /var/log/nginx with:
    cat access.log | grep -oh "; \w*[bB]ot" | sort --unique | awk '{print tolower($2)}'
    cat access.log | grep -oh "; \w*[sS]pider" | sort --unique | awk '{print tolower($2)}'
    Manually removed singleton `bot` (to avoid overly complex grep regex)

    :param user_agent: User-Agent string to test; when None (the default,
        matching the original zero-arg call sites), the current request's
        HTTP_USER_AGENT header is used, treated as '' when absent.
    :return: True if any known bot name is a substring of the user agent.
    """
    user_agent_bots = [
        'sputnikbot', 'dotbot', 'semrushbot',
        'googlebot', 'yandexbot', 'monsidobot', 'kazbtbot',
        'seznambot', 'dubbotbot', '360spider', 'redditbot',
        'yandexmobilebot', 'linkdexbot', 'musobot', 'mojeekbot',
        'focuseekbot', 'behloolbot', 'startmebot',
        'yandexaccessibilitybot', 'uptimerobot', 'femtosearchbot',
        'pinterestbot', 'toutiaospider', 'yoozbot', 'parsijoobot',
        'equellaurlbot', 'donkeybot', 'paperlibot', 'nsrbot',
        'discordbot', 'ahrefsbot', 'coccocbot',
        'buzzbot', 'laserlikebot', 'baiduspider', 'bingbot',
        'mj12bot', 'yoozbotadsbot'
    ]
    if user_agent is None:
        # The header may be missing entirely (curl, healthchecks);
        # .get avoids the KeyError the old direct indexing raised.
        user_agent = web.ctx.env.get('HTTP_USER_AGENT', '')
    user_agent = user_agent.lower()
    # Substring match so versioned agents ("Googlebot/2.1") still hit.
    # NOTE(review): dropped the stray '`googlebot' entry (grep artifact) --
    # any string containing it also contains 'googlebot', so behavior is
    # unchanged.
    return any(bot in user_agent for bot in user_agent_bots)

def setup_template_globals():
web.template.Template.globals.update({
"sorted": sorted,
Expand All @@ -781,6 +805,7 @@ def setup_template_globals():
"random": random.Random(),

# bad use of globals
"is_bot": is_bot,
"time": time,
"input": web.input,
"dumps": simplejson.dumps,
Expand Down
58 changes: 44 additions & 14 deletions openlibrary/plugins/upstream/code.py
Expand Up @@ -8,6 +8,7 @@
import re
import datetime
import urllib2
import logging

from infogami import config
from infogami.infobase import client
Expand All @@ -31,7 +32,8 @@
import recentchanges
import merge_authors

# Seconds in half a day; used as a memcache_memoize timeout below.
HALF_DAY = 60 * 60 * 12
# Module logger for the upstream plugin (Amazon/BWB price lookups).
logger = logging.getLogger("openlibrary.plugins")

# Better World Books product endpoint; the ISBN is appended after ItemId=.
BETTERWORLDBOOKS_API_URL = 'http://products.betterworldbooks.com/service.aspx?ItemId='

if not config.get('coverstore_url'):
Expand Down Expand Up @@ -94,31 +96,59 @@ def get_amazon_metadata(isbn):
if isbn:
return cached_get_amazon_metadata(isbn)
except Exception:
return {}
return None

def _get_amazon_metadata(isbn):
if not amazon_api:
return '' # likely dev instance and keys not set

try:
if not amazon_api:
logger.info("Amazon keys likely misconfigured")
raise Exception
product = amazon_api.lookup(ItemId=isbn)
except Exception:
return {'price': ''}
used = product._safe_get_element_text('OfferSummary.LowestUsedPrice.Amount')
new = product._safe_get_element_text('OfferSummary.LowestNewPrice.Amount')
except Exception as e:
return None

price, qlt = (None, None)
used = product._safe_get_element_text('OfferSummary.LowestUsedPrice.Amount')
new = product._safe_get_element_text('OfferSummary.LowestNewPrice.Amount')

# prioritize lower prices and newer, all things being equal
if used and new:
price, qlt = (used, 'used') if int(used) < int(new) else (new, 'new')
# accept whichever is available
elif used or new:
price, qlt = (used, 'used') if used else (new, 'new')

price_fmt = None
if price and qlt:
price = '{:00,.2f}'.format(int(price)/100.)
price_fmt = "$%s (%s)" % (price, qlt)

return {
'price': "$%s (%s)" % ('{:00,.2f}'.format(int(price)/100.), qlt) if price and qlt else ''
'price': price_fmt
}

# Module-level cache controller for _get_amazon_metadata.  Built once at
# import time (rebuilding it on every call was pure overhead: all cache
# state lives in memcache, keyed by the name string below, not in this
# object).
_memoized_get_amazon_metadata = cache.memcache_memoize(
    _get_amazon_metadata, "upstream.code._get_amazon_metadata",
    timeout=cache.ONE_WEEK)


def cached_get_amazon_metadata(*args, **kwargs):
    """Cached Amazon price lookup that retries throttled results.

    If the cached data is `None`, likely a 503 throttling occurred on
    Amazon's side. Try again to fetch the value instead of using the
    cached value. It may 503 again, in which case the next access of
    this page will trigger another re-cache. If the amazon API call
    succeeds but the book has no price data, then {"price": None} will
    be cached as to not trigger a re-cache (only the value `None`
    will cause re-cache).
    """
    result = _memoized_get_amazon_metadata(*args, **kwargs)
    if result is None:
        # recache / update the cached value for these input args;
        # .update returns (value, computed_at) -- keep the value.
        result = _memoized_get_amazon_metadata.update(*args, **kwargs)[0]
    return result

@public
def get_betterworldbooks_metadata(isbn):
Expand Down Expand Up @@ -153,7 +183,7 @@ def _get_betterworldbooks_metadata(isbn):
if price and _price and _price < price:
price = _price
qlt = 'new'

return {
'url': product_url[0] if product_url else None,
'price': price,
Expand All @@ -169,7 +199,7 @@ def _get_betterworldbooks_metadata(isbn):


# Half-day cached wrapper around the Better World Books price lookup.
# Uses the shared cache.HALF_DAY constant so cache durations are defined
# in one place (openlibrary.core.cache).
cached_get_betterworldbooks_metadata = cache.memcache_memoize(
    _get_betterworldbooks_metadata,
    "upstream.code._get_betterworldbooks_metadata",
    timeout=cache.HALF_DAY)

class DynamicDocument:
    """Dynamic document is created by concatenating various rawtext documents in the DB.
Expand Down

0 comments on commit ca7f0e7

Please sign in to comment.