Permalink
Browse files

Merge pull request #1053 from internetarchive/hotfix/amz-affiliate-links

First round of fixing cache, no bots, re-cache
  • Loading branch information...
mekarpeles committed Aug 21, 2018
2 parents 5633e5d + 99ba7d2 commit ca7f0e773791764adf4e9f4cf248cdc6cb620884
@@ -25,6 +25,7 @@
# Durations derived from HOUR, for use as cache timeouts.
# NOTE(review): presumably expressed in seconds (60 * 60 per hour) -- confirm
# against whatever consumes these values.
HOUR = 60 * 60
DAY = HOUR * 24
HALF_DAY = HOUR * 12
ONE_WEEK = DAY * 7
class memcache_memoize:
@@ -94,7 +94,9 @@ def setup(config):
config_amz_api = config.get('amazon_api')
try:
amazon_api = AmazonAPI(config_amz_api.key, config_amz_api.secret, config_amz_api.id)
amazon_api = AmazonAPI(
config_amz_api.key, config_amz_api.secret,
config_amz_api.id, MaxQPS=0.9)
except AttributeError:
amazon_api = None
@@ -15,7 +15,7 @@
<a href="$amazon" title="Look for this edition for sale at Amazon" target="_blank">Amazon</a>
</td>
<td class="price">
$if prices:
$if prices and not is_bot():
$ amazon_metadata = get_amazon_metadata(isbn)
$if amazon_metadata and 'price' in amazon_metadata and amazon_metadata['price']:
<span name="price">$(amazon_metadata['price'])</span>
@@ -771,6 +771,30 @@ def GET(self):
result = delegate.RawText(result)
return result
def is_bot(user_agent=None):
    r"""Return True when the request appears to come from a crawler.

    The list of bot names was generated on ol-www1 within /var/log/nginx with:

        cat access.log | grep -oh "; \w*[bB]ot" | sort --unique | awk '{print tolower($2)}'
        cat access.log | grep -oh "; \w*[sS]pider" | sort --unique | awk '{print tolower($2)}'

    Manually removed singleton `bot` (to avoid overly complex grep regex)

    :param user_agent: User-Agent string to classify. When None (the
        default, which is how templates call this) the value is read from
        the current request's ``HTTP_USER_AGENT``.
    :return: True if the lowercased user agent contains any known bot
        name, or if no user agent is available at all; False otherwise.
    """
    user_agent_bots = (
        'sputnikbot', 'dotbot', 'semrushbot',
        'googlebot', 'yandexbot', 'monsidobot', 'kazbtbot',
        'seznambot', 'dubbotbot', '360spider', 'redditbot',
        'yandexmobilebot', 'linkdexbot', 'musobot', 'mojeekbot',
        'focuseekbot', 'behloolbot', 'startmebot',
        'yandexaccessibilitybot', 'uptimerobot', 'femtosearchbot',
        'pinterestbot', 'toutiaospider', 'yoozbot', 'parsijoobot',
        'equellaurlbot', 'donkeybot', 'paperlibot', 'nsrbot',
        'discordbot', 'ahrefsbot', 'coccocbot',
        'buzzbot', 'laserlikebot', 'baiduspider', 'bingbot',
        'mj12bot', 'yoozbotadsbot',
    )
    if user_agent is None:
        # The header is optional, so use .get(): indexing the environ
        # directly raised KeyError for clients that send no User-Agent.
        user_agent = web.ctx.env.get('HTTP_USER_AGENT')
    if not user_agent:
        # A client that does not identify itself is treated as a bot so
        # that bot-gated work (e.g. price lookups) is skipped for it.
        return True
    user_agent = user_agent.lower()
    # Substring match: bot names are embedded in longer UA strings.
    return any(bot in user_agent for bot in user_agent_bots)
def setup_template_globals():
web.template.Template.globals.update({
"sorted": sorted,
@@ -781,6 +805,7 @@ def setup_template_globals():
"random": random.Random(),
# bad use of globals
"is_bot": is_bot,
"time": time,
"input": web.input,
"dumps": simplejson.dumps,
@@ -8,6 +8,7 @@
import re
import datetime
import urllib2
import logging
from infogami import config
from infogami.infobase import client
@@ -31,7 +32,8 @@
import recentchanges
import merge_authors
HALF_DAY = 60 * 60 * 12
logger = logging.getLogger("openlibrary.plugins")
BETTERWORLDBOOKS_API_URL = 'http://products.betterworldbooks.com/service.aspx?ItemId='
if not config.get('coverstore_url'):
@@ -94,31 +96,59 @@ def get_amazon_metadata(isbn):
if isbn:
return cached_get_amazon_metadata(isbn)
except Exception:
return {}
return None
def _get_amazon_metadata(isbn):
if not amazon_api:
return '' # likely dev instance and keys not set
try:
if not amazon_api:
logger.info("Amazon keys likely misconfigured")
raise Exception
product = amazon_api.lookup(ItemId=isbn)
except Exception:
return {'price': ''}
used = product._safe_get_element_text('OfferSummary.LowestUsedPrice.Amount')
new = product._safe_get_element_text('OfferSummary.LowestNewPrice.Amount')
except Exception as e:
return None
price, qlt = (None, None)
used = product._safe_get_element_text('OfferSummary.LowestUsedPrice.Amount')
new = product._safe_get_element_text('OfferSummary.LowestNewPrice.Amount')
# prioritize lower prices and newer, all things being equal
if used and new:
price, qlt = (used, 'used') if int(used) < int(new) else (new, 'new')
# accept whichever is available
elif used or new:
price, qlt = (used, 'used') if used else (new, 'new')
price_fmt = None
if price and qlt:
price = '{:00,.2f}'.format(int(price)/100.)
price_fmt = "$%s (%s)" % (price, qlt)
return {
'price': "$%s (%s)" % ('{:00,.2f}'.format(int(price)/100.), qlt) if price and qlt else ''
'price': price_fmt
}
cached_get_amazon_metadata = cache.memcache_memoize(
_get_amazon_metadata, "upstream.code._get_amazon_metadata", timeout=HALF_DAY)
def cached_get_amazon_metadata(*args, **kwargs):
    """Memoized wrapper around `_get_amazon_metadata`.

    A cached value of `None` most likely means Amazon throttled the
    request (503) when the entry was stored, so it is not trusted: the
    value is fetched again and the cache entry overwritten. That retry
    may itself come back `None`, in which case the next access of the
    page retries once more. A successful API call for a book without
    price data is cached as {"price": None}, which is not `None` and so
    does not trigger a re-cache.
    """
    # cache controller for the "upstream.code._get_amazon_metadata" key
    memoizer = cache.memcache_memoize(
        _get_amazon_metadata,
        "upstream.code._get_amazon_metadata",
        timeout=cache.ONE_WEEK)

    cached = memoizer(*args, **kwargs)
    if cached is not None:
        return cached

    # Recompute for these arguments and store the fresh result; the first
    # item of what update() returns is the value.
    return memoizer.update(*args, **kwargs)[0]
@public
def get_betterworldbooks_metadata(isbn):
@@ -153,7 +183,7 @@ def _get_betterworldbooks_metadata(isbn):
if price and _price and _price < price:
price = _price
qlt = 'new'
return {
'url': product_url[0] if product_url else None,
'price': price,
@@ -169,7 +199,7 @@ def _get_betterworldbooks_metadata(isbn):
cached_get_betterworldbooks_metadata = cache.memcache_memoize(
_get_betterworldbooks_metadata, "upstream.code._get_betterworldbooks_metadata", timeout=HALF_DAY)
_get_betterworldbooks_metadata, "upstream.code._get_betterworldbooks_metadata", timeout=cache.HALF_DAY)
class DynamicDocument:
"""Dynamic document is created by concatinating various rawtext documents in the DB.

0 comments on commit ca7f0e7

Please sign in to comment.