api.py: buffer data in QueryGenerator
QueryGenerator yields all items in resultdata, but there is no
guarantee that all data for an item arrived in one response.
Some data will appear in the following response, etc.
(see commit 3d2ca97)

This patch buffers results until all data for an item are fetched.
This is based on the fact that the API, when query-continuing, keeps on
repeating the same pages until all requested data are fetched.

Change-Id: Iccb3a96b0248fdab0650edfda23d05ecec0dadbd
Mpaa authored and jayvdb committed Nov 27, 2014
1 parent 5c9ec00 commit 826f592
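
In miniature, the buffering idea looks like this (a simplified standalone
sketch; the response payloads and the inline merge are hypothetical
illustrations, not the commit's actual code):

    from collections import OrderedDict

    # Two abridged responses for the same page: 'langlinks' arrives
    # split across two query-continue round trips.
    responses = [
        {'123': {'pageid': 123, 'title': 'Foo',
                 'langlinks': [{'lang': 'de'}]}},
        {'123': {'langlinks': [{'lang': 'fr'}]}},
    ]

    buffered = OrderedDict()
    for pages in responses:
        for pageid, item in pages.items():
            page = buffered.setdefault(pageid, {})
            for key, val in item.items():
                if isinstance(val, list):
                    # list-valued properties accumulate across responses
                    page[key] = page.get(key, []) + val
                else:
                    page[key] = val

    # buffered['123']['langlinks'] now holds both the 'de' and 'fr'
    # entries, so the page can be yielded once, complete.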
Showing 3 changed files with 140 additions and 37 deletions.
150 changes: 120 additions & 30 deletions pywikibot/data/api.py
@@ -7,8 +7,11 @@
#
__version__ = '$Id$'

from collections import Container, MutableMapping, Mapping
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from email.mime.nonmultipart import MIMENonMultipart
import datetime
import hashlib
@@ -24,6 +27,7 @@
import time

import pywikibot
from pywikibot.comms import http
from pywikibot import config, login
from pywikibot.tools import MediaWikiVersion as LV, deprecated, itergroup
from pywikibot.exceptions import (
@@ -1303,20 +1307,25 @@ def __init__(self, **kwargs):
self.limit = None
self.query_limit = self.api_limit
if "generator" in kwargs:
self.resultkey = "pages" # name of the "query" subelement key
else: # to look for when iterating
self.resultkey = "pages" # name of the "query" subelement key
else: # to look for when iterating
self.resultkey = self.modules[0]

# usually the (query-)continue key is the same as the querymodule,
# but not always
# API can return more than one query-continue key, if multiple
# properties are requested by the query, e.g.
# "query-continue":{
# "langlinks":{"llcontinue":"12188973|pt"},
# "templates":{"tlcontinue":"310820|828|Namespace_detect"}}
# self.continuekey is a list
self.continuekey = self.modules

# if self.resultkey == 'pages', result must be buffered, as
# there is no guarantee that all data for an item arrived in one
# response. Some data will appear in the following response, etc.
self.query_buffer = OrderedDict()

def set_query_increment(self, value):
"""Set the maximum number of items to be retrieved per API query.
@@ -1397,17 +1406,65 @@ def _add_continues(self, continue_pair):
value = str(value)
self.request[key] = value

def querydata_update(self, orig_dict, new_dict):
"""Update nested data structure.
See http://stackoverflow.com/questions/3232943/
update-value-of-a-nested-dictionary-of-varying-depth
"""
for key, val in new_dict.items():
if isinstance(val, Mapping):
tmp = self.querydata_update(orig_dict.get(key, {}), val)
orig_dict[key] = tmp
elif isinstance(val, list):
orig_dict[key] = orig_dict.get(key, []) + val
else:
orig_dict[key] = new_dict[key]
return orig_dict
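# Illustrative merge (hypothetical values): updating
# {'pageid': 123, 'langlinks': [{'lang': 'de'}]}
# with
# {'langlinks': [{'lang': 'fr'}], 'templates': [{'title': 'T'}]}
# yields
# {'pageid': 123, 'langlinks': [{'lang': 'de'}, {'lang': 'fr'}],
# 'templates': [{'title': 'T'}]}:
# lists concatenate, nested mappings merge recursively, and scalars
# are overwritten.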

def __iter__(self):
"""Submit request and iterate the response based on self.resultkey.
Continues response as needed until limit (if any) is reached.
Results are not yielded until all data for an entry are collected,
as data for a page may be spread over several responses when
(query-)continue is used.
This relies on the fact that the API, when continuing,
keeps repeating the same pages until all requested data
are fetched.
"""
previous_result_had_data = True
prev_limit = new_limit = None

def count(start=0, step=1):
"""Return iterator of evenly spaced values.
Taken from itertools.count() docs, step arg not supported in py2.6.
"""
n = start
while True:
yield n
n += step

# Counters to generate unique keys for query_buffer dictionary.
pos_key_gen = count()
neg_key_gen = count(-1, step=-1)

# Negative keys (e.g. u'-1') for missing pages
# (e.g. imagepages hosted on shared repos)
# need to be replaced at each request, as they are
# reused for different pages in each request.
def unique_key(k):
return k if int(k) > 0 else str(next(neg_key_gen))
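# Illustration with hypothetical pageids: two successive responses
# may both report a (different) missing page under u'-1'; unique_key
# remaps each occurrence to a fresh key (u'-1', u'-2', ...) so that
# unrelated missing pages never share a buffer slot, while positive
# pageids pass through unchanged.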

count = 0
while True:
# New limit computation shall be done at each iteration.
if self.query_limit is not None:
prev_limit = new_limit
if self.limit is None:
@@ -1424,9 +1481,9 @@ def __iter__(self):
else:
new_limit = None

if (new_limit and
"rvprop" in self.request and
"content" in self.request["rvprop"]):
# queries that retrieve page content have lower limits
# Note: although API allows up to 500 pages for content
# queries, these sometimes result in server-side errors
@@ -1448,53 +1505,80 @@ def __iter__(self):
self.prefix + "limit",
self.request[self.prefix + "limit"]),
_logger)

# Get data.
if not hasattr(self, "data"):
self.data = self.request.submit()
if not self.data or not isinstance(self.data, dict):
pywikibot.debug(
u"%s: stopped iteration because no dict retrieved from api."
u"%s: stopped iteration; no dict retrieved from api"
% self.__class__.__name__,
_logger)
break
if "query" not in self.data:
pywikibot.debug(
u"%s: stopped iteration because 'query' not found in api "
u"response." % self.__class__.__name__,
u"%s: stopped iteration; 'query' not found in api response"
% self.__class__.__name__,
_logger)
pywikibot.debug(unicode(self.data), _logger)
break

if self.resultkey in self.data["query"]:
resultdata = self.data["query"][self.resultkey]

if "results" in resultdata:
# Used in https://www.mediawiki.org/wiki/API:Querypage
# resultdata is now a list, should be handled as such.
resultdata = resultdata["results"]

# Convert resultdata to ordered dictionary.
# Keys are used to understand if all data for an item are
# collected before yielding result from query_buffer.
# For lists it is not strictly necessary but simplifies the
# logic avoiding branches, as all data are handled in the
# same way.
if isinstance(resultdata, dict):
pywikibot.debug(u"%s received %s; limit=%s"
% (self.__class__.__name__,
list(resultdata.keys()),
self.limit),
_logger)
if "results" in resultdata:
resultdata = resultdata["results"]
elif "pageids" in self.data["query"]:
# this ensures that page data will be iterated
# in the same order as received from server
resultdata = [resultdata[k]
for k in self.data["query"]["pageids"]]
else:
resultdata = [resultdata[k]
for k in sorted(resultdata.keys())]

# With pageids, this ensures that page data will
# be iterated in the same order as received from server.
query_keys = self.data["query"].get(
"pageids", resultdata.keys())

resultdata = OrderedDict(
(unique_key(k), resultdata[k]) for k in query_keys)
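# Illustration (hypothetical): with 'pageids': ['10', '7'] the
# OrderedDict is built, and the buffer later drained, as 10 then 7,
# i.e. in the server's order rather than arbitrary dict-key order.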
else:
pywikibot.debug(u"%s received %s; limit=%s"
% (self.__class__.__name__,
resultdata,
self.limit),
_logger)
# Convert list to dict and assign arbitrary unique key.
resultdata = OrderedDict(zip(pos_key_gen, resultdata))
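# Illustration (hypothetical): a first response ['a', 'b'] becomes
# OrderedDict([(0, 'a'), (1, 'b')]); pos_key_gen keeps counting
# across responses, so later items get keys 2, 3, ... and never
# collide in query_buffer.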

if "normalized" in self.data["query"]:
self.normalized = dict((item['to'], item['from'])
for item in
self.data["query"]["normalized"])
else:
self.normalized = {}

# If keys in query_buffer are not present in resultdata,
# page is complete and can be yielded and removed from buffer.
for pageid in list(self.query_buffer):  # snapshot; entries are popped
if pageid not in resultdata:
yield self.result(self.query_buffer.pop(pageid))

for pageid, item in resultdata.items():
# Insert or update page in query_buffer.
querydata = self.query_buffer.get(pageid, {})
querydata = self.querydata_update(querydata, item)
self.query_buffer[pageid] = querydata

if isinstance(item, dict) and set(self.continuekey) & set(item.keys()):
# if we need to count elements contained in items in
# self.data["query"]["pages"], we want to count
@@ -1507,15 +1591,17 @@ def __iter__(self):
count += 1
# note: self.limit could be -1
if self.limit and self.limit > 0 and count >= self.limit:
# Flush buffer, if any, when completing the iteration.
for pageid in list(self.query_buffer):  # snapshot; pop() mutates
yield self.result(self.query_buffer.pop(pageid))
return
# self.resultkey in data in last request.submit()
previous_result_had_data = True
else:
# if (query-)continue is present, self.resultkey might not have
# been fetched yet
if self.continue_name not in self.data:
break # No results.
# self.resultkey not in data in last request.submit()
# only "(query-)continue" was retrieved.
previous_result_had_data = False
@@ -1525,12 +1611,16 @@ def __iter__(self):
del self.data # a new request is needed
continue
if self.continue_name not in self.data:
break
if self.continue_update():
break

del self.data # a new request with (query-)continue is needed

# Flush buffer, if any, when completing the iteration.
for pageid in self.query_buffer:
yield self.result(self.query_buffer[pageid])

def result(self, data):
"""Process result data as needed for particular subclass."""
return data
14 changes: 10 additions & 4 deletions tests/api_tests.py
@@ -8,6 +8,7 @@
__version__ = '$Id$'

import datetime
import sys
import types

import pywikibot.data.api as api
@@ -249,10 +250,15 @@ class TestDryPageGenerator(TestCase):

dry = True

# api previously sorted 'pages' using the string key, which is not a
# numeric comparison. It now follows the iteration order of the response
# dict's keys, which differs between Python 2 and Python 3.

if sys.version_info[0] == 2:
titles = ("Broadcaster.com", "Wikipedia:Disambiguation",
"Broadcaster (definition)", "Wiktionary")
else:
titles = ["Broadcaster (definition)", "Wikipedia:Disambiguation",
"Wiktionary", "Broadcaster.com"]

def setUp(self):
super(TestDryPageGenerator, self).setUp()
13 changes: 10 additions & 3 deletions tests/site_tests.py
@@ -624,6 +624,16 @@ def testExturlusage(self):
self.assertIsInstance(link, pywikibot.Page)
self.assertIn(link.namespace(), (2, 3))

def test_ancientpages(self):
"""Test the site.ancientpages() method."""
mysite = self.get_site()
wl = list(mysite.ancientpages(total=20))
self.assertLessEqual(len(wl), 20)
self.assertTrue(all(isinstance(data, tuple) for data in wl))

# TODO: test newimages, longpages, shortpages, unwatchedpages
# and the other following methods in site.py


class TestImageUsage(DefaultSiteTestCase):

@@ -1201,9 +1211,6 @@ def test_namespaces(self):
self.assertIsInstance(rndpage, pywikibot.Page)
self.assertIn(rndpage.namespace(), [6, 7])


class TestSiteTokens(DefaultSiteTestCase):

