From f43534fdad038a749ee9d014a0794fa261d21d66 Mon Sep 17 00:00:00 2001 From: esby Date: Thu, 21 Jun 2012 22:32:11 +0200 Subject: [PATCH 01/10] avoid calling _api_setup several times (which can happen in pull_commandat) --- src/mw/clicommands.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mw/clicommands.py b/src/mw/clicommands.py index 1d0c46d..4414898 100644 --- a/src/mw/clicommands.py +++ b/src/mw/clicommands.py @@ -77,12 +77,15 @@ def _die_if_no_init(self): if self.metadir.config is None: print '%s: not a mw repo' % self.me sys.exit(1) + self.api_setup = False def _api_setup(self): - cookie_filename = os.path.join(self.metadir.location, 'cookies') - self.api_url = self.metadir.config.get('remote', 'api_url') - self.api = simplemediawiki.MediaWiki(self.api_url, + if not self.api_setup: # do not call _api_setup twice + cookie_filename = os.path.join(self.metadir.location, 'cookies') + self.api_url = self.metadir.config.get('remote', 'api_url') + self.api = simplemediawiki.MediaWiki(self.api_url, cookie_file=cookie_filename) + self.api_setup = True class InitCommand(CommandBase): From 6a4779c0f0092d4ef3fd71d5b85549a8e83879c7 Mon Sep 17 00:00:00 2001 From: esby Date: Thu, 21 Jun 2012 22:39:25 +0200 Subject: [PATCH 02/10] implemented query continue for pull category command - reworked the call to pull a bit --- src/mw/clicommands.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/mw/clicommands.py b/src/mw/clicommands.py index 4414898..f589202 100644 --- a/src/mw/clicommands.py +++ b/src/mw/clicommands.py @@ -132,6 +132,7 @@ def __init__(self): usage = '[options] PAGENAME ...' 
CommandBase.__init__(self, 'pull_commandat', 'add remote pages to repo ' 'belonging to the given category', usage) + self.query_continue = '' def _do_command(self): self._die_if_no_init() @@ -145,13 +146,28 @@ def _do_command(self): 'generator': 'categorymembers', 'gcmlimit': 500 } - response = self.api.call(data)['query']['pages'] + if self.query_continue != '': + data['gcmcontinue'] = self.query_continue + + api_call = self.api.call(data) + if 'query-continue' in api_call: + self.query_continue = api_call['query-continue']['categorymembers']['gcmcontinue'] + else: + self.query_continue = '' + response = api_call['query']['pages'] + pull_command = PullCommand() + pull_command.args = [] + for pageid in response.keys(): pagename = response[pageid]['title'] - print pagename - pull_command = PullCommand() - pull_command.args = [pagename.encode('utf-8')] - pull_command._do_command() + pull_command.args += [pagename.encode('utf-8')] + + pull_command._do_command() + + if self.query_continue != '': + print 'query continue detected - continuing the query' + self._do_command() + class PullCommand(CommandBase): From 75a052f2f70a1224c85e0955ac764c96199968af Mon Sep 17 00:00:00 2001 From: esby Date: Fri, 22 Jun 2012 00:54:59 +0200 Subject: [PATCH 03/10] pagedict is now loaded once per session, and still written down each time an element is added --- src/mw/metadir.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/mw/metadir.py b/src/mw/metadir.py index db2ff62..0a251e8 100644 --- a/src/mw/metadir.py +++ b/src/mw/metadir.py @@ -47,6 +47,15 @@ def __init__(self): self.config.read(self.config_loc) else: self.config = None + self.pagedict_loaded = False + + def pagedict_load(self): + if not self.pagedict_loaded: + print "loading pagedict" + fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+') + self.pagedict = json.loads(fd.read()) + fd.close + self.pagedict_loaded = True def save_config(self): with 
open(self.config_loc, 'wb') as config_file: @@ -89,20 +98,18 @@ def clean_page(self, pagename): fd.close() def pagedict_add(self, pagename, pageid, currentrv): - fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+') - pagedict = json.loads(fd.read()) - pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)} - fd.seek(0) - fd.write(json.dumps(pagedict)) + self.pagedict_load() + self.pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)} + fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w') + fd.write(json.dumps(self.pagedict)) fd.truncate() fd.close() def get_pageid_from_pagename(self, pagename): - fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r') - pagedict = json.loads(fd.read()) + self.pagedict_load() pagename = pagename.decode('utf-8') - if pagename in pagedict.keys(): - return pagedict[pagename] + if pagename in self.pagedict.keys(): + return self.pagedict[pagename] else: return None From 29b314574d9cf9694468b67fa4c128d605093d94 Mon Sep 17 00:00:00 2001 From: esby Date: Fri, 22 Jun 2012 10:52:09 +0200 Subject: [PATCH 04/10] removing a debug print --- src/mw/metadir.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mw/metadir.py b/src/mw/metadir.py index 0a251e8..1ee2b86 100644 --- a/src/mw/metadir.py +++ b/src/mw/metadir.py @@ -51,7 +51,6 @@ def __init__(self): def pagedict_load(self): if not self.pagedict_loaded: - print "loading pagedict" fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+') self.pagedict = json.loads(fd.read()) fd.close From 88ed257537cb3739fbda02a9bf1b7683d100053f Mon Sep 17 00:00:00 2001 From: esby Date: Sun, 24 Jun 2012 22:27:26 +0200 Subject: [PATCH 05/10] throttle takes the request time into account; fixed an issue in the merge command with filenames containing accented characters --- src/mw/clicommands.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/mw/clicommands.py b/src/mw/clicommands.py index f589202..5b189b2 
100644 --- a/src/mw/clicommands.py +++ b/src/mw/clicommands.py @@ -301,7 +301,7 @@ def _do_command(self): os.rename(full_filename, full_filename + '.local') # pull wiki copy pull_command = PullCommand() - pull_command.args = [pagename.encode('utf-8')] + pull_command.args = [pagename]#.encode('utf-8')] #assuming the file is already using utf-8 - esby pull_command._do_command() # mv remote to filename.wiki.remote os.rename(full_filename, full_filename + '.remote') @@ -315,7 +315,7 @@ def _do_command(self): os.remove(full_filename + '.remote') # mw ci pagename commit_command = CommitCommand() - commit_command.args = [pagename.encode('utf-8')] + commit_command.args = [pagename]#.encode('utf-8')] #assuming the file is already using utf-8 - esby commit_command._do_command() @@ -352,6 +352,7 @@ def _do_command(self): edit_summary = self.options.edit_summary for filename in status: if status[filename] in ['M']: + start_time = time.time() files_to_commit -= 1 # get edit token data = { @@ -424,8 +425,13 @@ def _do_command(self): data = data.encode('utf-8') fd.write(data) if files_to_commit : - print 'waiting 3s before processing the next file' - time.sleep(3) + end_time = time.time() + print time.strftime("%Y-%m-%d - %H:%M:%S", time.gmtime(time.time())) + time_inc = end_time - start_time + delay = 10 - time_inc + if delay > 0 : + print "adjusting throttle - waiting for %.2fs" % delay + time.sleep(delay) else: print 'error: committing %s failed: %s' % \ (filename, response['edit']['result']) From 5957d10fedffb256048fdebcbeb4b7331e49f81d Mon Sep 17 00:00:00 2001 From: esby Date: Sun, 24 Jun 2012 22:30:29 +0200 Subject: [PATCH 06/10] implemented alternate cache system - use_md5 = on in config file --- src/mw/metadir.py | 89 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/src/mw/metadir.py b/src/mw/metadir.py index 1ee2b86..18f9a45 100644 --- a/src/mw/metadir.py +++ b/src/mw/metadir.py @@ -23,7 +23,18 @@ import os 
from StringIO import StringIO import sys +import hashlib +# function taken from http://code.activestate.com/recipes/466341-guaranteed-conversion-to-unicode-or-byte-string/ +# used to handle some weird case with accentued characters in filename +# eg: http://commons.wikimedia.org/wiki/File:Jean-Louis_Debr%C3%A9_14_mars_2009.jpg +def safe_str(obj): + """ return the byte string representation of obj """ + try: + return str(obj) + except UnicodeEncodeError: + # obj is unicode + return unicode(obj).encode('unicode_escape') class Metadir(object): @@ -45,16 +56,15 @@ def __init__(self): os.path.isfile(self.config_loc): self.config = ConfigParser.RawConfigParser() self.config.read(self.config_loc) + self.use_md5 = False + if self.config.has_option('index', 'use_md5'): + self.use_md5 = ( self.config.get('index', 'use_md5') == 'on' ) + md5path = os.path.join(self.location, 'cache', 'md5index') + if self.use_md5 and not os.path.exists(md5path): + os.mkdir(md5path, 0755) + self.pagedict_loaded = False else: self.config = None - self.pagedict_loaded = False - - def pagedict_load(self): - if not self.pagedict_loaded: - fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+') - self.pagedict = json.loads(fd.read()) - fd.close - self.pagedict_loaded = True def save_config(self): with open(self.config_loc, 'wb') as config_file: @@ -77,6 +87,8 @@ def create(self, api_url): self.config.set('remote', 'api_url', api_url) self.config.add_section('merge') self.config.set('merge', 'tool', 'kidff3 %s %s -o %s') + self.config.add_section('index') + self.config.set('index''use_md5','on') self.save_config() # create cache/ os.mkdir(os.path.join(self.location, 'cache')) @@ -84,6 +96,11 @@ def create(self, api_url): fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w') fd.write(json.dumps({})) fd.close() + + # structure replacement for pagedict + # will also be created if use_md5 is turned on with an existing project + os.mkdir(os.path.join(self.location, 'cache', 
'md5index'), 0755) + # create cache/pages/ os.mkdir(os.path.join(self.location, 'cache', 'pages'), 0755) @@ -96,21 +113,53 @@ def clean_page(self, pagename): fd.write(cur_content.encode('utf-8')) fd.close() + def pagedict_load(self): + if not self.pagedict_loaded: + fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+') + self.pagedict = json.loads(fd.read()) + fd.close + self.pagedict_loaded = True + + def get_md5_from_pagename(self, pagename): + m = hashlib.md5() + name = safe_str(pagename) + m.update(name) + return os.path.join(self.location, 'cache', 'md5index', m.hexdigest()) + def pagedict_add(self, pagename, pageid, currentrv): - self.pagedict_load() - self.pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)} - fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w') - fd.write(json.dumps(self.pagedict)) - fd.truncate() - fd.close() + if not self.use_md5: + self.pagedict_load() + self.pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)} + fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w') + fd.write(json.dumps(self.pagedict)) + fd.truncate() + fd.close() + else: # feeding the new index structure + md5pagename = self.get_md5_from_pagename(pagename) + page = {} + page[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)} + fd = file(md5pagename , 'w') + fd.write(json.dumps(page)) + fd.truncate() + fd.close() def get_pageid_from_pagename(self, pagename): - self.pagedict_load() - pagename = pagename.decode('utf-8') - if pagename in self.pagedict.keys(): - return self.pagedict[pagename] - else: - return None + if not self.use_md5: + self.pagedict_load() + pagename = pagename.decode('utf-8') + if pagename in self.pagedict.keys(): + return self.pagedict[pagename] + else: + return None + else: # feeding the new index structure + pagename = pagename.decode('utf-8') + md5pagename = self.get_md5_from_pagename(pagename) + if os.path.isfile(md5pagename): + fd = file(md5pagename, 'r+') + page = 
json.loads(fd.read()) + return page[pagename] + else: + return None def pages_add_rv(self, pageid, rv): pagefile = os.path.join(self.location, 'cache', 'pages', str(pageid)) From 97de8c581bd1032ecb5a7af0d47554d563f292e2 Mon Sep 17 00:00:00 2001 From: esby Date: Mon, 25 Jun 2012 18:00:03 +0200 Subject: [PATCH 07/10] remove the usage of safe_str --- src/mw/metadir.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/mw/metadir.py b/src/mw/metadir.py index 18f9a45..8db130a 100644 --- a/src/mw/metadir.py +++ b/src/mw/metadir.py @@ -25,16 +25,6 @@ import sys import hashlib -# function taken from http://code.activestate.com/recipes/466341-guaranteed-conversion-to-unicode-or-byte-string/ -# used to handle some weird case with accentued characters in filename -# eg: http://commons.wikimedia.org/wiki/File:Jean-Louis_Debr%C3%A9_14_mars_2009.jpg -def safe_str(obj): - """ return the byte string representation of obj """ - try: - return str(obj) - except UnicodeEncodeError: - # obj is unicode - return unicode(obj).encode('unicode_escape') class Metadir(object): @@ -122,7 +112,7 @@ def pagedict_load(self): def get_md5_from_pagename(self, pagename): m = hashlib.md5() - name = safe_str(pagename) + name = pagename.encode('unicode_escape') m.update(name) return os.path.join(self.location, 'cache', 'md5index', m.hexdigest()) From 7b3d70a274ef161874d40212a35532bf7b3c2fc8 Mon Sep 17 00:00:00 2001 From: esby Date: Tue, 26 Jun 2012 23:48:24 +0200 Subject: [PATCH 08/10] More verbose on committing files - printing date - name - number of files left --- src/mw/clicommands.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mw/clicommands.py b/src/mw/clicommands.py index 5b189b2..765122e 100644 --- a/src/mw/clicommands.py +++ b/src/mw/clicommands.py @@ -426,12 +426,14 @@ def _do_command(self): fd.write(data) if files_to_commit : end_time = time.time() - print time.strftime("%Y-%m-%d - %H:%M:%S", time.gmtime(time.time())) + print 
time.strftime("%Y-%m-%d - %H:%M:%S", time.gmtime(time.time())) \ + + " - Committed - " + mw.metadir.filename_to_pagename(filename[:-5]) \ + + " - Files left: " + str(files_to_commit) time_inc = end_time - start_time delay = 10 - time_inc if delay > 0 : print "adjusting throttle - waiting for %.2fs" % delay - time.sleep(delay) + time.sleep(delay) else: print 'error: committing %s failed: %s' % \ (filename, response['edit']['result']) From 5bbd662af4bce418db58aa2ab0b5f89099f63b2a Mon Sep 17 00:00:00 2001 From: esby Date: Wed, 27 Jun 2012 17:13:55 +0200 Subject: [PATCH 09/10] missing comma in setting config for md5 algorithm --- src/mw/metadir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mw/metadir.py b/src/mw/metadir.py index 8db130a..6c4f29d 100644 --- a/src/mw/metadir.py +++ b/src/mw/metadir.py @@ -78,7 +78,7 @@ def create(self, api_url): self.config.add_section('merge') self.config.set('merge', 'tool', 'kidff3 %s %s -o %s') self.config.add_section('index') - self.config.set('index''use_md5','on') + self.config.set('index', 'use_md5','on') self.save_config() # create cache/ os.mkdir(os.path.join(self.location, 'cache')) From d4d96923b8572b7448def2da151011e237489a03 Mon Sep 17 00:00:00 2001 From: esby Date: Wed, 27 Jun 2012 18:04:07 +0200 Subject: [PATCH 10/10] handling case for pulling category members that have 0 members - no more error thrown --- src/mw/clicommands.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/mw/clicommands.py b/src/mw/clicommands.py index 765122e..fe245d4 100644 --- a/src/mw/clicommands.py +++ b/src/mw/clicommands.py @@ -154,19 +154,21 @@ def _do_command(self): self.query_continue = api_call['query-continue']['categorymembers']['gcmcontinue'] else: self.query_continue = '' - response = api_call['query']['pages'] - pull_command = PullCommand() - pull_command.args = [] + if api_call != [] : + + response = api_call['query']['pages'] + pull_command = PullCommand() + 
pull_command.args = [] - for pageid in response.keys(): - pagename = response[pageid]['title'] - pull_command.args += [pagename.encode('utf-8')] + for pageid in response.keys(): + pagename = response[pageid]['title'] + pull_command.args += [pagename.encode('utf-8')] - pull_command._do_command() + pull_command._do_command() - if self.query_continue != '': - print 'query continue detected - continuing the query' - self._do_command() + if self.query_continue != '': + print 'query continue detected - continuing the query' + self._do_command()