This repository has been archived by the owner on Jun 15, 2018. It is now read-only.

Commit bdcbb38

Merge pull request #16 from esby/master

Ian Weller committed Jun 28, 2012
2 parents d80faea + d4d9692

Showing 2 changed files with 100 additions and 26 deletions.
53 changes: 41 additions & 12 deletions src/mw/clicommands.py
@@ -77,12 +77,15 @@ def _die_if_no_init(self):
         if self.metadir.config is None:
             print '%s: not a mw repo' % self.me
             sys.exit(1)
+        self.api_setup = False
 
     def _api_setup(self):
-        cookie_filename = os.path.join(self.metadir.location, 'cookies')
-        self.api_url = self.metadir.config.get('remote', 'api_url')
-        self.api = simplemediawiki.MediaWiki(self.api_url,
-                cookie_file=cookie_filename)
+        if not self.api_setup:  # do not call _api_setup twice
+            cookie_filename = os.path.join(self.metadir.location, 'cookies')
+            self.api_url = self.metadir.config.get('remote', 'api_url')
+            self.api = simplemediawiki.MediaWiki(self.api_url,
+                    cookie_file=cookie_filename)
+            self.api_setup = True
 
 
 class InitCommand(CommandBase):
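
The hunk above makes API setup lazy and idempotent: commands may call _api_setup() more than once, and only the first call builds the client. A minimal sketch of the guard-flag pattern, with illustrative names that are not part of mw itself:

    class LazyClient(object):

        def __init__(self):
            self.api_setup = False

        def _api_setup(self):
            if not self.api_setup:  # real work happens only on the first call
                self.api = self._connect()
                self.api_setup = True

        def _connect(self):
            return object()  # stands in for simplemediawiki.MediaWiki(url, ...)
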
@@ -129,6 +132,7 @@ def __init__(self):
         usage = '[options] PAGENAME ...'
         CommandBase.__init__(self, 'pull_commandat', 'add remote pages to repo '
                 'belonging to the given category', usage)
+        self.query_continue = ''
 
     def _do_command(self):
         self._die_if_no_init()
@@ -142,14 +146,31 @@ def _do_command(self):
                 'generator': 'categorymembers',
                 'gcmlimit': 500
             }
-            response = self.api.call(data)['query']['pages']
-            for pageid in response.keys():
-                pagename = response[pageid]['title']
-                print pagename
-                pull_command = PullCommand()
-                pull_command.args = [pagename.encode('utf-8')]
-                pull_command._do_command()
+            if self.query_continue != '':
+                data['gcmcontinue'] = self.query_continue
+
+            api_call = self.api.call(data)
+            if 'query-continue' in api_call:
+                self.query_continue = api_call['query-continue']['categorymembers']['gcmcontinue']
+            else:
+                self.query_continue = ''
+            if api_call != []:
+
+                response = api_call['query']['pages']
+                pull_command = PullCommand()
+                pull_command.args = []
+
+                for pageid in response.keys():
+                    pagename = response[pageid]['title']
+                    pull_command.args += [pagename.encode('utf-8')]
+
+                pull_command._do_command()
+
+        if self.query_continue != '':
+            print 'query continue detected - continuing the query'
+            self._do_command()
+
 
 
 class PullCommand(CommandBase):
 
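For context, gcmcontinue is the continuation token MediaWiki (pre-1.21 "query-continue" era) returns for the categorymembers generator: each response may carry a token that must be fed back into the next request. The command above follows it recursively; a minimal iterative sketch of the same loop, where api stands in for a simplemediawiki.MediaWiki instance:

    def iter_category_members(api, data):
        while True:
            result = api.call(dict(data))
            for page in result['query']['pages'].values():
                yield page['title']
            # follow the continuation token, if any
            cont = result.get('query-continue', {}) \
                         .get('categorymembers', {}) \
                         .get('gcmcontinue')
            if cont is None:
                break
            data['gcmcontinue'] = cont
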
@@ -283,7 +304,7 @@ def _do_command(self):
             os.rename(full_filename, full_filename + '.local')
             # pull wiki copy
             pull_command = PullCommand()
-            pull_command.args = [pagename.encode('utf-8')]
+            pull_command.args = [pagename]  #.encode('utf-8')] #assuming the file is already using utf-8 - esby
             pull_command._do_command()
             # mv remote to filename.wiki.remote
             os.rename(full_filename, full_filename + '.remote')
@@ -297,7 +318,7 @@ def _do_command(self):
             os.remove(full_filename + '.remote')
             # mw ci pagename
             commit_command = CommitCommand()
-            commit_command.args = [pagename.encode('utf-8')]
+            commit_command.args = [pagename]  #.encode('utf-8')] #assuming the file is already using utf-8 - esby
             commit_command._do_command()


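Both hunks sit inside the merge command's pull-and-merge sequence: the working copy is set aside as .local, a fresh remote copy is pulled and renamed .remote, and the configured merge tool recombines them. A condensed sketch of that flow (illustrative, not the verbatim mw code):

    import os
    import subprocess

    def merge_page(full_filename, pull_command, merge_tool):
        # set the working copy aside, then pull a fresh remote copy
        os.rename(full_filename, full_filename + '.local')
        pull_command._do_command()
        os.rename(full_filename, full_filename + '.remote')
        # merge_tool is the 'merge.tool' config value with three %s slots
        subprocess.call(merge_tool % (full_filename + '.local',
                                      full_filename + '.remote',
                                      full_filename), shell=True)
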
@@ -334,6 +355,7 @@ def _do_command(self):
         edit_summary = self.options.edit_summary
         for filename in status:
             if status[filename] in ['M']:
+                start_time = time.time()
                 files_to_commit -= 1
                 # get edit token
                 data = {
@@ -406,8 +428,15 @@ def _do_command(self):
                     data = data.encode('utf-8')
                     fd.write(data)
                     if files_to_commit:
-                        print 'waiting 3s before processing the next file'
-                        time.sleep(3)
+                        end_time = time.time()
+                        print time.strftime("%Y-%m-%d - %H:%M:%S", time.gmtime(time.time())) \
+                                + " - Committed - " + mw.metadir.filename_to_pagename(filename[:-5]) \
+                                + " - Files left: " + str(files_to_commit)
+                        time_inc = end_time - start_time
+                        delay = 10 - time_inc
+                        if delay > 0:
+                            print "adjusting throttle - waiting for %.2fs" % delay
+                            time.sleep(delay)
                 else:
                     print 'error: committing %s failed: %s' % \
                             (filename, response['edit']['result'])
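
The fixed 3-second pause becomes an adaptive throttle: each committed page gets a 10-second budget, and only the unused remainder of that budget is slept away, so slow server round-trips are not penalized twice. Distilled to its core (the constant matches the hard-coded 10 above):

    import time

    MIN_SECONDS_PER_EDIT = 10

    def throttle(start_time):
        delay = MIN_SECONDS_PER_EDIT - (time.time() - start_time)
        if delay > 0:
            time.sleep(delay)
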
73 changes: 59 additions & 14 deletions src/mw/metadir.py
@@ -23,6 +23,7 @@
 import os
 from StringIO import StringIO
 import sys
+import hashlib
 
 
 class Metadir(object):
@@ -45,6 +46,13 @@ def __init__(self):
                 os.path.isfile(self.config_loc):
             self.config = ConfigParser.RawConfigParser()
             self.config.read(self.config_loc)
+            self.use_md5 = False
+            if self.config.has_option('index', 'use_md5'):
+                self.use_md5 = (self.config.get('index', 'use_md5') == 'on')
+            md5path = os.path.join(self.location, 'cache', 'md5index')
+            if self.use_md5 and not os.path.exists(md5path):
+                os.mkdir(md5path, 0755)
+            self.pagedict_loaded = False
         else:
             self.config = None

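With this change the repo config gains an [index] section, read via ConfigParser; use_md5 is compared against the literal string 'on'. An illustrative .mw/config (the remote and merge values here are examples, not mandated by the commit):

    [remote]
    api_url = http://example.org/w/api.php

    [merge]
    tool = kidff3 %s %s -o %s

    [index]
    use_md5 = on
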
@@ -69,13 +77,20 @@ def create(self, api_url):
         self.config.set('remote', 'api_url', api_url)
         self.config.add_section('merge')
         self.config.set('merge', 'tool', 'kidff3 %s %s -o %s')
+        self.config.add_section('index')
+        self.config.set('index', 'use_md5', 'on')
         self.save_config()
         # create cache/
         os.mkdir(os.path.join(self.location, 'cache'))
         # create cache/pagedict
         fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w')
         fd.write(json.dumps({}))
         fd.close()
+
+        # structure replacement for pagedict
+        # will also be created if use_md5 is turned on with an existing project
+        os.mkdir(os.path.join(self.location, 'cache', 'md5index'), 0755)
+
         # create cache/pages/
         os.mkdir(os.path.join(self.location, 'cache', 'pages'), 0755)

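The metadata layout under .mw/ after create(), reconstructed from the calls above (pagedict is still written, for repos that keep use_md5 off):

    .mw/
    |-- config
    `-- cache/
        |-- pagedict     # one JSON dict mapping pagename -> {id, currentrv}
        |-- md5index/    # one JSON file per page, named by the md5 of the pagename
        `-- pages/       # cached revisions, one file per page id
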
@@ -88,23 +103,53 @@ def clean_page(self, pagename):
         fd.write(cur_content.encode('utf-8'))
         fd.close()
 
+    def pagedict_load(self):
+        if not self.pagedict_loaded:
+            fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+')
+            self.pagedict = json.loads(fd.read())
+            fd.close()
+            self.pagedict_loaded = True
+
+    def get_md5_from_pagename(self, pagename):
+        m = hashlib.md5()
+        name = pagename.encode('unicode_escape')
+        m.update(name)
+        return os.path.join(self.location, 'cache', 'md5index', m.hexdigest())
+
     def pagedict_add(self, pagename, pageid, currentrv):
-        fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r+')
-        pagedict = json.loads(fd.read())
-        pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)}
-        fd.seek(0)
-        fd.write(json.dumps(pagedict))
-        fd.truncate()
-        fd.close()
+        if not self.use_md5:
+            self.pagedict_load()
+            self.pagedict[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)}
+            fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'w')
+            fd.write(json.dumps(self.pagedict))
+            fd.truncate()
+            fd.close()
+        else:  # feeding the new index structure
+            md5pagename = self.get_md5_from_pagename(pagename)
+            page = {}
+            page[pagename] = {'id': int(pageid), 'currentrv': int(currentrv)}
+            fd = file(md5pagename, 'w')
+            fd.write(json.dumps(page))
+            fd.truncate()
+            fd.close()
 
     def get_pageid_from_pagename(self, pagename):
-        fd = file(os.path.join(self.location, 'cache', 'pagedict'), 'r')
-        pagedict = json.loads(fd.read())
-        pagename = pagename.decode('utf-8')
-        if pagename in pagedict.keys():
-            return pagedict[pagename]
-        else:
-            return None
+        if not self.use_md5:
+            self.pagedict_load()
+            pagename = pagename.decode('utf-8')
+            if pagename in self.pagedict.keys():
+                return self.pagedict[pagename]
+            else:
+                return None
+        else:  # feeding the new index structure
+            pagename = pagename.decode('utf-8')
+            md5pagename = self.get_md5_from_pagename(pagename)
+            if os.path.isfile(md5pagename):
+                fd = file(md5pagename, 'r+')
+                page = json.loads(fd.read())
+                return page[pagename]
+            else:
+                return None
 
     def pages_add_rv(self, pageid, rv):
         pagefile = os.path.join(self.location, 'cache', 'pages', str(pageid))
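A short usage sketch of the new index path (hypothetical values; assumes the working directory is inside a mw checkout with use_md5 = on, and that Python 2 is in use, as in the codebase):

    from mw.metadir import Metadir

    m = Metadir()
    m.pagedict_add(u'Main Page', 1, 42)  # writes cache/md5index/<md5 hexdigest>
    print m.get_pageid_from_pagename('Main Page')  # -> {'id': 1, 'currentrv': 42}
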
