Moinmoin import script

jaap-karssenberg edited this page Oct 25, 2013 · 1 revision
Clone this wiki locally

As mentioned in the comment of the script, this is a modification of Slim Gaillard's moin2doku.py. I've removed some bugs, added the creation of a page header with correct timestamp and modified the formatting regexes. The file attachment conversion hasn't been removed, but it is neither tested nor adapted to ZIM. Feel free to make modifications or fixes. Just add them to this page.

By the way...

ZIM uses two different markups for verbatim/code blocks. Verbatim text segments within a line are marked with '' while whole blocks are marked with '''! No text is allowed after the opening '''! The script currently (0.3) handles only the first markup. The regex-based parsing of the script doesn't allow any "deeper inspection", so it isn't easy to add a reliable detection of code blocks (nested or mixed markups, ...).

file "moin2zim.py":

#!/usr/bin/python
#
# moin2zim.py  --  Joerg Desch <joerg DOT desch AT googlemail DOT com>
#
# ... A modification of moin2doku.py by Slim Gaillard
#     (see http://www.dokuwiki.org/tips:moinmoin2doku)
#
# IMPORTANT NOTE: this script is more a quick hack than a real project!
#
# "moin2zim" is a script for converting MoinMoin version 1.3+ wiki data to the
# Zim format. It tries to convert all pages and is not intended to convert
# a single page. You have to call it with the name of the directory containing
# the MoinMoin pages as first parameter, and the output directory as second
# parameter.
#
# example: python moin2zim.py ./my-moin/data/pages/ ./zim-import/
#
# Afterwards ./zim-import/ contains all pages with the Zim formatting and the
# Zim page header. You should have a look at the files and then copy them into
# the Zim notebook directory.
#
# The script doesn't do all the work. Some formattings aren't supported, others
# are not available in Zim. For now, I've only done the stuff I've needed to
# convert my moin wiki.
#
# Missing stuff:
#  * CamelCase links without [] are not detected as internal links.
#  * verbatim (multiline) blocks (not the marks inside a line) aren't detected.
#    The script still uses '' instead of '''!
#  * some unsupported stuff is still in DokuWiki syntax.
#  * currently only one moin icon is translated
#  * attachments are neither tested nor checked for compatibility.
#  * much other stuff I'm not aware of... ;-)
#
# version 0.3 (jd) first public release of my modifications.
#


import sys, os, os.path, re, pdb, time
from os import listdir
from os.path import isdir, basename

def check_dirs(moin_pages_dir, output_dir):
    """Abort the script unless both directories exist.

    moin_pages_dir -- directory holding the MoinMoin page directories
    output_dir     -- directory that receives the converted pages

    Writes a message to stderr and exits with status 1 as soon as one of the
    two paths is not an existing directory. Returns None otherwise.
    """
    required = (
        (moin_pages_dir, "MoinMoin pages directory doesn't exist!"),
        (output_dir, "Output directory doesn't exist!"),
    )
    for path, complaint in required:
        if not isdir(path):
            # sys.stderr.write() instead of "print >> sys.stderr": same output,
            # but valid syntax on Python 2 and Python 3 alike.
            sys.stderr.write(complaint + '\n')
            sys.exit(1)


def get_path_names(moin_pages_dir):
    """Return the paths of all directories located directly in moin_pages_dir.

    Every entry reported by listdir() is joined with moin_pages_dir; entries
    that are not directories are left out. The order is the one listdir()
    delivers.
    """
    candidates = [os.path.join(moin_pages_dir, entry)
                  for entry in listdir(moin_pages_dir)]
    return [candidate for candidate in candidates if isdir(candidate)]


def get_current_revision(page_dir):
    """Return the path of the newest revision file of a page.

    page_dir -- directory of a single MoinMoin page

    The entries of page_dir/revisions are sorted by name and the last one is
    taken as the current revision. Returns '' when the page has no
    "revisions" directory or when that directory is empty.
    """
    rev_dir = os.path.join(page_dir, 'revisions')
    if not isdir(rev_dir):
        return ''

    revisions = sorted(listdir(rev_dir))
    if not revisions:
        # revisions[-1] would raise IndexError on an empty directory; report
        # "no revision" the same way as for a missing directory instead.
        return ''
    return os.path.join(rev_dir, revisions[-1])


def copy_attachments(page_dir, attachment_dir):
    """Copy the attachments of a page into attachment_dir.

    page_dir       -- directory of a single MoinMoin page
    attachment_dir -- existing directory that receives the copies

    Every regular file in page_dir/attachments is copied; the copy gets the
    lower-cased file name. Pages without an "attachments" directory are
    skipped silently. A file that cannot be copied is reported on stderr and
    does not stop the remaining copies.
    """
    # Local import: shutil is not among the imports at the top of the file.
    import shutil

    source_dir = os.path.join(page_dir, 'attachments')
    if not isdir(source_dir):
        return

    for attachment in listdir(source_dir):
        source = os.path.join(source_dir, attachment)
        if not os.path.isfile(source):
            # Same result as the former plain "cp": directories are not copied.
            continue
        target = os.path.join(attachment_dir, attachment.lower())
        try:
            # Copying in-process avoids building a shell command line, which
            # broke on file names containing quotes, "$" or backticks.
            shutil.copy(source, target)
        except (IOError, OSError) as err:
            sys.stderr.write('Cannot copy attachment "%s": %s\n' % (source, err))


def convert_page(page, file):
    """Convert the lines of a MoinMoin page to Zim wiki markup.

    page -- list of the lines of the page; the list is rewritten in place
    file -- sequence of the name parts of the page (namespaces first, page
            name last); used to build attachment and sub-page link targets

    Every line is passed through the substitution table below, rule by rule
    in table order, so the order of the rules matters. Returns page.
    """
    # ":ns1:ns2:" -- all parts of the page path except the page name itself.
    namespace = ':' + ''.join(part + ':' for part in file[:-1])

    def bullet_indent(match):
        # n leading whitespace characters in front of "*" become n-1 tabs.
        return '\t' * (len(match.group(1)) - 1) + '*'

    regexp = (
        ('\[\[TableOfContents.*\]\]', ''),          # remove
        ('\[\[BR\]\]$', ''),                        # newline at end of line - remove
        ('\[\[BR\]\]', '\n'),                       # newline
        ('#pragma section-numbers off', ''),        # remove
        ('^##.*?\\n', ''),                          # remove
        ('``', ''),                                 # remove
        ('\["', '[['),                              # internal link open
        ('"\]', ']]'),                              # internal link close
        ('\[(http[^ ]*) ([^\]]*)\]', '[[\\1|\\2]]'),        # web link
        # NOTE(review): the two internal-link rules above have already
        # rewritten every '["' and '"]' when this rule runs, so it looks like
        # it can never match.
        ('\["/(.*)"\]', '[['+file[-1]+':\\1]]'),
        # Bullets with 1 to 4 leading whitespace characters. This is a single
        # rule on purpose: with one rule per depth, the tabs written for the
        # deeper levels were matched again by the shallower "\s" patterns and
        # every nested bullet collapsed to a top-level "*".
        (r'^(\s{1,4})\*', bullet_indent),
        ('^\s\s\s\s1\.', '      -'),
        ('^\s\s1\.', '    -'),
        ('^\s1\.', '  -'),
        # Headings are first written with the placeholder "=-" so that the
        # output of one heading rule is not picked up by the next one; the
        # '=-' rule afterwards turns the placeholder into "=".
        ('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'),           # heading 5
        ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'),         # heading 4
        ('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'),       # heading 3
        ('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'),     # heading 2
        ('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'),    # heading 1
        ('=-', '='),
        ('/!\\\\', '**(!)**'),                        # attention icon
        ('\|{2}', '|'),                             # table separator
        ('\'{5}([^\']*)\'{5}', '**//\\1//**'),          # bold and italic
        ('\'{3}([^\']*)\'{3}', '**\\1**'),              # bold
        ('\'{2}([^\']*)\'{2}', '//\\1//'),              # italic
        ('\{{3}', '\'\''),                              # open code/verbatim line segment
        ('\}{3}', '\'\''),                              # close code/verbatim line segment
        # NOTE(review): in this non-raw string "\b" is a backspace character,
        # not a regex word boundary, so the rule only matches text containing
        # literal backspaces. Left unchanged: a working CamelCase rule would
        # also rewrite CamelCase words inside URLs and link targets.
        ('(?<!\[)(\b[A-Z]+[a-z]+[A-Z][A-Za-z]*\b)','[[\\1]]'),  # CamelCase, dont change if CamelCase is in InternalLink
        ('\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', '\\1'),  # Date value
        ('attachment:(.*)','{{'+namespace+'\\1|}}')
    )

    for index, line in enumerate(page):
        for pattern, replacement in regexp:
            line = re.sub(pattern, replacement, line)
        page[index] = line
    return page

def print_help():
    """Write the usage text to stdout and exit with status 0."""
    # sys.stdout.write() instead of the print statement: same output, but
    # valid syntax on Python 2 and Python 3 alike.
    sys.stdout.write(
        "Usage: moin2zim.py <moinmoin pages directory> <output directory>\n"
        "Convert MoinMoin pages to ZIM Wiki.\n"
    )
    sys.exit(0)

def print_parameter_error():
    """Report wrong command line parameters on stderr and exit with status 1."""
    # sys.stderr.write() instead of "print >> sys.stderr": same output, but
    # valid syntax on Python 2 and Python 3 alike.
    sys.stderr.write('Incorrect parameters! Use --help switch to learn more.\n')
    sys.exit(1)

def fix_name(filename):
    """Replace the quoted "(xx)" sequences of a page directory name.

    The substitutions are applied one after the other in the order of the
    table below. Sequences that are not listed (for instance "(2f)") are
    left untouched. Returns the rewritten name.
    """
    substitutions = (
        ('(2d)', '-'),          # hyphen
        ('(20)', '_'),          # space -> underscore
        ('(2e)', '_'),          # decimal point -> underscore
        ('(29)', '_'),          # ) -> underscore
        ('(28)', '_'),          # ( -> underscore
        ('(2b)', '+'),
        ('(2b2b)', '++'),
        ('(2b2b2d)', '++-'),
        ('.', '_'),             # decimal point -> underscore
        ('(2c20)', '_'),        # comma + space -> underscore
        ('(2028)', '_'),        # space + ( -> underscore
        ('(2920)', '_'),        # ) + space -> underscore
        ('(2220)', 'inch_'),    # " + space -> inch + underscore
        ('(3a20)', '_'),        # : + space -> underscore
        ('(202827)', '_'),      # space + ( + ' -> underscore
        ('(2720)', '_'),        # ' + space -> underscore
        ('(c39c)', 'Ue'),       # umlaut
        ('(c3bc)', 'ue'),       # umlaut
        ('(c384)', 'Ae'),       # umlaut
        ('(c3a4)', 'ae'),       # umlaut
        ('(c3b6)', 'oe'),       # umlaut
    )
    for quoted, plain in substitutions:
        filename = filename.replace(quoted, plain)
    return filename

#
# "main" starts here
#
# Command line: either a help switch, or <pages directory> <output directory>.
# print_help() and print_parameter_error() both terminate the script.
if len(sys.argv) > 1:
    if sys.argv[1] in ('-h', '--help'):
        print_help()
    elif len(sys.argv) > 2:
        moin_pages_dir = sys.argv[1]
        output_dir = sys.argv[2]
    else:
        print_parameter_error()
else:
    print_parameter_error()

check_dirs(moin_pages_dir, output_dir)

sys.stdout.write('Input dir is: %s.\n' % moin_pages_dir)
sys.stdout.write('Output dir is: %s.\n' % output_dir)

pathnames = get_path_names(moin_pages_dir)

for pathname in pathnames:
    curr_rev = get_current_revision(pathname)
    if not os.path.exists(curr_rev):
        continue

    page_name = basename(pathname)
    if 'MoinEditorBackup' in page_name:
        continue  # don't convert backups

    # "with" closes the file even when reading fails.
    with open(curr_rev, 'r') as curr_rev_desc:
        curr_rev_content = curr_rev_desc.readlines()

    page_name = fix_name(page_name)

    # "(2f)" separates the namespaces of a page name; the last part is the
    # name of the page itself.
    name_parts = page_name.split('(2f)')
    dateiname = name_parts[-1]
    if len(name_parts) == 1:
        # Pages without a namespace are collected below "unsorted".
        namespaces = ['unsorted']
    else:
        namespaces = name_parts[:-1]

    # Build the page directory and the parallel attachment directory below
    # "media" level by level. os.path.join() is used so that the output
    # directory may be given with or without a trailing slash.
    page_out_dir = output_dir
    attachment_dir = os.path.join(output_dir, 'media')
    if not isdir(attachment_dir):
        os.mkdir(attachment_dir)
    for namespace_name in namespaces:
        page_out_dir = os.path.join(page_out_dir, namespace_name)
        if not isdir(page_out_dir):
            os.mkdir(page_out_dir)

        attachment_dir = os.path.join(attachment_dir, namespace_name)
        if not isdir(attachment_dir):
            os.mkdir(attachment_dir)

    curr_rev_content = convert_page(curr_rev_content, namespaces + [dateiname])

    # open the file and add the ZIM header
    ts = time.strftime("%Y-%m-%dT%H:%M:%S.0", time.localtime())
    out_file = os.path.join(page_out_dir, dateiname + '.txt')
    with open(out_file, 'w') as out_desc:
        out_desc.write('Content-Type: text/x-zim-wiki\nWiki-Format: zim 0.4\nCreation-Date: ')
        out_desc.write(ts)
        out_desc.write('\n\n')

        # write the content of the page
        out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])

    # The directory is handed over with a trailing separator so that plain
    # string concatenation with a file name also yields a valid path.
    copy_attachments(pathname, os.path.join(attachment_dir, ''))