Softnote import script

jaap-karssenberg edited this page Oct 25, 2013 · 1 revision
Clone this wiki locally

The following script can be used to convert XML exported from softnote into a zim notebook. It does not support all special characters in the RTF data generated by softnote, so some manual editing of the XML may be required for the script to run successfully.

file "softnote2zim.py":

#!/usr/bin/python

# -*- coding: utf-8 -*-

# Copyright 2011 Jaap Karssenberg <pardus@cpan.org>

# Simple script to convert softnote XML to a zim notebook folder
# Written as a quick hack, so quality of results may vary

# This script needs pyth, see http://pypi.python.org/pypi/pyth/

# TODO:
# * Looks like we lose strike formatting - blame pyth, is another parser available for rtf ?
# * Nested formatting not supported by zim, but we output it anyway


import os
import sys
import re
sys.path.append('./pyth-0.5.6/')

from xml.etree import ElementTree
from StringIO import StringIO

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

from zim.fs import Dir, TmpFile
from zim.notebook import Notebook, Path

import zim.stores.xml
import zim.stores.files


def _escape_softnote_xml(data):
    '''Escape the invalid XML written by softnote so it can be parsed

    Exploits the fact that softnote XML is nicely line based: a tag is
    only recognized at the very start of a line, optionally with its
    matching closing tag at the very end of that same line, and tags do
    not contain any '&'. Everything else on the line is treated as
    character data and gets '&', '<' and '>' escaped. A line starting
    with '<?xml' is passed through untouched.

    @param data: raw softnote export as a string
    @returns: the escaped XML as a string, one '\\n' terminated line per
    input line
    '''
    lines = []
    for line in data.splitlines():
        if line.startswith('<?xml'):
            lines.append(line)
            continue

        # Safe to do before tag detection because tags never contain '&'
        line = line.replace('&', '&amp;')

        start = end = ''
        match = re.match(r'^<(/?\w+)>', line) # match xml tag at start of line
        if match:
            tag = match.group(1)
            taglen = len(tag) + 2 # tag name plus '<' and '>'
            start, line = line[:taglen], line[taglen:]
            closing = '</%s>' % tag
            if line.endswith(closing): # also tag on end of line
                end = closing
                line = line[:-len(closing)]

        lines.append(
            start + line.replace('<', '&lt;').replace('>', '&gt;') + end)

    return ''.join(line + '\n' for line in lines)


def parse(data):
    '''Converts softnote xml to xml representing a zim notebook

    Each 'XRECORDDATA' element becomes a 'page' element. Records with
    'XPARENT' equal to '0' are attached below a page named after their
    'XCATALOG' value (created on first use), all other records are
    attached below the page of the record whose 'XID' matches their
    'XPARENT'. The page text is the 'XBODY' value run through
    convert_rtf().

    @param data: raw softnote export as a string
    @returns: the serialized 'section' element holding all pages
    @raises AssertionError: when a sub-note is found before its parent
    '''
    xml = _escape_softnote_xml(data)

    # Parse XML
    #~ open('intermediate.xml', 'w').write(xml) # DEBUG
    tree = ElementTree.fromstring(xml)
    notebook = ElementTree.Element('section')

    categories = {} # top level folders by name
    pages = {} # pages by id

    for xrecord in tree.findall('XRECORDDATA'):
        #~ print 'FOUND:', map(xrecord.findtext, ('XCATALOG', 'XSUBJECT', 'XID', 'XPARENT'))

        parentid = xrecord.findtext('XPARENT')
        if parentid == '0':
            # we found a top node within category
            category = xrecord.findtext('XCATALOG')
            if category not in categories:
                # first time we see this category
                el = ElementTree.Element('page', {'name': category})
                el.tail = '\n'
                notebook.append(el)
                categories[category] = el
            parent = categories[category]
        else:
            # some sub-note - raise explicitly so the check is not
            # stripped when running with "python -O"
            if parentid not in pages:
                raise AssertionError('Found sub-note before parent :(')
            parent = pages[parentid]

        title = xrecord.findtext('XSUBJECT')
        name = title.replace(':', ' ') # will confuse hierarchy
        # Clean up the colon-free name, not the raw title, otherwise the
        # replacement above is thrown away again
        name = Notebook.cleanup_pathname(name, purge=True) # make a valid name
        el = ElementTree.Element('page', {'name': name})
        el.tail = '\n'
        parent.append(el)

        pages[xrecord.findtext('XID')] = el

        el.text = convert_rtf(xrecord.findtext('XBODY'))

    return ElementTree.tostring(notebook)

def convert_rtf(rtf):
    '''Converts rtf to zim wiki text'''
    print "DECODING >>>\n", rtf, '<<<\n'
    doc = Rtf15Reader.read(StringIO(rtf))
    html = XHTMLWriter.write(doc, pretty=True).read()
    return convert_html(html)

def convert_html(html):
    '''Converts html to zim wiki text

    Parses the html string as XML and serializes the resulting element
    tree with _serialize_html().
    '''
    #~ print "GOT HTML:\n", html
    root = ElementTree.fromstring(html)
    #~ print "MADE TEXT:\n", _serialize_html(root)
    return _serialize_html(root)

def _serialize_html(tree):
    text = tree.text or ''
    for el in tree:
        if el.tag == 'strong':
            text += "**" + _serialize_html(el) + "**"
        elif el.tag == 'em':
            text += "//" + _serialize_html(el) + "//"
        elif el.tag == 'u':
            text += "__" + _serialize_html(el) + "__"
        elif el.tag == 'strike':
            text += "~~" + _serialize_html(el) + "~~"
        else:
            text += _serialize_html(el)
        text += el.tail or ''
    return text


def dump(xml, folder):
    '''Takes zim notebook in XML format and dump to file structure'''
    sourcefile = TmpFile('softnote2zim-tmp')
    sourcefile.write(xml)
    source = zim.stores.xml.Store(FakeNotebook(), Path(':'), file=sourcefile)

    target = zim.stores.files.Store(FakeNotebook(), Path(':'), dir=Dir(folder))

    for s_page in source.walk():
        text = source.get_node(s_page).text
        #~ print 'PAGE:', s_page.name
        #~ print text

        t_page = target.get_page(s_page)
        assert not t_page.source.exists(), 'Don\'t want to overwrite %s' % t_page.source.path
        print 'Writing:', t_page.source.path
        t_page.source.write(text)


class FakeNotebook(object):
    '''Minimal stand-in passed to the store constructors in place of a
    notebook object, it only provides the 'endofline' attribute
    '''

    # 'dos' line endings on Windows, 'unix' everywhere else
    endofline = 'dos' if os.name == 'nt' else 'unix'



if __name__ == '__main__':
    if len(sys.argv) == 3:
        input = sys.argv[1]
        xml = parse(open(input).read())
        #~ print xml
        dump(xml, sys.argv[2])
    else:
        print 'Usage: softnote2zim.py SOFTNOTE_XML OUTPUT_FOLDER'
        print 'output folder should be a new empty folder'