Notecase import script

jaap-karssenberg edited this page Oct 25, 2013 · 1 revision
Clone this wiki locally

file "notecase2zim.py":

#!/usr/bin/python

# Simple script to convert NoteCase Document to a Zim notebook folder
#
# NoteCase reference: 
#    http://notecase.sourceforge.net/  (Free version, discontinued)
#    http://www.virtual-sky.com/   (Pro version)
#
# Based on BeautifulSoup (you need to install it before running notecase2zim): 
#    http://www.crummy.com/software/BeautifulSoup/
#
# Adapted to my use of NoteCase and Zim => others may want to adapt it
# For instance:
#   Color "red" in NoteCase => I use "italic" in Zim
#   Background Color "grey" in NoteCase => Title 3 in Zim
#
# Usage :
# -------
# 1. Save NoteCase document to .ncd format (plain text, no compression)
# 2. This script assumes the name is "notecase.ncd". This can be changed below
# 3. Run: python notecase2zim.py
# 4. Get a Folder named "notecase.zim" with the main file "notebook.zim" inside
#
# v1.1
# Jigho 2011
# Contact: https://launchpad.net/~jigho
#

import os
import shutil
import sys
import re
import datetime
sys.path.append('./BeautifulSoup')

from BeautifulSoup import BeautifulSoup

# Path of the NoteCase document to convert (plain-text .ncd export).
# It is read by main(), and the output folder name is derived from it
# in the script entry point at the bottom of the file.
notecasefile = 'notecase.ncd'

def create_file_zim():
    """Write the Zim notebook descriptor ``notebook.zim``.

    The file is created in the current working directory and contains a
    ``[Notebook]`` section with a fixed name, version and end-of-line mode.
    """
    # You may change the name and endofline mode here.
    # The context manager closes the file even if the write fails.
    with open('notebook.zim', 'w') as fileZim:
        fileZim.write('[Notebook]\nname=Notes\nversion=0.4\nendofline=dos')

def process_title(titre, date):
    # Some titles are plain, but some have information that we do not use in Zim
    if (titre.span):
        m = titre.span.contents
        titre2 = str(m[1])
    elif (titre.string):
        titre2 = titre.string
    else:
        m = titre.contents
        titre2 = str(m[1])

    # Delete white space, / and " in the filename
    output1 = str(titre2 + '.txt').replace(' ', '_')
    output2 = output1.replace('/', '')
    output3 = output2.replace('\"', '')
    output = unicode(output3, 'utf-8', errors='ignore')

    # Some verbose, usefull on large contents
    # to be aware that the program is still processing...
    print 'Creating file: ', output

    fileOut = open(output, 'w')

    # Standard information at the start of any Zim file
    fileOut.write('Content-Type: text/x-zim-wiki\n')
    fileOut.write('Wiki-Format: zim 0.4\n')
    fileOut.write('Creation-Date: ' + str(date) + '\n')

    fileOut.write('\n====== ' + titre2 + ' ======\n')
    fileOut.write('\n')

    return fileOut

def create_subdir(repertoire):
    """Create the sub-folder for the children of a page and enter it.

    ``repertoire`` is the open page file; the folder is named after the
    file name with its trailing ``.txt`` extension removed. The current
    working directory is changed to that folder, so the caller has to
    ``os.chdir('..')`` once the children are processed.
    """
    nom = repertoire.name
    # Strip only the trailing extension: a blanket replace() would also
    # remove a '.txt' occurring inside the title, and the folder name would
    # no longer match the page file name.
    if nom.endswith('.txt'):
        rep = nom[:-len('.txt')]
    else:
        rep = nom
    # The folder may already exist (e.g. several child lists for the same
    # page); reuse it instead of failing in os.mkdir().
    if not os.path.isdir(rep):
        os.mkdir(rep)
    os.chdir(rep)

def process_format(c, fichier, formatString):
    """Write the content of a formatting tag wrapped in a Zim wiki marker.

    Parameters:
        c:            the formatting tag (underline, bold, italic, ...).
        fichier:      open page file receiving the converted text.
        formatString: wiki marker written before and after the content
                      (e.g. '**' for bold).

    Note: when the content ends with a line break, that break (and the
    whitespace-only text following it) is removed from ``c`` and written
    after the closing marker instead.
    """
    newLine = False

    # Open Wiki format
    fichier.write(formatString)

    # Another trick in case the formatted content ends with a newline:
    # I then prefer to close the formatting tag and then write the
    # new line without formatting.
    # The node after the <br> is only dropped when it is whitespace-only
    # text; otherwise real text following the last <br> would be lost.
    if len(c.contents) > 1:
        before_last = c.contents[-2]
        last = c.contents[-1]
        if (before_last.__class__.__name__ == 'Tag'
                and before_last.name == 'br'
                and last.__class__.__name__ != 'Tag'
                and not last.strip()):
            before_last.extract()
            last.extract()
            newLine = True

    # Process content (recursively !)
    process_content(c, fichier, formatString)

    # Close Wiki format
    fichier.write(formatString)

    # End of the trick for content finishing with a newline
    if newLine:
        fichier.write('\n')

def process_content(contenu, fichier, currentFormat):
    # "currentFormat" is a trick to close the Wiki format at end of each line
    # even if the format is applied to multi-lines
    # Nota: this trick would need to be be enhanced
    #       when multiple formats are nested

    for c in contenu:
        if (c.__class__.__name__ == 'Tag'):
            # <dl> tag stands for new note, ie new Zim file
            if c.name == 'dl':
                create_subdir(fichier)
                process_page(c)
                os.chdir('..')

            # <br> tag stands for new line
            # use the "currentFormat" trick to properly close format tag
            # and then reopen it on the the new line
            elif c.name == 'br':
                fichier.write(currentFormat)
                fichier.write('\n')
                fichier.write(currentFormat)

            # <u> tag stands for underline
            elif c.name == 'u':
                process_format(c, fichier, '__')

            # <b> tag stands for bold
            elif c.name == 'b':
                process_format(c, fichier, '**')

            # <i> tag stands fr italic
            elif c.name == 'i':
                process_format(c, fichier, '//')

            # <s> tag stands for strike-through
            elif c.name == 's':
                process_format(c, fichier, '~~')

            # <span> tag can have different purposes according to arguments
            elif c.name == 'span':
                # Color "red" in NoteCase => I use "italic" in Zim
                if (c['style'] == "color:#ff0000"):
                    process_format(c, fichier, '//')
                # Color "blue" in NoteCase => I use "bold" in Zim
                elif (c['style'] == "color:#0000ff"):
                    process_format(c, fichier, '**')
                # Color "green" in NoteCase => I use "bold" in Zim
                elif (c['style'] == "color:#00ff00"):
                    process_format(c, fichier, '**')
                # Background Color "grey" in NoteCase => Title 3 in Zim
                elif (c['style'] == "background-color:#bfbfbf"):
                    fichier.write('===== ')
                    # Don't not use the "currentFormat" trick,
                    # since title format is not symetrical
                    #(which add difficulty)
                    # and Zim seems to autoclose this format at the end of line
                    process_content(c, fichier, currentFormat)
                    currentFormat = ''
                # Other <span> contents are treated as plain text
                # You may add more cases according to your needs
                else:
                    print "WARNING : unknown SPAN type", c.attrs
                    process_content(c, fichier, currentFormat)

            # <p> tag is not taken into account
            elif c.name == 'p':
                process_content(c, fichier, currentFormat)

            # <a> tag stands for links
            elif c.name == 'a':
                fichier.write('[[')
                fichier.write(c['href'].encode('utf-8'))
                fichier.write('|')
                process_content(c, fichier, currentFormat)
                fichier.write(']]')

            # In case program encounter a Tag which is not dealt with
            # according to your needs, you can then add specific bloc
            else:    
                print 'WARNING, unknown tag: ', c.name
                fichier.write(
                    'TAG ' + c.name + ' / ' + c.string.encode("UTF-8"))

        else:
            ligne = c.string.encode("UTF-8")
            # Delete the new line symbol at start of the line
            # This happens when there was a <br> just before
            # but <br> is already taken into account
            fichier.write(re.sub("^\n", '', ligne))

def process_page(page):
    """Convert one NoteCase note list into Zim page files.

    ``page`` is a parsed ``<dl>`` element. Each ``<dt>`` child opens a new
    page file (see process_title), each ``<dd>`` child is converted into the
    most recently opened page, and a ``date_created`` property comment
    updates the creation date used for the pages opened after it (today's
    date is used until one is found).
    """
    creation = datetime.date.today()
    fileOut = None
    try:
        for a in page.contents:
            if (a.__class__.__name__ == 'Tag'):
                if a.name == 'dt':
                    # Close the previous page explicitly instead of relying
                    # on garbage collection to release the file.
                    if fileOut is not None:
                        fileOut.close()
                    fileOut = process_title(a, creation)
                elif a.name == 'dd':
                    if fileOut is None:
                        # No page is open yet: there is nowhere to write to.
                        sys.stderr.write(
                            'WARNING : <dd> without preceding <dt>, '
                            'content skipped\n')
                    else:
                        process_content(a.contents, fileOut, '')
            elif (a.__class__.__name__ == 'Comment'):
                m = re.match("<!--property:date_created=(.*)-->$", str(a))
                if (m):
                    creation = datetime.date.fromtimestamp(float(m.group(1)))
    finally:
        # Make sure the last page is flushed and closed, even on error.
        if fileOut is not None:
            fileOut.close()

def main(repertoire):
    """Convert the NoteCase document ``notecasefile`` into ``repertoire``.

    The document is read and parsed first (its path is relative to the
    starting directory), then the working directory is changed to
    ``repertoire``, where the notebook descriptor and all pages are written.
    ``repertoire`` must already exist.
    """
    # Read the whole document and release the file handle right away.
    with open(notecasefile, 'r') as source:
        xml = source.read()
    soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.XML_ENTITIES)
    level0 = soup.html.body.dl
    os.chdir(repertoire)
    create_file_zim()
    process_page(level0)

if __name__ == '__main__':
    # Output folder: same name as the document, with ".zim" instead of
    # ".ncd". The dot is escaped so that only a real ".ncd" extension is
    # replaced (an unescaped "." matches any character).
    zimdir = re.sub(r"\.ncd$", ".zim", notecasefile)
    if zimdir == notecasefile:
        # No ".ncd" extension: append ".zim" so the folder name never
        # collides with the document itself.
        zimdir = notecasefile + '.zim'
    #shutil.rmtree(zimdir)
    os.mkdir(zimdir)
    main(zimdir)