Convert DokuWiki Files to Publii Posts
===============================

This is a quick-and-dirty notebook for reading in a bunch of text files and inserting them into a Publii database as posts.

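3rd party dependencies:
 - pandoc (imported as `pandoc`)
 - Pillow (imported as `PIL`)
 - tqdm
 - pyexiv2
 - pyperclip (only needed by the clipboard cell at the end)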
 

In [None]:
from datetime import datetime, timedelta
import os.path
import os
from pathlib import Path, PurePath
import pandoc
from PIL import Image
import pyexiv2 as exiv
import re
import sqlite3
import shutil
from tqdm.notebook import tqdm

In [None]:
"""
Iterate through a folder of DokuWiki text files and:
  1) Convert to regular HTML with pandoc
  2) If there's an image gallery, build up an HTML string for it
"""

def build_gallery3(galname, basedir):
    """
    Build the HTML for an image gallery from the image files in a folder.

    Only the markup is produced here; copying the images and creating the
    ``-thumbnail`` files that the markup points at is left to the caller.

    Args:
        galname: Name of the gallery sub-folder inside ``basedir``.
        basedir: Folder that contains the gallery sub-folders.

    Returns:
        A ``(html, files)`` tuple: the gallery markup as a single string, and
        the list of image ``Path`` objects (``*.png``, ``*.jpg``, ``*.jpeg``,
        ``*.gif``, in that order) the markup was built from. A folder with no
        matching images yields an empty gallery ``<div>`` and an empty list.

    Raises:
        Exception: If the exiv2 wrapper hands back a falsy image object.
    """
    # Local import keeps this function self-contained; captions come from
    # image metadata and must not be able to break the generated markup.
    from html import escape

    pre = '<div class="gallery" contenteditable="false" data-is-empty="false" data-translation="Add images" data-columns="4">'
    post = '</div>'
    galpath = Path(os.path.join(basedir, galname))

    files = []
    for pattern in ("*.png", "*.jpg", "*.jpeg", "*.gif"):
        files.extend(galpath.glob(pattern))

    parts = [pre]
    for file in files:
        image_path = os.path.join(galpath, file.name)

        # Caption preference: IPTC caption, then IPTC headline, then the
        # bare file name.
        caption = file.name
        meta = exiv.Image(image_path, encoding='utf-8')
        if not meta:
            raise Exception(f"exiv2 library couldn't get data from {file}")
        try:
            iptc = meta.read_iptc()
        finally:
            # Release the exiv2 handle even when reading the metadata fails.
            meta.close()
        if 'Iptc.Application2.Caption' in iptc:
            caption = iptc['Iptc.Application2.Caption']
        elif 'Iptc.Application2.Headline' in iptc:
            caption = iptc['Iptc.Application2.Headline']

        # The full-size dimensions go into data-size; the context manager
        # closes the file handle as soon as they have been read.
        with Image.open(image_path) as picture:
            width, height = picture.size

        # Escape quotes as well, because the caption is used inside alt="...".
        safe_caption = escape(str(caption))
        parts.append(
            f'<figure class="gallery__item"><a href="#DOMAIN_NAME#gallery/{file.name}" data-size="{width}x{height}"><img src="#DOMAIN_NAME#gallery/{file.stem}-thumbnail{file.suffix}" alt="{safe_caption}" width="480" height="480"></a><figcaption>{safe_caption}</figcaption></figure>'
        )

    parts.append(post)
    return ("".join(parts), files)


def process_wiki_file(filepath, gallery_dir=r'd:\oldsitepics'):
    """
    Convert one DokuWiki text file into the pieces needed for a Publii post.

    The file name must start with a ``YYYY-MM-DD`` date. The first line of the
    file is the title header (``====== Title ======``), the second line is
    skipped, and the remainder is the post body.

    Args:
        filepath: ``Path`` of the DokuWiki text file.
        gallery_dir: Folder holding the gallery sub-folders named by
            ``{{gallery>:blog:NAME?&unite}}`` tags. Defaults to the location
            this function previously hard-coded.

    Returns:
        A dict with the keys:
          - ``date``: post date as integer milliseconds since the epoch
          - ``title``: title text with the header markup removed
          - ``slug``: file name without its extension
          - ``doc``: the pandoc document parsed from the body
          - ``pub``: the body as HTML, followed by the gallery HTML (if any)
          - ``images``: list of gallery image paths, or ``None`` without a gallery

    Raises:
        ValueError: If the first 10 characters of the file name are not a
            ``YYYY-MM-DD`` date.
    """
    gallery = None
    gallery_html = ""
    image_list = None

    # The date comes from the file name, shifted to 15:00. The datetime is
    # naive, so timestamp() interprets it in the machine's local time zone.
    slug = filepath.stem
    date = datetime.strptime(filepath.name[:10], '%Y-%m-%d') + timedelta(hours=15)
    date = int(date.timestamp() * 1000)

    with open(filepath, 'r', encoding='utf-8') as fp:
        header = fp.readline()
        # Read the rest of the post, skipping the line right after the title.
        lines = fp.readlines()[1:]

    # Strip the surrounding '=' markers by content rather than by fixed
    # offsets, so other header levels or a missing trailing newline do not
    # chop characters off the title.
    title = header.strip().strip('=').strip()

    # Filter the body for:
    #  - Gallery tags, which are removed from the text and remembered
    #    (if several are present, the last one wins)
    #  - An empty subhead, used as dokuwiki's "read more" feature
    gallery_tag = re.compile(r'(\{\{gallery\>\:blog\:(.*)\?\&unite\}\})')
    body_lines = []
    for line in lines:
        match = gallery_tag.search(line)
        if match:
            gallery = match.group(2)
            continue
        if line.rstrip('\n') == '===== =====':
            # Placeholder that survives the pandoc conversion; it is swapped
            # for the read-more <hr> afterwards.
            line = '---READMORE---'
        body_lines.append(line)
    text = ''.join(body_lines)

    doc = pandoc.read(text, format='dokuwiki')
    pub = pandoc.write(doc, format='html')
    pub = pub.replace('---READMORE---', '<hr id="read-more" data-translation="Read more">')

    if gallery:
        gallery_html, image_list = build_gallery3(gallery, gallery_dir)

    return {'date': date,
            'title': title,
            'slug': slug,
            'doc': doc,
            'pub': pub + gallery_html,
            'images': image_list}


# Source locations. gallery_folder is defined alongside text_folder but
# nothing in this cell reads it.
text_folder = r'd:\oldsite'
gallery_folder = r'd:\oldsitepics'

# Collect the wiki files first so tqdm knows the total for its progress bar.
p = Path(text_folder)
wiki_files = list(p.glob("*.txt"))

# One converted post dict per wiki file, in glob order.
posts = []
posts.extend(process_wiki_file(f) for f in tqdm(wiki_files))
print("Done")

In [None]:
"""
Importing posts into database
"""

# Path to Publii site's input\media\posts
postsfolder = r'C:\Users\USERNAME\Sync\publii\sites\woltmancom\input\media\posts'
if not Path(postsfolder).is_dir():
    raise Exception(f"{postsfolder} doesn't exist or is not a folder")
# Publii database
dbpath = r"C:\Users\USERNAME\Sync\publii\sites\woltmancom\input\playground.sqlite"

post_sql = """
  INSERT INTO posts
  (title, authors, slug, text, created_at, modified_at, status, template)
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
  """

# Used for the posts_additional_data table.
additional_sql = """
  INSERT INTO posts_additional_data
  (post_id, key, value)
  VALUES (?, ?, ?)
  """
core_json = """{"metaTitle":"","metaDesc":"","metaRobots":"index, follow","canonicalUrl":"","mainTag":"","editor":"tinymce"}"""
view_json = """{"displayDate":{"type":"select"},"displayAuthor":{"type":"select"},"displayLastUpdatedDate":{"type":"select"},"displayTags":{"type":"select"},"displayShareButtons":{"type":"select"},"displayAuthorBio":{"type":"select"},"displayPostNavigation":{"type":"select"},"displayRelatedPosts":{"type":"select"},"displayComments":{"type":"select"}}"""

# Set to False to insert the database rows without copying gallery images
# or generating thumbnails.
process_images = True

con = sqlite3.connect(dbpath)
try:
    cur = con.cursor()
    for post in tqdm(posts, position=0, desc="post", leave=True, colour='green'):
        # The same timestamp is stored as both created_at and modified_at.
        cur.execute(post_sql, (post['title'], 1, post['slug'],
                               post['pub'], post['date'], post['date'],
                               'published', ''))
        # The new row id links the additional-data rows to the post and
        # names the post's media folder.
        pid = cur.lastrowid
        cur.execute(additional_sql, (pid, '_core', core_json))
        cur.execute(additional_sql, (pid, 'postViewSettings', view_json))

        if post['images'] and process_images:
            folder = os.path.join(postsfolder, str(pid), 'gallery')
            os.makedirs(folder, exist_ok=True)
            for image_path in tqdm(post['images'], position=1, desc='thumbs', leave=False, colour='red'):
                # Full-size copy plus a "<stem>-thumbnail<suffix>" file, the
                # two names the gallery markup refers to.
                shutil.copy(image_path, os.path.join(folder, image_path.name))
                thumbpath = os.path.join(folder, f'{image_path.stem}-thumbnail{image_path.suffix}')
                # The context manager closes the source file once the
                # thumbnail has been written.
                with Image.open(image_path) as img:
                    img.thumbnail((480, 480), Image.Resampling.LANCZOS)
                    img.save(thumbpath)

    # Commit once, after every post went in, so a failure part-way through
    # leaves the database without a partial import.
    con.commit()
finally:
    # Always release the database file; closing without a commit discards
    # whatever was still pending.
    con.close()

In [None]:
"""
Barebones support for just opening a text file, converting it, and copying it to the clipboard
"""
import pandoc
import pyperclip
# Read the DokuWiki source first; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open(r"D:\dokuwiki.txt", 'r', encoding='utf-8') as wiki_file:
    wiki_text = wiki_file.read()

# Convert DokuWiki markup to HTML and put the result on the clipboard.
doc = pandoc.read(wiki_text, format='dokuwiki')
pub = pandoc.write(doc, format='html')
pyperclip.copy(pub)