setup.py: Add MO translation support

jasperproject · Jan 4, 2016 · 700932d · 700932d
1 parent d8fb52c
commit 700932d
Show file tree

Hide file tree

Showing 2 changed files with 336 additions and 0 deletions.
diff --git a/msgfmt.py b/msgfmt.py
@@ -0,0 +1,287 @@
+#! /usr/bin/env python
+# -*- coding: iso-8859-1 -*-
+# Written by Martin v. Loewis <loewis@informatik.hu-berlin.de>
+#
+# Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless
+# translation service (PTS) of Zope
+#
+# Fixed some bugs and updated to support msgctxt
+# by Hanno Schlichting <hanno@hannosch.eu>
+
+"""Generate binary message catalog from textual translation description.
+
+This program converts a textual Uniforum-style message catalog (.po file) into
+a binary GNU catalog (.mo file). This is essentially the same function as the
+GNU msgfmt program, however, it is a simpler implementation.
+
+This file was taken from Python-2.3.2/Tools/i18n and altered in several ways.
+Now you can simply use it from another python module:
+
+  from msgfmt import Msgfmt
+  mo = Msgfmt(po).get()
+
+where po is the path to a po file as a string, an opened po file ready for
+reading, or a list of lines (the readlines() of a po file), and mo is the
+compiled mo file as a binary string.
+
+Exceptions:
+
+  * IOError if the file couldn't be read
+
+  * msgfmt.PoSyntaxError if the po file has syntax errors
+"""
+
+import array
+from ast import literal_eval
+import codecs
+from email.parser import HeaderParser
+import struct
+import sys
+
+# Python 2/3 compatibility shims. Both branches define the same five names:
+# b(), u(), header_charset(), BytesIO and FILE_TYPE.
+PY3 = sys.version_info[0] == 3
+if not PY3:
+    from cStringIO import StringIO as BytesIO
+    FILE_TYPE = file
+
+    def b(s):
+        """Return ``s`` unchanged (a Python 2 ``str`` is a byte string)."""
+        return s
+
+    def u(s, enc="unicode_escape"):
+        """Decode the byte string ``s`` to ``unicode`` using ``enc``."""
+        return unicode(s, enc)
+
+    def header_charset(s):
+        """Return the charset declared in the Content-Type header of ``s``.
+
+        ``s`` is encoded to UTF-8 (dropping what cannot be encoded) before
+        it is handed to the header parser.
+        """
+        encoded = s.encode('utf-8', 'ignore')
+        message = HeaderParser().parsestr(encoded)
+        return message.get_content_charset()
+else:
+    import io
+    BytesIO = io.BytesIO
+    FILE_TYPE = io.IOBase
+
+    def b(s):
+        """Encode the text ``s`` to bytes, one byte per character."""
+        return s.encode("latin-1")
+
+    def u(s, enc=None):
+        """Return ``s`` unchanged; ``enc`` is accepted but ignored."""
+        return s
+
+    def header_charset(s):
+        """Return the charset declared in the Content-Type header of ``s``."""
+        message = HeaderParser().parsestr(s)
+        return message.get_content_charset()
+
+
+class PoSyntaxError(Exception):
+    """Syntax error in a po file.
+
+    The description is available as ``msg`` and, like for any other
+    exception, as ``args[0]``.
+    """
+
+    def __init__(self, msg):
+        # Initialise the base class as well so that ``args`` is populated;
+        # otherwise repr() and pickling silently lose the message.
+        super(PoSyntaxError, self).__init__(msg)
+        self.msg = msg
+
+    def __str__(self):
+        return 'Po file syntax error: %s' % self.msg
+
+
+class Msgfmt:
+    """Compile a textual .po message catalog into binary GNU .mo data.
+
+    ``po`` is the path to a po file (``str``), an opened po file or a list
+    of lines. ``name`` is only used to label the catalog in the messages of
+    PoSyntaxError.
+
+    Typical use::
+
+        mo = Msgfmt('messages.po').get()
+    """
+
+    def __init__(self, po, name='unknown'):
+        self.po = po
+        self.name = name
+        # Maps the message id (prefixed with "<context>\x04" when the entry
+        # has a msgctxt) to its translation.
+        self.messages = {}
+        # Set by readPoData() when self.po is a caller-supplied file object;
+        # read() closes that file once it is done with it.
+        self.openfile = False
+        # Start off assuming latin-1, so everything decodes without failure,
+        # until we know the exact encoding
+        self.encoding = 'latin-1'
+
+    def readPoData(self):
+        """Return the lines of the catalog given as ``self.po``.
+
+        A path is opened in binary mode, read completely and closed again.
+        A file object is rewound and read completely; it stays open here
+        and is closed by read(). A list is returned as it is (an empty list
+        is an empty catalog). For both file cases a leading UTF-8 byte
+        order mark is removed from the first line.
+
+        Raises:
+            ValueError: if ``self.po`` is neither a ``str``, a file object
+                nor a list.
+            IOError: if the path cannot be opened or read.
+        """
+        if isinstance(self.po, list):
+            return self.po
+        if isinstance(self.po, str):
+            # We opened this file ourselves, so we also close it again.
+            with open(self.po, 'rb') as stream:
+                lines = stream.readlines()
+        elif isinstance(self.po, FILE_TYPE):
+            self.po.seek(0)
+            self.openfile = True
+            lines = self.po.readlines()
+        else:
+            raise ValueError("self.po is invalid! %s" % type(self.po))
+        # Remove the BOM from the start of the parsed input. Slice the
+        # prefix off instead of using lstrip(): lstrip() treats its argument
+        # as a set of bytes and may eat legitimate leading bytes too.
+        if (lines and isinstance(lines[0], bytes) and
+                lines[0].startswith(codecs.BOM_UTF8)):
+            lines[0] = lines[0][len(codecs.BOM_UTF8):]
+        return lines
+
+    def add(self, context, id, string, fuzzy):
+        """Add a non-empty and non-fuzzy translation to the dictionary.
+
+        ``context`` is the msgctxt (may be empty), ``id`` the msgid and
+        ``string`` the msgstr of the entry. The entry with the empty id is
+        the catalog header: if it declares a charset, the header is decoded
+        again with that charset, which also becomes ``self.encoding``.
+        """
+        if not string or fuzzy:
+            return
+        # The context is put before the id and separated by a EOT char.
+        if context:
+            id = context + u('\x04') + id
+        if not id:
+            # See whether there is an encoding declaration
+            charset = header_charset(string)
+            if charset:
+                # decode header in proper encoding
+                string = string.encode(self.encoding).decode(charset)
+                if not PY3:
+                    # undo damage done by literal_eval in Python 2.x
+                    string = string.encode(self.encoding).decode(charset)
+                self.encoding = charset
+        self.messages[id] = string
+
+    def generate(self):
+        """Return the collected messages as binary .mo data.
+
+        Layout: a header of seven 32-bit integers, the key index, the value
+        index, all keys and finally all values. No hash table is written.
+        """
+        # the keys are sorted in the .mo file
+        keys = sorted(self.messages.keys())
+        nul = b('\0')
+        offsets = []
+        id_chunks = []
+        str_chunks = []
+        id_pos = str_pos = 0
+        for key in keys:
+            msg = self.messages[key].encode(self.encoding)
+            raw_id = key.encode(self.encoding)
+            # For each string, we need size and file offset. Each string is
+            # NUL terminated; the NUL does not count into the size.
+            offsets.append((id_pos, len(raw_id), str_pos, len(msg)))
+            id_chunks.append(raw_id + nul)
+            str_chunks.append(msg + nul)
+            id_pos += len(raw_id) + 1
+            str_pos += len(msg) + 1
+        # Join once at the end instead of growing two byte strings with +=.
+        ids = b('').join(id_chunks)
+        strs = b('').join(str_chunks)
+        # The header is 7 32-bit unsigned integers. We don't use hash tables,
+        # so the keys start right after the index tables.
+        keystart = 7 * 4 + 16 * len(keys)
+        # and the values start after the keys
+        valuestart = keystart + len(ids)
+        koffsets = []
+        voffsets = []
+        # The string table first has the list of keys, then the list of values.
+        # Each entry has first the size of the string, then the file offset.
+        for o1, l1, o2, l2 in offsets:
+            koffsets += [l1, o1 + keystart]
+            voffsets += [l2, o2 + valuestart]
+        offsets = koffsets + voffsets
+        # Even though we don't use a hashtable, we still set its offset to be
+        # binary compatible with the gnu gettext format produced by:
+        # msgfmt file.po --no-hash
+        output = struct.pack("Iiiiiii",
+                             0x950412de,        # Magic
+                             0,                 # Version
+                             len(keys),         # # of entries
+                             7 * 4,             # start of key index
+                             7 * 4 + len(keys) * 8,  # start of value index
+                             0, keystart)       # size and offset of hash table
+        if PY3:
+            output += array.array("i", offsets).tobytes()
+        else:
+            output += array.array("i", offsets).tostring()
+        output += ids
+        output += strs
+        return output
+
+    def get(self):
+        """Parse the catalog and return the compiled .mo data."""
+        self.read()
+        # Compute output
+        return self.generate()
+
+    def read(self, header_only=False):
+        """Parse the catalog and collect its entries in ``self.messages``.
+
+        If ``header_only`` is true, parsing stops as soon as the first entry
+        is complete. A caller-supplied file object is closed afterwards.
+
+        Raises:
+            PoSyntaxError: if a line cannot be parsed or the plural keywords
+                are used inconsistently.
+        """
+        ID = 1
+        STR = 2
+        CTXT = 3
+
+        section = None
+        fuzzy = 0
+        # Defined up front so that a msgstr that is not preceded by a msgid
+        # is handled by the checks below instead of raising a NameError.
+        is_plural = False
+        msgid = msgstr = msgctxt = u('')
+
+        # Parse the catalog
+        lno = 0
+        for l in self.readPoData():
+            # Lines may already be text (list input, text mode file).
+            if isinstance(l, bytes):
+                l = l.decode(self.encoding)
+            lno += 1
+            # startswith() instead of l[0] so that an empty line is fine.
+            is_comment = l.startswith('#')
+            # If we get a comment line after a msgstr or a line starting with
+            # msgid or msgctxt, this is a new entry
+            if section == STR and (is_comment or
+                                   l.startswith(('msgctxt', 'msgid'))):
+                self.add(msgctxt, msgid, msgstr, fuzzy)
+                # The next entry must not inherit this entry's context.
+                msgctxt = u('')
+                section = None
+                fuzzy = 0
+                # If we only want the header we stop after the first message
+                if header_only:
+                    break
+            # Record a fuzzy mark
+            if l[:2] == '#,' and 'fuzzy' in l:
+                fuzzy = 1
+            # Skip comments
+            if is_comment:
+                continue
+            # Now we are in a msgctxt section
+            if l.startswith('msgctxt'):
+                section = CTXT
+                l = l[7:]
+                msgctxt = u('')
+            # Now we are in a msgid section. A previous entry has already
+            # been stored by the new-entry check at the top of the loop.
+            elif (l.startswith('msgid') and
+                  not l.startswith('msgid_plural')):
+                section = ID
+                l = l[5:]
+                msgid = msgstr = u('')
+                is_plural = False
+            # This is a message with plural forms
+            elif l.startswith('msgid_plural'):
+                if section != ID:
+                    raise PoSyntaxError('msgid_plural not preceeded by '
+                        'msgid on line %d of po file %s' %
+                        (lno, repr(self.name)))
+                l = l[12:]
+                # separator of singular and plural; msgid is text, so the
+                # separator has to be text as well
+                msgid += u('\0')
+                is_plural = True
+            # Now we are in a msgstr section
+            elif l.startswith('msgstr'):
+                section = STR
+                if l.startswith('msgstr['):
+                    if not is_plural:
+                        raise PoSyntaxError('plural without msgid_plural '
+                            'on line %d of po file %s' %
+                            (lno, repr(self.name)))
+                    l = l.split(']', 1)[1]
+                    if msgstr:
+                        # Separator of the various plural forms
+                        msgstr += u('\0')
+                else:
+                    if is_plural:
+                        raise PoSyntaxError('indexed msgstr required for '
+                            'plural on line %d of po file %s' %
+                            (lno, repr(self.name)))
+                    l = l[6:]
+            # Skip empty lines
+            l = l.strip()
+            if not l:
+                continue
+            # TODO: Does this always follow Python escape semantics?
+            try:
+                l = literal_eval(l)
+            except Exception as msg:
+                raise PoSyntaxError('%s (line %d of po file %s): \n%s' %
+                    (msg, lno, repr(self.name), l))
+            l = u(l, self.encoding)
+            if section == CTXT:
+                msgctxt += l
+            elif section == ID:
+                msgid += l
+            elif section == STR:
+                msgstr += l
+            else:
+                raise PoSyntaxError('error on line %d of po file %s' %
+                    (lno, repr(self.name)))
+
+        # Add last entry
+        if section == STR:
+            self.add(msgctxt, msgid, msgstr, fuzzy)
+
+        if self.openfile:
+            self.po.close()
+
+    def getAsFile(self):
+        """Return the compiled .mo data wrapped in a file-like object."""
+        return BytesIO(self.get())
diff --git a/setup.py b/setup.py
@@ -1,10 +1,51 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
+import os
+import msgfmt
 import setuptools
+from setuptools.command.bdist_egg import bdist_egg
+from distutils.command.build import build
 
 APPNAME = 'jasper'
 
 
+class jasper_bdist_egg(bdist_egg):
+    """bdist_egg command that runs build_i18n before building the egg."""
+
+    def run(self):
+        # Compile the translations first, then continue with the stock
+        # bdist_egg implementation.
+        self.run_command('build_i18n')
+        bdist_egg.run(self)
+
+
+class jasper_build_i18n(setuptools.Command):
+    """Compile every PO catalog below the setup.py directory to an MO file.
+
+    Each ``<name>.po`` that is found is compiled with msgfmt.Msgfmt and the
+    result is written next to it as ``<name>.mo``.
+    """
+    description = 'compile PO translations to MO files'
+
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        # When invoked as "python setup.py ...", __file__ is a bare
+        # 'setup.py' whose dirname is ''. os.walk('') silently yields
+        # nothing, so resolve the path before taking its directory.
+        base_dir = os.path.dirname(os.path.abspath(__file__))
+        for root, _, filenames in os.walk(base_dir):
+            for po_filename in filenames:
+                stem, ext = os.path.splitext(po_filename)
+                if ext != '.po':
+                    continue
+                po_path = os.path.join(root, po_filename)
+                mo_path = os.path.join(root, os.extsep.join([stem, 'mo']))
+                print('compile %s -> %s' % (po_path, mo_path))
+                # Compile before opening the target, so that a broken PO
+                # file does not leave a truncated MO file behind.
+                mo_data = msgfmt.Msgfmt(po_path).get()
+                with open(mo_path, 'wb') as f:
+                    f.write(mo_data)
+
+
+class jasper_build(build):
+    """build command that also compiles the PO translations."""
+
+    # build runs its sub commands in list order. build_i18n has to come
+    # first, so that the MO files already exist when the later sub commands
+    # collect the package data.
+    sub_commands = [
+        ('build_i18n', None)
+    ] + build.sub_commands
+
+
 setuptools.setup(
     name=APPNAME,
     version='2.0a1.dev1',
@@ -38,11 +79,13 @@
         APPNAME: [
             'data/audio/*.wav',
             'data/locale/*.po',
+            'data/locale/*.mo',
             'data/standard_phrases/*.txt',
             '../plugins/*/*/*.py',
             '../plugins/*/*/plugin.info',
             '../plugins/*/*/*.txt',
             '../plugins/*/*/locale/*.po',
+            '../plugins/*/*/locale/*.mo',
             '../plugins/*/*/tests/*.py'
         ]
     },
@@ -60,5 +103,11 @@
         'console_scripts': [
             'Jasper = %s.main:main' % APPNAME
         ]
+    },
+
+    cmdclass={
+        'bdist_egg': jasper_bdist_egg,
+        'build': jasper_build,
+        'build_i18n': jasper_build_i18n,
     }
 )