-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
setup.py: Add MO translation support
- Loading branch information
Showing
2 changed files
with
336 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,287 @@ | ||
#! /usr/bin/env python | ||
# -*- coding: iso-8859-1 -*- | ||
# Written by Martin v. Loewis <loewis@informatik.hu-berlin.de> | ||
# | ||
# Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless | ||
# translation service (PTS) of Zope | ||
# | ||
# Fixed some bugs and updated to support msgctxt | ||
# by Hanno Schlichting <hanno@hannosch.eu> | ||
|
||
"""Generate binary message catalog from textual translation description. | ||
This program converts a textual Uniforum-style message catalog (.po file) into | ||
a binary GNU catalog (.mo file). This is essentially the same function as the | ||
GNU msgfmt program, however, it is a simpler implementation. | ||
This file was taken from Python-2.3.2/Tools/i18n and altered in several ways. | ||
Now you can simply use it from another python module: | ||
from msgfmt import Msgfmt | ||
mo = Msgfmt(po).get() | ||
where po is path to a po file as string, an opened po file ready for reading or | ||
a list of strings (readlines of a po file) and mo is the compiled mo file as | ||
binary string. | ||
Exceptions: | ||
* IOError if the file couldn't be read | ||
* msgfmt.PoSyntaxError if the po file has syntax errors | ||
""" | ||
|
||
import array | ||
from ast import literal_eval | ||
import codecs | ||
from email.parser import HeaderParser | ||
import struct | ||
import sys | ||
|
||
PY3 = sys.version_info[0] == 3 | ||
if PY3: | ||
def b(s): | ||
return s.encode("latin-1") | ||
|
||
def u(s, enc=None): | ||
return s | ||
|
||
def header_charset(s): | ||
p = HeaderParser() | ||
return p.parsestr(s).get_content_charset() | ||
|
||
import io | ||
BytesIO = io.BytesIO | ||
FILE_TYPE = io.IOBase | ||
else: | ||
def b(s): | ||
return s | ||
|
||
def u(s, enc="unicode_escape"): | ||
return unicode(s, enc) | ||
|
||
def header_charset(s): | ||
p = HeaderParser() | ||
return p.parsestr(s.encode('utf-8', 'ignore')).get_content_charset() | ||
|
||
from cStringIO import StringIO as BytesIO | ||
FILE_TYPE = file | ||
|
||
|
||
class PoSyntaxError(Exception): | ||
""" Syntax error in a po file """ | ||
|
||
def __init__(self, msg): | ||
self.msg = msg | ||
|
||
def __str__(self): | ||
return 'Po file syntax error: %s' % self.msg | ||
|
||
|
||
class Msgfmt: | ||
|
||
def __init__(self, po, name='unknown'): | ||
self.po = po | ||
self.name = name | ||
self.messages = {} | ||
self.openfile = False | ||
# Start off assuming latin-1, so everything decodes without failure, | ||
# until we know the exact encoding | ||
self.encoding = 'latin-1' | ||
|
||
def readPoData(self): | ||
""" read po data from self.po and return an iterator """ | ||
output = [] | ||
if isinstance(self.po, str): | ||
output = open(self.po, 'rb') | ||
elif isinstance(self.po, FILE_TYPE): | ||
self.po.seek(0) | ||
self.openfile = True | ||
output = self.po | ||
elif isinstance(self.po, list): | ||
output = self.po | ||
if not output: | ||
raise ValueError("self.po is invalid! %s" % type(self.po)) | ||
if isinstance(output, FILE_TYPE): | ||
# remove BOM from the start of the parsed input | ||
first = output.readline() | ||
if len(first) == 0: | ||
return output.readlines() | ||
if first.startswith(codecs.BOM_UTF8): | ||
first = first.lstrip(codecs.BOM_UTF8) | ||
return [first] + output.readlines() | ||
return output | ||
|
||
def add(self, context, id, string, fuzzy): | ||
"Add a non-empty and non-fuzzy translation to the dictionary." | ||
if string and not fuzzy: | ||
# The context is put before the id and separated by a EOT char. | ||
if context: | ||
id = context + u('\x04') + id | ||
if not id: | ||
# See whether there is an encoding declaration | ||
charset = header_charset(string) | ||
if charset: | ||
# decode header in proper encoding | ||
string = string.encode(self.encoding).decode(charset) | ||
if not PY3: | ||
# undo damage done by literal_eval in Python 2.x | ||
string = string.encode(self.encoding).decode(charset) | ||
self.encoding = charset | ||
self.messages[id] = string | ||
|
||
def generate(self): | ||
"Return the generated output." | ||
# the keys are sorted in the .mo file | ||
keys = sorted(self.messages.keys()) | ||
offsets = [] | ||
ids = strs = b('') | ||
for id in keys: | ||
msg = self.messages[id].encode(self.encoding) | ||
id = id.encode(self.encoding) | ||
# For each string, we need size and file offset. Each string is | ||
# NUL terminated; the NUL does not count into the size. | ||
offsets.append((len(ids), len(id), len(strs), | ||
len(msg))) | ||
ids += id + b('\0') | ||
strs += msg + b('\0') | ||
output = b('') | ||
# The header is 7 32-bit unsigned integers. We don't use hash tables, | ||
# so the keys start right after the index tables. | ||
keystart = 7 * 4 + 16 * len(keys) | ||
# and the values start after the keys | ||
valuestart = keystart + len(ids) | ||
koffsets = [] | ||
voffsets = [] | ||
# The string table first has the list of keys, then the list of values. | ||
# Each entry has first the size of the string, then the file offset. | ||
for o1, l1, o2, l2 in offsets: | ||
koffsets += [l1, o1 + keystart] | ||
voffsets += [l2, o2 + valuestart] | ||
offsets = koffsets + voffsets | ||
# Even though we don't use a hashtable, we still set its offset to be | ||
# binary compatible with the gnu gettext format produced by: | ||
# msgfmt file.po --no-hash | ||
output = struct.pack("Iiiiiii", | ||
0x950412de, # Magic | ||
0, # Version | ||
len(keys), # # of entries | ||
7 * 4, # start of key index | ||
7 * 4 + len(keys) * 8, # start of value index | ||
0, keystart) # size and offset of hash table | ||
if PY3: | ||
output += array.array("i", offsets).tobytes() | ||
else: | ||
output += array.array("i", offsets).tostring() | ||
output += ids | ||
output += strs | ||
return output | ||
|
||
def get(self): | ||
""" """ | ||
self.read() | ||
# Compute output | ||
return self.generate() | ||
|
||
def read(self, header_only=False): | ||
""" """ | ||
ID = 1 | ||
STR = 2 | ||
CTXT = 3 | ||
|
||
section = None | ||
fuzzy = 0 | ||
msgid = msgstr = msgctxt = u('') | ||
|
||
# Parse the catalog | ||
lno = 0 | ||
for l in self.readPoData(): | ||
l = l.decode(self.encoding) | ||
lno += 1 | ||
# If we get a comment line after a msgstr or a line starting with | ||
# msgid or msgctxt, this is a new entry | ||
if section == STR and (l[0] == '#' or (l[0] == 'm' and | ||
(l.startswith('msgctxt') or l.startswith('msgid')))): | ||
self.add(msgctxt, msgid, msgstr, fuzzy) | ||
section = None | ||
fuzzy = 0 | ||
# If we only want the header we stop after the first message | ||
if header_only: | ||
break | ||
# Record a fuzzy mark | ||
if l[:2] == '#,' and 'fuzzy' in l: | ||
fuzzy = 1 | ||
# Skip comments | ||
if l[0] == '#': | ||
continue | ||
# Now we are in a msgctxt section | ||
if l.startswith('msgctxt'): | ||
section = CTXT | ||
l = l[7:] | ||
msgctxt = u('') | ||
# Now we are in a msgid section, output previous section | ||
elif (l.startswith('msgid') and | ||
not l.startswith('msgid_plural')): | ||
if section == STR: | ||
self.add(msgid, msgstr, fuzzy) | ||
section = ID | ||
l = l[5:] | ||
msgid = msgstr = u('') | ||
is_plural = False | ||
# This is a message with plural forms | ||
elif l.startswith('msgid_plural'): | ||
if section != ID: | ||
raise PoSyntaxError('msgid_plural not preceeded by ' | ||
'msgid on line %d of po file %s' % | ||
(lno, repr(self.name))) | ||
l = l[12:] | ||
msgid += b('\0') # separator of singular and plural | ||
is_plural = True | ||
# Now we are in a msgstr section | ||
elif l.startswith('msgstr'): | ||
section = STR | ||
if l.startswith('msgstr['): | ||
if not is_plural: | ||
raise PoSyntaxError('plural without msgid_plural ' | ||
'on line %d of po file %s' % | ||
(lno, repr(self.name))) | ||
l = l.split(']', 1)[1] | ||
if msgstr: | ||
# Separator of the various plural forms | ||
msgstr += b('\0') | ||
else: | ||
if is_plural: | ||
raise PoSyntaxError('indexed msgstr required for ' | ||
'plural on line %d of po file %s' % | ||
(lno, repr(self.name))) | ||
l = l[6:] | ||
# Skip empty lines | ||
l = l.strip() | ||
if not l: | ||
continue | ||
# TODO: Does this always follow Python escape semantics? | ||
try: | ||
l = literal_eval(l) | ||
except Exception as msg: | ||
raise PoSyntaxError('%s (line %d of po file %s): \n%s' % | ||
(msg, lno, repr(self.name), l)) | ||
l = u(l, self.encoding) | ||
if section == CTXT: | ||
msgctxt += l | ||
elif section == ID: | ||
msgid += l | ||
elif section == STR: | ||
msgstr += l | ||
else: | ||
raise PoSyntaxError('error on line %d of po file %s' % | ||
(lno, repr(self.name))) | ||
|
||
# Add last entry | ||
if section == STR: | ||
self.add(msgctxt, msgid, msgstr, fuzzy) | ||
|
||
if self.openfile: | ||
self.po.close() | ||
|
||
def getAsFile(self): | ||
return BytesIO(self.get()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters