First version with passing tests on Python 2.6, 2.7 and 3.2

hannosch committed on Dec 22, 2011 · commit 6945966
1 parent 548c0b0
commit 6945966
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 63 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,9 +1,11 @@
 Changelog
 =========
 
-1.3 - unreleased
+2.0 - unreleased
 ----------------
 
+- Python 2 and 3 compatibility in the same codebase.
+  [hannosch]
 
 1.2 - 2011-11-01
 ----------------

diff --git a/pythongettext/msgfmt.py b/pythongettext/msgfmt.py
@@ -33,6 +33,7 @@
 
 import array
 import codecs
+from email.parser import HeaderParser
 import struct
 import sys
 
@@ -41,18 +42,26 @@
     def b(s):
         return s.encode("latin-1")
 
-    def u(s):
+    def u(s, enc=None):
         return s
 
+    def header_charset(s):
+        p = HeaderParser()
+        return p.parsestr(s).get_content_charset()
+
     import io
     BytesIO = io.BytesIO
     FILE_TYPE = io.IOBase
 else:
     def b(s):
         return s
 
-    def u(s):
-        return unicode(s, "unicode_escape")
+    def u(s, enc="unicode_escape"):
+        return unicode(s, enc)
+
+    def header_charset(s):
+        p = HeaderParser()
+        return p.parsestr(s.encode('utf-8', 'ignore')).get_content_charset()
 
     from cStringIO import StringIO as BytesIO
     FILE_TYPE = file
@@ -75,6 +84,9 @@ def __init__(self, po, name='unknown'):
         self.name = name
         self.messages = {}
         self.openfile = False
+        # Start off assuming latin-1, so everything decodes without failure,
+        # until we know the exact encoding
+        self.encoding = 'latin-1'
 
     def readPoData(self):
         """ read po data from self.po and return an iterator """
@@ -99,13 +111,18 @@ def readPoData(self):
             return [first] + output.readlines()
         return output
 
-    def add(self, context, id, str, fuzzy):
+    def add(self, context, id, string, fuzzy):
         "Add a non-empty and non-fuzzy translation to the dictionary."
-        if str and not fuzzy:
+        if string and not fuzzy:
             # The context is put before the id and separated by a EOT char.
             if context:
-                id = context + '\x04' + id
-            self.messages[id] = str
+                id = context + u('\x04') + id
+            self.messages[id] = string
+            if not id:
+                # See whether there is an encoding declaration
+                charset = header_charset(string)
+                if charset:
+                    self.encoding = charset
 
     def generate(self):
         "Return the generated output."
@@ -114,12 +131,14 @@ def generate(self):
         offsets = []
         ids = strs = b('')
         for id in keys:
+            msg = self.messages[id].encode(self.encoding)
+            id = id.encode(self.encoding)
             # For each string, we need size and file offset. Each string is
             # NUL terminated; the NUL does not count into the size.
             offsets.append((len(ids), len(id), len(strs),
-                            len(self.messages[id])))
+                            len(msg)))
             ids += id + b('\0')
-            strs += self.messages[id] + b('\0')
+            strs += msg + b('\0')
         output = b('')
         # The header is 7 32-bit unsigned integers. We don't use hash tables,
         # so the keys start right after the index tables.
@@ -144,7 +163,10 @@ def generate(self):
                              7 * 4,             # start of key index
                              7 * 4 + len(keys) * 8,  # start of value index
                              0, keystart)       # size and offset of hash table
-        output += array.array("i", offsets).tostring()
+        if PY3:
+            output += array.array("i", offsets).tobytes()
+        else:
+            output += array.array("i", offsets).tostring()
         output += ids
         output += strs
         return output
@@ -163,17 +185,17 @@ def read(self, header_only=False):
 
         section = None
         fuzzy = 0
-        msgid = msgstr = msgctxt = ''
+        msgid = msgstr = msgctxt = u('')
 
         # Parse the catalog
         lno = 0
         for l in self.readPoData():
+            l = l.decode(self.encoding)
             lno += 1
             # If we get a comment line after a msgstr or a line starting with
             # msgid or msgctxt, this is a new entry
             if section == STR and (l[0] == '#' or (l[0] == 'm' and
                (l.startswith('msgctxt') or l.startswith('msgid')))):
-
                 self.add(msgctxt, msgid, msgstr, fuzzy)
                 section = None
                 fuzzy = 0
@@ -187,47 +209,46 @@ def read(self, header_only=False):
             if l[0] == '#':
                 continue
             # Now we are in a msgctxt section
-            elif l[0] == 'm':
-                if l.startswith('msgctxt'):
-                    section = CTXT
-                    l = l[7:]
-                    msgctxt = ''
-                # Now we are in a msgid section, output previous section
-                elif (l.startswith('msgid') and
-                      not l.startswith('msgid_plural')):
-                    if section == STR:
-                        self.add(msgid, msgstr, fuzzy)
-                    section = ID
-                    l = l[5:]
-                    msgid = msgstr = ''
-                    is_plural = False
-                # This is a message with plural forms
-                elif l.startswith('msgid_plural'):
-                    if section != ID:
-                        raise PoSyntaxError('msgid_plural not preceeded by '
-                            'msgid on line %d of po file %s' %
+            if l.startswith('msgctxt'):
+                section = CTXT
+                l = l[7:]
+                msgctxt = u('')
+            # Now we are in a msgid section, output previous section
+            elif (l.startswith('msgid') and
+                  not l.startswith('msgid_plural')):
+                if section == STR:
+                    self.add(msgid, msgstr, fuzzy)
+                section = ID
+                l = l[5:]
+                msgid = msgstr = u('')
+                is_plural = False
+            # This is a message with plural forms
+            elif l.startswith('msgid_plural'):
+                if section != ID:
+                    raise PoSyntaxError('msgid_plural not preceeded by '
+                        'msgid on line %d of po file %s' %
+                        (lno, repr(self.name)))
+                l = l[12:]
+                msgid += b('\0')  # separator of singular and plural
+                is_plural = True
+            # Now we are in a msgstr section
+            elif l.startswith('msgstr'):
+                section = STR
+                if l.startswith('msgstr['):
+                    if not is_plural:
+                        raise PoSyntaxError('plural without msgid_plural '
+                            'on line %d of po file %s' %
+                            (lno, repr(self.name)))
+                    l = l.split(']', 1)[1]
+                    if msgstr:
+                        # Separator of the various plural forms
+                        msgstr += b('\0')
+                else:
+                    if is_plural:
+                        raise PoSyntaxError('indexed msgstr required for '
+                            'plural on line %d of po file %s' %
                             (lno, repr(self.name)))
-                    l = l[12:]
-                    msgid += '\0'  # separator of singular and plural
-                    is_plural = True
-                # Now we are in a msgstr section
-                elif l.startswith('msgstr'):
-                    section = STR
-                    if l.startswith('msgstr['):
-                        if not is_plural:
-                            raise PoSyntaxError('plural without msgid_plural '
-                                'on line %d of po file %s' %
-                                (lno, repr(self.name)))
-                        l = l.split(']', 1)[1]
-                        if msgstr:
-                            # Separator of the various plural forms
-                            msgstr += '\0'
-                    else:
-                        if is_plural:
-                            raise PoSyntaxError('indexed msgstr required for '
-                                'plural on line %d of po file %s' %
-                                (lno, repr(self.name)))
-                        l = l[6:]
+                    l = l[6:]
             # Skip empty lines
             l = l.strip()
             if not l:
@@ -238,6 +259,7 @@ def read(self, header_only=False):
             except Exception as msg:
                 raise PoSyntaxError('%s (line %d of po file %s): \n%s' %
                     (msg, lno, repr(self.name), l))
+            l = u(l, self.encoding)
             if section == CTXT:
                 msgctxt += l
             elif section == ID:

diff --git a/pythongettext/tests/test_compile.py b/pythongettext/tests/test_compile.py
@@ -35,7 +35,7 @@ def compare_po_mo(self, poname, moname):
             po_file = open(os.path.join(FOLDER, poname), 'rb')
             po = Msgfmt(po_file).get()
             mo_file = open(os.path.join(FOLDER, moname), 'rb')
-            mo = ''.join(mo_file.readlines())
+            mo = b('').join(mo_file.readlines())
         finally:
             if po_file is not None:
                 po_file.close()
@@ -60,26 +60,41 @@ def test_test4(self):
         po_file = open(os.path.join(FOLDER, 'test4.po'), 'rb')
         po = Msgfmt(po_file)
         po.read(header_only=True)
-        self.assertTrue(po.messages[''].startswith('Project-Id-Version: foo'))
+        po_file.close()
+        self.assertTrue(
+            po.messages[u('')].startswith('Project-Id-Version: foo'))
+        self.assertEqual(po.encoding, u('iso-8859-1'))
 
     def test_test5(self):
         po_file = open(os.path.join(FOLDER, 'test5.po'), 'rb')
         po = Msgfmt(po_file)
-        with self.assertRaises(PoSyntaxError):
-            po.read()
+        try:
+            with self.assertRaises(PoSyntaxError):
+                po.read()
+        finally:
+            po_file.close()
+        self.assertEqual(po.encoding, u('utf-8'))
 
     def test_test5_unicode_name(self):
         po_file = open(os.path.join(FOLDER, 'test5.po'), 'rb')
         po = Msgfmt(po_file, name=u('dømain', 'utf-8'))
-        with self.assertRaises(PoSyntaxError):
-            po.read()
+        try:
+            with self.assertRaises(PoSyntaxError):
+                po.read()
+        finally:
+            po_file.close()
+        self.assertEqual(po.encoding, u('utf-8'))
 
     def test_escape(self):
         po_file = open(os.path.join(FOLDER, 'test_escape.po'), 'rb')
         po = Msgfmt(po_file)
-        with self.assertRaises(PoSyntaxError) as e:
-            po.read()
-        self.assertTrue('line 19' in e.exception.msg)
+        try:
+            with self.assertRaises(PoSyntaxError) as e:
+                po.read()
+            self.assertTrue('line 19' in e.exception.msg)
+            self.assertEqual(po.encoding, u('utf-8'))
+        finally:
+            po_file.close()
 
     def test_unicode_bom(self):
         self.compare_po_mo('test_unicode_bom.po', 'test_unicode_bom.mo')
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 import sys
 from setuptools import setup
 
-version = '1.3dev'
+version = '2.0dev'
 
 PY3 = sys.version_info[0] == 3
 install_requires = []
@@ -28,6 +28,8 @@
         'Programming Language :: Python',
         'Programming Language :: Python :: 2.6',
         'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
         'Topic :: Software Development :: Internationalization',
         'Topic :: Software Development :: Localization',
       ],