fix: RTL unicode issue in PDF

ietf-tools · Aug 29, 2022 · fc13f24 · fc13f24
1 parent 0e7206c
commit fc13f24
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 0 deletions.
diff --git a/test.py b/test.py
@@ -491,6 +491,7 @@ def _pdfwriter(path):
         except Exception as e:
             print(e)
             raise
+        cls.pdf_writer = elements_writer
         cls.elements_root   = elements_writer.root
         cls.elements_pdfxml = xmldoc(None, bytes=elements_pdfdoc)
 
@@ -516,5 +517,10 @@ def test_included_fonts(self):
                 family = xml2rfc.util.fonts.get_noto_serif_family_for_script(script)
                 self.assertIn(family, font_families, 'Missing font match for %s' % script)
 
+    def test_flatten_unicode_spans(self):  # "unicode" spans are unwrapped, their text is kept
+        input_html = '<body><p>f<span class="unicode">o</span>o<span class="unicode">ba</span>r</p></body>'
+        output_html = self.pdf_writer.flatten_unicode_spans(input_html)
+        self.assertEqual(output_html, '<body><p>foobar</p></body>')  # other markup is unchanged
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/xml2rfc/writers/pdf.py b/xml2rfc/writers/pdf.py
@@ -5,6 +5,7 @@
 import io
 import logging
 import os
+import re
 
 import warnings
 
@@ -77,6 +78,8 @@ def pdf(self):
         htmlwriter = HtmlWriter(self.xmlrfc, quiet=True, options=self.options, date=self.date)
         html = htmlwriter.html()
 
+        html = self.flatten_unicode_spans(html)
+
         writer = weasyprint.HTML(string=html, base_url="")
 
         cssin  = self.options.css or os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'xml2rfc.css')
@@ -147,6 +150,14 @@ def get_mono_fonts(self):
         self.note(None, "Found installed font: %s" % ', '.join(fonts))
         return fonts
 
+    def flatten_unicode_spans(self, html):
+        # Workaround for a WeasyPrint bug: RTL unicode content is not handled
+        # correctly (see #873 & Kozea/WeasyPrint#1711). Returns `html` with each
+        # <span class="unicode">...</span> replaced by the content it wraps.
+        return re.sub(r'<span class="unicode">(?P<unicode_content>.*?)</span>',
+                      r'\g<unicode_content>',  # keep only the captured inner content
+                      html)  # no re.DOTALL: a span containing a newline is not matched
+
 
 page_css_template = """
 @media print {{