Skip to content

Commit

Permalink
fix: RTL unicode issue in PDF
Browse files Browse the repository at this point in the history
  • Loading branch information
kesara committed Aug 29, 2022
1 parent 0e7206c commit fc13f24
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
6 changes: 6 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ def _pdfwriter(path):
except Exception as e:
print(e)
raise
cls.pdf_writer = elements_writer
cls.elements_root = elements_writer.root
cls.elements_pdfxml = xmldoc(None, bytes=elements_pdfdoc)

Expand All @@ -516,5 +517,10 @@ def test_included_fonts(self):
family = xml2rfc.util.fonts.get_noto_serif_family_for_script(script)
self.assertIn(family, font_families, 'Missing font match for %s' % script)

def test_flatten_unicode_spans(self):
    """Check that flatten_unicode_spans() removes the unicode span tags but keeps their text."""
    expected = '<body><p>foobar</p></body>'
    actual = self.pdf_writer.flatten_unicode_spans(
        '<body><p>f<span class="unicode">o</span>o<span class="unicode">ba</span>r</p></body>'
    )
    self.assertEqual(actual, expected)

# Run the test suite via unittest when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
11 changes: 11 additions & 0 deletions xml2rfc/writers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import io
import logging
import os
import re

import warnings

Expand Down Expand Up @@ -77,6 +78,8 @@ def pdf(self):
htmlwriter = HtmlWriter(self.xmlrfc, quiet=True, options=self.options, date=self.date)
html = htmlwriter.html()

html = self.flatten_unicode_spans(html)

writer = weasyprint.HTML(string=html, base_url="")

cssin = self.options.css or os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'xml2rfc.css')
Expand Down Expand Up @@ -147,6 +150,14 @@ def get_mono_fonts(self):
self.note(None, "Found installed font: %s" % ', '.join(fonts))
return fonts

def flatten_unicode_spans(self, html):
    """Remove ``<span class="unicode">`` wrappers from *html*, keeping their content.

    This is a workaround for a bug in WeasyPrint that doesn't handle RTL
    unicode content correctly.
    See #873 & Kozea/WeasyPrint#1711

    :param html: HTML document as a string.
    :returns: The HTML with every ``<span class="unicode">...</span>``
        replaced by the text between the opening tag and the first
        following ``</span>``.
    """
    # re.DOTALL lets the non-greedy ``.*?`` cross line breaks; without it a
    # span whose content contains a newline would be left untouched.
    # NOTE: the match ends at the first ``</span>``, so a span nested inside
    # a unicode span is not handled by this pattern.
    return re.sub(r'<span class="unicode">(?P<unicode_content>.*?)</span>',
                  r'\g<unicode_content>',
                  html,
                  flags=re.DOTALL)


page_css_template = """
@media print {{
Expand Down

0 comments on commit fc13f24

Please sign in to comment.