Permalink
Browse files

Merge pull request #3 from jlward/issue_3

Move over the docx tests (not the xml ones)
  • Loading branch information...
2 parents 6a624e5 + bdf3c79 commit a7dd82b6ac002af33d8c01c150396e333e304367 @jlward committed Mar 22, 2013
Showing with 984 additions and 30 deletions.
  1. +1 −1 .travis.yml
  2. +21 −13 pydocx/DocxParser.py
  3. BIN pydocx/fixtures/attachment_is_tiff.docx
  4. BIN pydocx/fixtures/bigger_font_size_to_header.docx
  5. BIN pydocx/fixtures/convert_p_to_h.docx
  6. BIN pydocx/fixtures/fake_headings_by_length.docx
  7. BIN pydocx/fixtures/greek_alphabet.docx
  8. BIN pydocx/fixtures/has_image.docx
  9. BIN pydocx/fixtures/has_missing_image.docx
  10. BIN pydocx/fixtures/has_title.docx
  11. BIN pydocx/fixtures/header_footer_problem.docx
  12. BIN pydocx/fixtures/headers.docx
  13. BIN pydocx/fixtures/headers_with_full_line_styles.docx
  14. BIN pydocx/fixtures/inline_tags.docx
  15. BIN pydocx/fixtures/list_in_table.docx
  16. BIN pydocx/fixtures/list_to_header.docx
  17. BIN pydocx/fixtures/lists_with_styles.docx
  18. BIN pydocx/fixtures/missing_content.docx
  19. BIN pydocx/fixtures/nested_lists.docx
  20. BIN pydocx/fixtures/nested_table_rowspan.docx
  21. BIN pydocx/fixtures/nested_tables.docx
  22. BIN pydocx/fixtures/resized_image.docx
  23. BIN pydocx/fixtures/shift_enter.docx
  24. BIN pydocx/fixtures/simple.docx
  25. BIN pydocx/fixtures/simple_lists.docx
  26. BIN pydocx/fixtures/special_chars.docx
  27. BIN pydocx/fixtures/split_header.docx
  28. BIN pydocx/fixtures/table_col_row_span.docx
  29. BIN pydocx/fixtures/tables_in_lists.docx
  30. BIN pydocx/fixtures/track_changes_on.docx
  31. BIN pydocx/fixtures/upper_alpha_all_bold.docx
  32. +10 −10 pydocx/lxmlparser.py
  33. +16 −6 pydocx/parsers/Docx2Html.py
  34. +123 −0 pydocx/tests/__init__.py
  35. +809 −0 pydocx/tests/test_docx.py
  36. +1 −0 requirements.txt
  37. +3 −0 run_tests.sh
View
2 .travis.yml
@@ -2,7 +2,7 @@ language: python
python:
- "2.6"
- "2.7"
-script: python main.py
+script: ./run_tests.sh
install:
- pip install -r requirements.txt
notifications:
View
34 pydocx/DocxParser.py
@@ -66,11 +66,11 @@ def __init__(self, path):
self.document_text = f.read('word/document.xml')
try:
self.numbering_text = f.read('word/numbering.xml')
- except zipfile.BadZipfile:
+ except KeyError:
pass
try:
self.comment_text = f.read('word/comments.xml')
- except zipfile.BadZipfile:
+ except KeyError:
pass
finally:
f.close()
@@ -172,7 +172,7 @@ def parse_lists(self, el):
if lst_style['val'] == 'bullet':
parsed += self.unordered_list(chunk_parsed)
else:
- parsed += self.ordered_list(chunk_parsed)
+ parsed += self.ordered_list(chunk_parsed, lst_style['val'])
elif chunk[0].has_child_all('br'):
parsed += self.page_break()
else:
@@ -191,46 +191,54 @@ def parse(self, el):
'tbl' in tmp_d and
el.parent_list[tmp_d['tbl']] not in self.tables_seen):
self.ignore_current = True
- self.tables_seen.append(el.parent_list[tmp_d['tbl']])
- tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
+ tbl = el.parent_list[tmp_d['tbl']]
+ self.tables_seen.append(tbl)
+ tmpout = self.table(self.parse(tbl))
self.ignore_current = False
+
+ # Need to keep track of visited trs and tcs
+ self.visited.extend(
+ e for e in el_iter(tbl)
+ if e.tag in ['tr', 'tc']
+ )
return tmpout
for child in el:
parsed += self.parse(child)
- if el.tag == 'br' and el.attrib['type'] == 'page':
+ if el.tag == 'br' and el.attrib.get('type') == 'page':
#TODO figure out what parsed is getting overwritten
return self.page_break()
- # add it to the list so we don't repeat!
+ # Add it to the list so we don't repeat!
if el.tag == 'ilvl' and el not in self.visited:
self.in_list = True
self.visited.append(el)
## This starts the returns
- elif el.tag == 'tr':
+ # Do not do the tr or tc a second time
+ elif el.tag == 'tr' and el not in self.visited:
return self.table_row(parsed)
- elif el.tag == 'tc':
+ elif el.tag == 'tc' and el not in self.visited:
self.elements.append(el)
return self.table_cell(parsed)
if el.tag == 'r' and el not in self.elements:
self.elements.append(el)
return self.parse_r(el)
elif el.tag == 'p':
+ if el.parent.tag == 'tc':
+ return parsed
return self.parse_p(el, parsed)
elif el.tag == 'ins':
return self.insertion(parsed, '', '')
else:
return parsed
def parse_p(self, el, text):
+ if text == '':
+ return ''
parsed = text
if self.in_list:
self.in_list = False
parsed = self.list_element(parsed)
- elif (
- not el.has_child_all('t') and
- 'tbl' not in [i.tag for i in el.parent_list]):
- parsed = self.linebreak()
elif el.parent not in self.elements:
parsed = self.paragraph(parsed)
return parsed
View
BIN pydocx/fixtures/attachment_is_tiff.docx
Binary file not shown.
View
BIN pydocx/fixtures/bigger_font_size_to_header.docx
Binary file not shown.
View
BIN pydocx/fixtures/convert_p_to_h.docx
Binary file not shown.
View
BIN pydocx/fixtures/fake_headings_by_length.docx
Binary file not shown.
View
BIN pydocx/fixtures/greek_alphabet.docx
Binary file not shown.
View
BIN pydocx/fixtures/has_image.docx
Binary file not shown.
View
BIN pydocx/fixtures/has_missing_image.docx
Binary file not shown.
View
BIN pydocx/fixtures/has_title.docx
Binary file not shown.
View
BIN pydocx/fixtures/header_footer_problem.docx
Binary file not shown.
View
BIN pydocx/fixtures/headers.docx
Binary file not shown.
View
BIN pydocx/fixtures/headers_with_full_line_styles.docx
Binary file not shown.
View
BIN pydocx/fixtures/inline_tags.docx
Binary file not shown.
View
BIN pydocx/fixtures/list_in_table.docx
Binary file not shown.
View
BIN pydocx/fixtures/list_to_header.docx
Binary file not shown.
View
BIN pydocx/fixtures/lists_with_styles.docx
Binary file not shown.
View
BIN pydocx/fixtures/missing_content.docx
Binary file not shown.
View
BIN pydocx/fixtures/nested_lists.docx
Binary file not shown.
View
BIN pydocx/fixtures/nested_table_rowspan.docx
Binary file not shown.
View
BIN pydocx/fixtures/nested_tables.docx
Binary file not shown.
View
BIN pydocx/fixtures/resized_image.docx
Binary file not shown.
View
BIN pydocx/fixtures/shift_enter.docx
Binary file not shown.
View
BIN pydocx/fixtures/simple.docx
Binary file not shown.
View
BIN pydocx/fixtures/simple_lists.docx
Binary file not shown.
View
BIN pydocx/fixtures/special_chars.docx
Binary file not shown.
View
BIN pydocx/fixtures/split_header.docx
Binary file not shown.
View
BIN pydocx/fixtures/table_col_row_span.docx
Binary file not shown.
View
BIN pydocx/fixtures/tables_in_lists.docx
Binary file not shown.
View
BIN pydocx/fixtures/track_changes_on.docx
Binary file not shown.
View
BIN pydocx/fixtures/upper_alpha_all_bold.docx
Binary file not shown.
View
20 pydocx/lxmlparser.py
@@ -9,15 +9,15 @@
# visited already.
#if el in visited_nodes:
#continue
-with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f:
- document = f.read('word/document.xml')
- numbering= f.read('word/numbering.xml')
-parser=etree.XMLParser(ns_clean=True)
-document=StringIO(document)
-numbering=StringIO(numbering)
-numbering_tree=etree.parse(numbering,parser)
-numbering_namespace=numbering_tree.getroot().nsmap['w']
-visited_els=[]
+#with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f:
+# document = f.read('word/document.xml')
+# numbering= f.read('word/numbering.xml')
+#parser=etree.XMLParser(ns_clean=True)
+#document=StringIO(document)
+#numbering=StringIO(numbering)
+#numbering_tree=etree.parse(numbering,parser)
+#numbering_namespace=numbering_tree.getroot().nsmap['w']
+#visited_els=[]
def get_parsed():
parser=etree.XMLParser(ns_clean=True)
@@ -108,4 +108,4 @@ def get_list_style(numval):
if i.find('{%s}numFmt' %numbering_namespace) is not None:
return i.find('{%s}numFmt' %numbering_namespace).attrib
-print get_parsed()
+#print get_parsed()
View
22 pydocx/parsers/Docx2Html.py
@@ -11,11 +11,21 @@ def parsed(self):
self._parsed = self._parsed.replace('</p><br /><p>', '</p><p>')
self._parsed = self._parsed.replace('</p><br /><ul>', '</p><ul>')
return (
- '<html><head><style>.insert{{color:red}}.delete'
- '{{color:red; text-decoration:line-through}}.center'
- '{{text-align:center}}.right{{text-align:right}}'
- '</style></head><body>{content}</body></html>'
- ).format(content=self._parsed)
+ '<html>{head}<body>{content}</body></html>'
+ ).format(
+ head=self.head(),
+ content=self._parsed,
+ )
+
+ def head(self):
+ return '<head>{style}</head>'.format(
+ style=self.style(),
+ )
+
+ def style(self):
+ return '<style>.insert{{color:red}}.delete'
+ '{{color:red; text-decoration:line-through}}.center'
+ '{{text-align:center}}.right{{text-align:right}}</style>'
def escape(self, text):
return xml.sax.saxutils.quoteattr(text)[1:-1]
@@ -41,7 +51,7 @@ def deletion(self, text, author, date):
def list_element(self, text):
return "<li>{text}</li>".format(text=text)
- def ordered_list(self, text):
+ def ordered_list(self, text, list_style):
return "<ol>{text}</ol>".format(text=text)
def unordered_list(self, text):
View
123 pydocx/tests/__init__.py
@@ -0,0 +1,123 @@
+#from unittest import TestCase
+import re
+
+#from docx2html.core import (
+# MetaData,
+# create_html,
+#)
+
+
+def assert_html_equal(actual_html, expected_html):
+ assert collapse_html(
+ actual_html,
+ ) == collapse_html(
+ expected_html
+ ), actual_html
+
+
+def collapse_html(html):
+ """
+ Remove insignificant whitespace from the html.
+ Every newline, together with the whitespace around it, is replaced by a
+ single space, or by nothing when the newline is adjacent to a tag. The
+ result is also stripped of leading and trailing whitespace.
+
+ >>> print collapse_html('''\\
+ ... <h1>
+ ... Heading
+ ... </h1>
+ ... ''')
+ <h1>Heading</h1>
+ >>> print collapse_html('''\\
+ ... <p>
+ ... Paragraph with
+ ... multiple lines.
+ ... </p>
+ ... ''')
+ <p>Paragraph with multiple lines.</p>
+ """
+ def smart_space(match):
+ # Put a space in between lines, unless at least one side of the line
+ # break butts up against a tag (a '>' just before it or a '<' just
+ # after it); in that case the lines are joined with nothing in between.
+ before = match.group(1)
+ after = match.group(2)
+ space = ' '
+ if before == '>' or after == '<':
+ space = ''
+ return before + space + after
+ # Replace newlines and their surrounding whitespace with a single space (or
+ # empty string)
+ # Group 1 captures an optional '>' directly before the whitespace run and
+ # group 2 an optional '<' directly after it; smart_space puts both back.
+ html = re.sub(
+ r'(>?)\s*\n\s*(<?)',
+ smart_space,
+ html,
+ )
+ return html.strip()
+
+
+#DEFAULT_NUMBERING_DICT = {
+# '1': {
+# 0: 'decimal',
+# 1: 'decimal',
+# },
+# '2': {
+# 0: 'none',
+# 1: 'none',
+# },
+#}
+#DEFAULT_RELATIONSHIP_DICT = {
+# 'rId3': 'fontTable.xml',
+# 'rId2': 'numbering.xml',
+# 'rId1': 'styles.xml',
+#}
+#DEFAULT_STYLES_DICT = {
+# 'style0': {
+# 'header': False,
+# 'font_size': '24',
+# 'based_on': None,
+# },
+#}
+#DEFAULT_FONT_SIZES_DICT = {
+# '24': None,
+#}
+#
+#
+#def image_handler(*args, **kwargs):
+# return 'test'
+#DEFAULT_IMAGE_HANDLER = image_handler
+#DEFAULT_IMAGE_SIZES = {}
+#
+#
+## This is a base test case defining methods to generate the xml and the meta
+## data for each test case.
+#class _TranslationTestCase(TestCase):
+# expected_output = None
+# numbering_dict = DEFAULT_NUMBERING_DICT
+# relationship_dict = DEFAULT_RELATIONSHIP_DICT
+# styles_dict = DEFAULT_STYLES_DICT
+# font_sizes_dict = DEFAULT_FONT_SIZES_DICT
+# image_handler = DEFAULT_IMAGE_HANDLER
+# image_sizes = DEFAULT_IMAGE_SIZES
+#
+# def get_xml(self):
+# raise NotImplementedError()
+#
+# def get_meta_data(self):
+# return MetaData(
+# numbering_dict=self.numbering_dict,
+# relationship_dict=self.relationship_dict,
+# styles_dict=self.styles_dict,
+# font_sizes_dict=self.font_sizes_dict,
+# image_handler=self.image_handler,
+# image_sizes=self.image_sizes,
+# )
+#
+# def test_expected_output(self):
+# if self.expected_output is None:
+# raise AssertionError('expected_output is not defined')
+#
+# # Create the xml
+# tree = self.get_xml()
+# meta_data = self.get_meta_data()
+#
+# # Verify the final output.
+# html = create_html(tree, meta_data)
+#
+# assert_html_equal(html, self.expected_output)
View
809 pydocx/tests/test_docx.py
@@ -0,0 +1,809 @@
+#import mock
+import tempfile
+import shutil
+from os import path
+#from zipfile import ZipFile
+from nose.plugins.skip import SkipTest
+#from nose.tools import assert_raises
+
+from pydocx.tests import collapse_html
+from pydocx.parsers.Docx2Html import Docx2Html
+
+
+class TestDocx2HTML(Docx2Html):
+ def head(self):
+ return ''
+
+ def table(self, text):
+ return '<table>' + text + '</table>'
+
+ def ordered_list(self, text, list_style):
+ list_type_conversions = {
+ 'decimal': 'decimal',
+ 'decimalZero': 'decimal-leading-zero',
+ 'upperRoman': 'upper-roman',
+ 'lowerRoman': 'lower-roman',
+ 'upperLetter': 'upper-alpha',
+ 'lowerLetter': 'lower-alpha',
+ 'ordinal': 'decimal',
+ 'cardinalText': 'decimal',
+ 'ordinalText': 'decimal',
+ }
+ return '<ol data-list-type="{list_style}">{text}</ol>'.format(
+ list_style=list_type_conversions.get(list_style, 'decimal'),
+ text=text,
+ )
+
+
+def convert(path):
+ return TestDocx2HTML(path).parsed
+
+
+def assert_html_equal(actual_html, expected_html):
+ assert collapse_html(
+ actual_html,
+ ) == collapse_html(
+ expected_html
+ ), actual_html
+
+
+def test_extract_html():
+ # Converts the simple.docx fixture and checks the resulting html: one
+ # paragraph, one three-item ordered list and one 2x2 table.
+ # The fixture is looked up relative to this file (../fixtures/simple.docx).
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple.docx',
+ )
+ actual_html = convert(file_path)
+ # assert_html_equal collapses whitespace on both sides, so the layout of
+ # the expected html below is not significant.
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <p>
+ Simple text
+ </p>
+ <ol data-list-type="decimal">
+ <li>one</li>
+ <li>two</li>
+ <li>three</li>
+ </ol>
+ <table>
+ <tr>
+ <td>Cell1</td>
+ <td>Cell2</td>
+ </tr>
+ <tr>
+ <td>Cell3</td>
+ <td>cell4</td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_nested_list():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <ol data-list-type="decimal">
+ <li>one</li>
+ <li>two</li>
+ <li>three
+ <ol data-list-type="decimal">
+ <li>AAA</li>
+ <li>BBB</li>
+ <li>CCC
+ <ol data-list-type="decimal">
+ <li>alpha</li>
+ </ol>
+ </li>
+ </ol>
+ </li>
+ <li>four</li>
+ </ol>
+ <ol data-list-type="decimal">
+ <li>xxx
+ <ol data-list-type="decimal">
+ <li>yyy</li>
+ </ol>
+ </li>
+ </ol>
+ <ul>
+ <li>www
+ <ul>
+ <li>zzz</li>
+ </ul>
+ </li>
+ </ul>
+ </body></html>
+ ''')
+
+
+def test_simple_list():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <ol data-list-type="decimal">
+ <li>One</li>
+ </ol>
+ <ul>
+ <li>two</li>
+ </ul>
+ </body></html>
+ ''')
+
+
+def test_inline_tags():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'inline_tags.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body><p>This sentence has some <strong>bold</strong>, some <em>italics</em> and some <strong>underline</strong>, as well as a <a href="http://www.google.com/">hyperlink</a>.</p></body></html>''') # noqa
+
+
+def test_unicode():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'greek_alphabet.docx',
+ )
+ actual_html = convert(file_path)
+ assert actual_html is not None
+
+
+def test_special_chars():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'special_chars.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body><p>&amp; &lt; &gt; <a href="https://www.google.com/?test=1&amp;more=2">link</a></p></body></html>''') # noqa
+
+
+def test_table_col_row_span():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'table_col_row_span.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <table>
+ <tr>
+ <td colspan="2">AAA</td>
+ </tr>
+ <tr>
+ <td rowspan="2">BBB</td>
+ <td>CCC</td>
+ </tr>
+ <tr>
+ <td>DDD</td>
+ </tr>
+ <tr>
+ <td>EEE</td>
+ <td rowspan="2">FFF</td>
+ </tr>
+ <tr>
+ <td>GGG</td>
+ </tr>
+ </table>
+ <table>
+ <tr>
+ <td>1</td>
+ <td>2</td>
+ <td>3</td>
+ <td>4</td>
+ </tr>
+ <tr>
+ <td>5</td>
+ <td colspan="2" rowspan="2">6</td>
+ <td>7</td>
+ </tr>
+ <tr>
+ <td>8</td>
+ <td>9</td>
+ </tr>
+ <tr>
+ <td>10</td>
+ <td>11</td>
+ <td>12</td>
+ <td>13</td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_nested_table_rowspan():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_table_rowspan.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <table>
+ <tr>
+ <td colspan="2">AAA</td>
+ </tr>
+ <tr>
+ <td>BBB</td>
+ <td>
+ <table>
+ <tr>
+ <td rowspan="2">CCC</td>
+ <td>DDD</td>
+ </tr>
+ <tr>
+ <td>EEE</td>
+ </tr>
+ </table>
+ <br />
+ </td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_nested_tables():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_tables.docx',
+ )
+ actual_html = convert(file_path)
+ # Find out why br tag is there.
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <table>
+ <tr>
+ <td>AAA</td>
+ <td>BBB</td>
+ </tr>
+ <tr>
+ <td>CCC</td>
+ <td>
+ <table>
+ <tr>
+ <td>DDD</td>
+ <td>EEE</td>
+ </tr>
+ <tr>
+ <td>FFF</td>
+ <td>GGG</td>
+ </tr>
+ </table>
+ <br />
+ </td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_list_in_table():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_in_table.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <table>
+ <tr>
+ <td>
+ <ol data-list-type="decimal">
+ <li>AAA</li>
+ <li>BBB</li>
+ <li>CCC</li>
+ </ol>
+ </td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_tables_in_lists():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'tables_in_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <ol data-list-type="decimal">
+ <li>AAA</li>
+ <li>BBB<br />
+ <table>
+ <tr>
+ <td>CCC</td>
+ <td>DDD</td>
+ </tr>
+ <tr>
+ <td>EEE</td>
+ <td>FFF</td>
+ </tr>
+ </table>
+ </li>
+ <li>GGG</li>
+ </ol>
+ </body></html>
+ ''')
+
+
+def test_track_changes_on():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'track_changes_on.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body><p>This was some content.</p></body></html>
+ ''')
+
+
+def test_headers():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>This is an H1</h2>
+ <h3>This is an H2</h3>
+ <h4>This is an H3</h4>
+ <h5>This is an H4</h5>
+ <h6>This is an H5</h6>
+ <h6>This is an H6</h6>
+ <h6>This is an H7</h6>
+ <h6>This is an H8</h6>
+ <h6>This is an H9</h6>
+ <h6>This is an H10</h6>
+ </body></html>
+ ''')
+
+
+def _copy_file_to_tmp_dir(file_path, filename):
+ # Since the images need to be extracted from the docx, copy the file to a
+ # temp directory so we do not clutter up repo.
+ dp = tempfile.mkdtemp()
+ new_file_path = path.join(dp, filename)
+ shutil.copyfile(file_path, new_file_path)
+ return new_file_path, dp
+
+
+def test_split_headers():
+ raise SkipTest('This test is not yet passing')
+ filename = 'split_header.docx'
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'split_header.docx',
+ )
+ # preserve_images must be true in order for the image to not be removed.
+ # This is handled in build_import, however here we need to manually set it
+ # to True.
+ new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+
+ def image_handler(*args, **kwargs):
+ return 'test'
+ actual_html = convert(new_file_path, image_handler=image_handler)
+ assert_html_equal(actual_html, '''
+ <html><body><h2>AAA</h2><p>BBB</p><h2>CCC</h2></body></html>
+ ''')
+
+
+def test_has_image():
+ raise SkipTest('This test is not yet passing')
+ filename = 'has_image.docx'
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+ # preserve_images must be true in order for the image to not be removed.
+ # This is handled in build_import, however here we need to manually set it
+ # to True.
+ new_file_path, dp = _copy_file_to_tmp_dir(file_path, filename)
+
+ actual_html = convert(new_file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <p>AAA<img src="%s/word/media/image1.gif" height="55" width="260" /></p>
+ </body></html>
+ ''' % dp)
+
+
+def test_has_image_using_image_handler():
+ raise SkipTest('This test is not yet passing')
+ filename = 'has_image.docx'
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+ # preserve_images must be true in order for the image to not be removed.
+ # This is handled in build_import, however here we need to manually set it
+ # to True.
+ new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+
+ def image_handler(*args, **kwargs):
+ return 'test'
+ actual_html = convert(new_file_path, image_handler=image_handler)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <p>AAA<img src="test" height="55" width="260" /></p>
+ </body></html>
+ ''')
+
+
+#def test_attachment_is_tiff():
+# filename = 'attachment_is_tiff.docx'
+# file_path = path.join(
+# path.abspath(path.dirname(__file__)),
+# '..',
+# 'fixtures',
+# 'attachment_is_tiff.docx',
+# )
+# # preserve_images must be true in order for the image to not be removed.
+# # This is handled in build_import, however here we need to manually set it
+# # to True.
+# new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+#
+# # First open the file and verify that the image attachment is a tiff.
+# try:
+# zf = ZipFile(new_file_path)
+# # Get the document data.
+# _, meta_data = _get_document_data(zf)
+# finally:
+# zf.close()
+# # Find the path to the image.
+# image_file = None
+# for file_path in meta_data.relationship_dict.values():
+# if file_path.endswith('.gif'):
+# image_file = file_path
+# assert image_file is not None
+# with open(image_file) as f:
+# magic_number = f.read()[:4]
+# # Make sure the image is actually a gif.
+# assert magic_number == 'GIF8'
+
+
+def test_headers_with_full_line_styles():
+ raise SkipTest('This test is not yet passing')
+ # Show that if a natural header is completely bold/italics that
+ # bold/italics will get stripped out.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers_with_full_line_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>AAA</h2>
+ <h2>BBB</h2>
+ <h2><strong>C</strong><em>C</em>C</h2>
+ </body></html>
+ ''')
+
+
+def test_convert_p_to_h():
+ raise SkipTest('This test is not yet passing')
+ # Show when it is correct to convert a p tag to an h tag based on
+ # bold/italics
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'convert_p_to_h.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>AAA</h2>
+ <h2>BBB</h2>
+ <p>CCC</p>
+ <ol data-list-type="decimal">
+ <li><strong>DDD</strong></li>
+ <li><em>EEE</em></li>
+ <li>FFF</li>
+ </ol>
+ <table>
+ <tr>
+ <td><strong>GGG</strong></td>
+ <td><em>HHH</em></td>
+ </tr>
+ <tr>
+ <td>III</td>
+ <td>JJJ</td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+#def test_bigger_font_size_to_header():
+# # Show when it is appropriate to convert p tags to h tags based on font
+# # size.
+# if not DETECT_FONT_SIZE:
+# raise SkipTest('Font size detection is disabled.')
+# file_path = path.join(
+# path.abspath(path.dirname(__file__)),
+# '..',
+# 'fixtures',
+# 'bigger_font_size_to_header.docx',
+# )
+# actual_html = convert(file_path)
+# assert_html_equal(actual_html, '''
+# <html>
+# <p>Paragraphs:</p>
+# <h2>Header</h2>
+# <p>paragraph 1</p>
+# <p>Lists:</p>
+# <ol data-list-type="decimal">
+# <li>bigger</li>
+# <li>smaller</li>
+# </ol>
+# <p>Tables:</p>
+# <table>
+# <tr>
+# <td>bigger</td>
+# <td>smaller</td>
+# </tr>
+# </table>
+# </html>
+# ''')
+
+
+def test_fake_headings_by_length():
+ raise SkipTest('This test is not yet passing')
+ # Show that converting p tags to h tags has a length limit. If the p tag is
+ # supposed to be converted to an h tag but has more than seven words in the
+ # paragraph do not convert it.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'fake_headings_by_length.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>Heading.</h2>
+ <h2>Still a heading.</h2>
+ <p>
+ <strong>This is not a heading because it is too many words.</strong>
+ </p>
+ </body></html>
+ ''')
+
+
+def test_shift_enter():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'shift_enter.docx',
+ )
+
+ # Test just the convert without clean_html to make sure the first
+ # break tag is present.
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <p>AAA<br />BBB</p>
+ <p>CCC</p>
+ <ol data-list-type="decimal">
+ <li>DDD<br />EEE</li>
+ <li>FFF</li>
+ </ol>
+ <table>
+ <tr>
+ <td>GGG<br />HHH</td>
+ <td>III<br />JJJ</td>
+ </tr>
+ <tr>
+ <td>KKK</td>
+ <td>LLL</td>
+ </tr>
+ </table>
+ </body></html>
+ ''')
+
+
+def test_lists_with_styles():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'lists_with_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <ol data-list-type="decimal">
+ <li>AAA</li>
+ <li>BBB
+ <ol data-list-type="lower-roman">
+ <li>CCC</li>
+ <li>DDD
+ <ol data-list-type="upper-alpha">
+ <li>EEE
+ <ol data-list-type="lower-alpha">
+ <li>FFF</li>
+ </ol>
+ </li>
+ </ol>
+ </li>
+ </ol>
+ </li>
+ </ol>
+ </body></html>
+ ''')
+
+
+def test_list_to_header():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_to_header.docx',
+ )
+ actual_html = convert(file_path)
+ # It should be noted that list item `GGG` is upper roman in the word
+ # document to show that only top level upper romans get converted.
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>AAA</h2>
+ <ol data-list-type="decimal">
+ <li>BBB</li>
+ </ol>
+ <h2>CCC</h2>
+ <ol data-list-type="decimal">
+ <li>DDD</li>
+ </ol>
+ <h2>EEE</h2>
+ <ol data-list-type="decimal">
+ <li>FFF
+ <ol data-list-type="upper-roman">
+ <li>GGG</li>
+ </ol>
+ </li>
+ </ol>
+ </body></html>
+ ''')
+
+
+def test_has_title():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_title.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '<html><body><p>Text</p></body></html>')
+
+
+def test_upper_alpha_all_bold():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'upper_alpha_all_bold.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, '''
+ <html><body>
+ <h2>AAA</h2>
+ <h2>BBB</h2>
+ <h2>CCC</h2>
+ </body></html>
+ ''')
+
+
+def _converter(*args, **kwargs):
+ # Having a converter that does nothing is the same as if abiword fails to
+ # convert.
+ pass
+
+
+#def test_converter_broken():
+# file_path = 'test.doc'
+# assert_raises(
+# ConversionFailed,
+# lambda: convert(file_path, converter=_converter),
+# )
+
+
+def test_fall_back():
+ raise SkipTest('This test is not yet passing')
+ file_path = 'test.doc'
+
+ def fall_back(*args, **kwargs):
+ return 'success'
+ html = convert(file_path, fall_back=fall_back, converter=_converter)
+ assert html == 'success'
+
+
+#@mock.patch('docx2html.core.read_html_file')
+#@mock.patch('docx2html.core.get_zip_file_handler')
+#def test_html_files(patch_zip_handler, patch_read):
+def test_html_files():
+ raise SkipTest('This test is not yet passing')
+
+ def raise_assertion(*args, **kwargs):
+ raise AssertionError('Should not have called get_zip_file_handler')
+ #patch_zip_handler.side_effect = raise_assertion
+
+ def return_text(*args, **kwargs):
+ return 'test'
+ #patch_read.side_effect = return_text
+
+ # Try with an html file
+ file_path = 'test.html'
+
+ html = convert(file_path)
+ assert html == 'test'
+
+ # Try again with an htm file.
+ file_path = 'test.htm'
+
+ html = convert(file_path)
+ assert html == 'test'
View
1 requirements.txt
@@ -1 +1,2 @@
beautifulsoup4>=4.1.0
+lxml>2.2.0
View
3 run_tests.sh
@@ -0,0 +1,3 @@
+#! /bin/sh
+
+nosetests --verbose --with-doctest --with-coverage --cover-package pydocx $@

0 comments on commit a7dd82b

Please sign in to comment.