Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

refs #3: got the simple test passing

  • Loading branch information...
commit bdf3c796909bdc8ed74e167fd84516aabb7bfa94 1 parent d4012ea
@jlward authored
Showing with 13 additions and 24 deletions.
  1. +13 −5 pydocx/DocxParser.py
  2. +0 −19 pydocx/tests/test_docx.py
View
18 pydocx/DocxParser.py
@@ -191,9 +191,16 @@ def parse(self, el):
'tbl' in tmp_d and
el.parent_list[tmp_d['tbl']] not in self.tables_seen):
self.ignore_current = True
- self.tables_seen.append(el.parent_list[tmp_d['tbl']])
- tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
+ tbl = el.parent_list[tmp_d['tbl']]
+ self.tables_seen.append(tbl)
+ tmpout = self.table(self.parse(tbl))
self.ignore_current = False
+
+ # Need to keep track of visited trs and tcs
+ self.visited.extend(
+ e for e in el_iter(tbl)
+ if e.tag in ['tr', 'tc']
+ )
return tmpout
for child in el:
@@ -202,14 +209,15 @@ def parse(self, el):
if el.tag == 'br' and el.attrib.get('type') == 'page':
#TODO figure out what parsed is getting overwritten
return self.page_break()
- # add it to the list so we don't repeat!
+ # Add it to the list so we don't repeat!
if el.tag == 'ilvl' and el not in self.visited:
self.in_list = True
self.visited.append(el)
## This starts the returns
- elif el.tag == 'tr':
+ # Do not do the tr or tc a second time
+ elif el.tag == 'tr' and el not in self.visited:
return self.table_row(parsed)
- elif el.tag == 'tc':
+ elif el.tag == 'tc' and el not in self.visited:
self.elements.append(el)
return self.table_cell(parsed)
if el.tag == 'r' and el not in self.elements:
View
19 pydocx/tests/test_docx.py
@@ -76,25 +76,6 @@ def test_extract_html():
</tr>
</table>
</body></html>
-
- <html><body>
- <p>Simple text</p>
- <ol data-list-type="decimal">
- <li>one</li>
- <li>two</li>
- <li>three</li>
- </ol>
- <table>
- <tr>
- <td>Cell1</td>
- <td>Cell2</td>
- </tr>
- <tr>
- <td>Cell3</td>
- <td>cell4</td>
- </tr>
- </table>
- <tr><td></td><td></td></tr><tr><td></td><td></td></tr></body></html>
''')
Please sign in to comment.
Something went wrong with that request. Please try again.