Browse files

Merge pull request #2 from jlward/issue_2

Support python 2.6
  • Loading branch information...
2 parents 73b2116 + 82ae6ef commit 6a624e5ff5272096efcbb80e7aacc2aaaabfe26b @jlward committed Mar 22, 2013
Showing with 104 additions and 53 deletions.
  1. +1 −0 .travis.yml
  2. +97 −50 pydocx/DocxParser.py
  3. +6 −3 pydocx/parsers/Docx2Html.py
View
1 .travis.yml
@@ -1,5 +1,6 @@
language: python
python:
+ - "2.6"
- "2.7"
script: python main.py
install:
View
147 pydocx/DocxParser.py
@@ -2,75 +2,100 @@
import zipfile
import logging
import xml.etree.ElementTree as ElementTree
-from xml.etree.ElementTree import Element
+from xml.etree.ElementTree import _ElementInterface
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("NewParser")
+
def remove_namespaces(document):
    """Return ``document`` re-serialized with XML namespaces stripped.

    ElementTree spells qualified names as ``{uri}local``. Every element tag
    and every attribute key is reduced to the part after the closing brace.
    Names that carry no namespace are kept unchanged instead of raising
    ``IndexError``.

    :param document: XML text accepted by ``ElementTree.fromstring``.
    :returns: the rewritten tree as produced by ``ElementTree.tostring``.
    """
    def local_name(name):
        # "{uri}local" -> "local"; a bare "local" has no "}" and is
        # returned as-is because [-1] then selects the whole string.
        return name.split("}")[-1]

    root = ElementTree.fromstring(document)
    for child in el_iter(root):
        child.tag = local_name(child.tag)
        child.attrib = dict(
            (local_name(k), v)
            for k, v in child.attrib.items()
        )
    return ElementTree.tostring(root)
# Add some helper methods to the ElementTree element class so calling code is slightly more readable
+
def has_child(self, tag):
    """Report whether ``self.find(tag)`` locates an element."""
    match = self.find(tag)
    return match is not None
+
def has_child_all(self, tag):
    """Report whether any descendant of this element matches ``tag``."""
    descendant_path = './/' + tag
    return self.find(descendant_path) is not None
+
def find_all(self, tag):
    """Return the first descendant matching ``tag``, or ``None``.

    Despite the name, only a single element is returned; the search path
    is ``.//<tag>`` passed to ``find``.
    """
    descendant_path = './/' + tag
    return self.find(descendant_path)
+
def findall_all(self, tag):
    """Return every descendant matching ``tag`` via ``findall('.//<tag>')``."""
    descendant_path = './/' + tag
    return self.findall(descendant_path)
-setattr(Element, 'has_child', has_child)
-setattr(Element, 'has_child_all', has_child_all)
-setattr(Element, 'find_all', find_all)
-setattr(Element, 'findall_all', findall_all)
-setattr(Element, 'parent', None)
-setattr(Element, 'parent_list', [])
+
def el_iter(el):
    """Iterate over ``el`` itself followed by all of its descendants.

    Uses ``el.iter()`` when the element provides it. Elements without
    ``iter()`` fall back to ``getiterator()``, which, like ``iter()``,
    includes ``el`` in the result. The earlier ``findall('.//*')``
    fallback returned only the descendants, so the two code paths
    disagreed about whether the starting element was visited.

    :param el: an ElementTree element.
    :returns: an iterable of elements, starting with ``el``.
    """
    try:
        return el.iter()
    except AttributeError:
        # Older ElementTree releases only offer getiterator().
        return el.getiterator()
+
+
+setattr(_ElementInterface, 'has_child', has_child)
+setattr(_ElementInterface, 'has_child_all', has_child_all)
+setattr(_ElementInterface, 'find_all', find_all)
+setattr(_ElementInterface, 'findall_all', findall_all)
+setattr(_ElementInterface, 'parent', None)
+setattr(_ElementInterface, 'parent_list', [])
# End helpers
+
class DocxParser:
__metaclass__ = ABCMeta
def __init__(self, path):
self._parsed = ''
self.in_list = False
- document = ''
- with zipfile.ZipFile(path) as f:
+ f = zipfile.ZipFile(path)
+ try:
self.document_text = f.read('word/document.xml')
try:
self.numbering_text = f.read('word/numbering.xml')
- except:
+ except zipfile.BadZipfile:
pass
try:
- self.comment_text = f.read('word/comments.xml')
- except:
+ self.comment_text = f.read('word/comments.xml')
+ except zipfile.BadZipfile:
pass
+ finally:
+ f.close()
+
+ self.root = ElementTree.fromstring(
+ remove_namespaces(self.document_text),
+ )
- self.root = ElementTree.fromstring(remove_namespaces(self.document_text))
def add_parent(el):
for child in el.getchildren():
setattr(child, 'parent', el)
add_parent(child)
add_parent(self.root)
- def create_parent_list(el, tmp = []):
+
+ def create_parent_list(el, tmp=None):
+ if tmp is None:
+ tmp = []
for child in el:
tmp.append(el)
tmp = create_parent_list(child, tmp)
el.parent_list = tmp[:]
try:
tmp.pop()
except:
- tmp=[]
+ tmp = []
return tmp
create_parent_list(self.root)
@@ -82,7 +107,9 @@ def create_parent_list(el, tmp = []):
self.tables_seen = []
self.visited = []
try:
- self.numbering_root = ElementTree.fromstring(remove_namespaces(self.numbering_text))
+ self.numbering_root = ElementTree.fromstring(
+ remove_namespaces(self.numbering_text),
+ )
except:
pass
self.parse_begin(self.root)
@@ -107,12 +134,21 @@ def parse_lists(self, el):
for i, el in enumerate(p_list):
if not list_started and el.has_child_all('ilvl'):
list_started = True
- list_type = self.get_list_style(el.find_all('numId').attrib['val'])
+ list_type = self.get_list_style(
+ el.find_all('numId').attrib['val'],
+ )
list_chunks.append(p_list[index_start:index_end])
index_start = i
index_end = i+1
- elif list_started and el.has_child_all('ilvl') and not list_type == self.get_list_style(el.find_all('numId').attrib['val']):
- list_type = self.get_list_style(el.find_all('numId').attrib['val'])
+ elif (
+ list_started and
+ el.has_child_all('ilvl') and
+ not list_type == self.get_list_style(
+ el.find_all('numId').attrib['val']
+ )):
+ list_type = self.get_list_style(
+ el.find_all('numId').attrib['val'],
+ )
list_started = True
list_chunks.append(p_list[index_start:index_end])
index_start = i
@@ -130,7 +166,9 @@ def parse_lists(self, el):
for el in chunk:
chunk_parsed += self.parse(el)
if chunk[0].has_child_all('ilvl'):
- lst_style = self.get_list_style(chunk[0].find_all('numId').attrib['val'])
+ lst_style = self.get_list_style(
+ chunk[0].find_all('numId').attrib['val'],
+ )
if lst_style['val'] == 'bullet':
parsed += self.unordered_list(chunk_parsed)
else:
@@ -145,8 +183,13 @@ def parse_lists(self, el):
def parse(self, el):
parsed = ''
if not self.ignore_current:
- tmp_d = {tmpel.tag:i for i, tmpel in enumerate(el.parent_list)}
- if 'tbl' in tmp_d and el.parent_list[tmp_d['tbl']] not in self.tables_seen:
+ tmp_d = dict(
+ (tmpel.tag, i)
+ for i, tmpel in enumerate(el.parent_list)
+ )
+ if (
+ 'tbl' in tmp_d and
+ el.parent_list[tmp_d['tbl']] not in self.tables_seen):
self.ignore_current = True
self.tables_seen.append(el.parent_list[tmp_d['tbl']])
tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
@@ -156,14 +199,11 @@ def parse(self, el):
for child in el:
parsed += self.parse(child)
-
- if el.tag == 'commentReference':
- id = el.attrib['id']
- #TODO div for comment reference and styling
if el.tag == 'br' and el.attrib['type'] == 'page':
#TODO figure out what parsed is getting overwritten
return self.page_break()
- if el.tag == 'ilvl' and el not in self.visited: #add it to the list so we don't repeat!
+ # add it to the list so we don't repeat!
+ if el.tag == 'ilvl' and el not in self.visited:
self.in_list = True
self.visited.append(el)
## This starts the returns
@@ -187,14 +227,14 @@ def parse_p(self, el, text):
if self.in_list:
self.in_list = False
parsed = self.list_element(parsed)
- elif not el.has_child_all('t') and 'tbl' not in [i.tag for i in el.parent_list]:
+ elif (
+ not el.has_child_all('t') and
+ 'tbl' not in [i.tag for i in el.parent_list]):
parsed = self.linebreak()
elif el.parent not in self.elements:
parsed = self.paragraph(parsed)
return parsed
-
-
def parse_r(self, el):
is_deleted = False
text = None
@@ -215,7 +255,7 @@ def parse_r(self, el):
fns.append(self.underline)
for fn in fns:
text = fn(text)
- ppr = el.parent.find ('pPr')
+ ppr = el.parent.find('pPr')
if ppr is not None:
jc = ppr.find('jc')
if jc is not None:
@@ -242,33 +282,40 @@ def parse_r(self, el):
firstLine = str(firstLine)
text = self.indent(text, right, left, firstLine)
if is_deleted:
- text = self.deletion(text,'','')
+ text = self.deletion(text, '', '')
return text
else:
return ''
def get_list_style(self, numval):
    """Look up the ``numFmt`` attributes for the list numbered ``numval``.

    Finds the ``num`` element in ``self.numbering_root`` whose ``numId``
    equals ``numval``, follows its ``abstractNumId`` to the matching
    ``abstractNum`` element, and returns the attribute mapping of the first
    ``numFmt`` found beneath it. Returns ``None`` when nothing matches.
    """
    numbering = self.numbering_root
    for num in numbering.findall_all('num'):
        if num.attrib['numId'] != numval:
            continue
        abstract_id = num.find('abstractNumId').attrib['val']
        for abstract_num in numbering.findall_all('abstractNum'):
            if abstract_num.attrib['abstractNumId'] != abstract_id:
                continue
            for node in el_iter(abstract_num):
                num_fmt = node.find('numFmt')
                if num_fmt is not None:
                    return num_fmt.attrib
def get_comments(self, doc_id):
    """Return the stored author/date/text record for comment ``doc_id``.

    On the first call (while ``self.comment_store`` is ``None``) the
    namespace-stripped ``self.comment_text`` is parsed and every ``comment``
    element is indexed by its ``id`` attribute. Later calls reuse that
    mapping. An unknown ``doc_id`` raises ``KeyError`` from the lookup.
    """
    if self.comment_store is None:
        # TODO throw appropriate error
        comment_root = ElementTree.fromstring(
            remove_namespaces(self.comment_text),
        )
        self.comment_store = dict(
            (
                comment.attrib['id'],
                {
                    "author": comment.attrib['author'],
                    "date": comment.attrib['date'],
                    # Only the first text run of the comment is kept.
                    "text": comment.findall_all('t')[0].text,
                },
            )
            for comment in comment_root.findall_all('comment')
        )
    return self.comment_store[doc_id]
@@ -305,7 +352,7 @@ def italics(self, text):
return text
@abstractmethod
- def underline(self,text):
+ def underline(self, text):
return text
@abstractmethod
@@ -321,7 +368,7 @@ def unordered_list(self, text):
return text
@abstractmethod
- def list_element(self,text):
+ def list_element(self, text):
return text
@abstractmethod
@@ -341,15 +388,15 @@ def page_break(self):
return True
@abstractmethod
- def right_justify(self,text):
+ def right_justify(self, text):
return text
@abstractmethod
- def center_justify(self,text):
+ def center_justify(self, text):
return text
@abstractmethod
- def indent(self, text, left = None, right = None, firstLine = None):
+ def indent(self, text, left=None, right=None, firstLine=None):
return text
- #TODO JUSTIFIED JUSTIFIED TEXT
+ #TODO JUSTIFIED JUSTIFIED TEXT
View
9 pydocx/parsers/Docx2Html.py
@@ -14,8 +14,8 @@ def parsed(self):
'<html><head><style>.insert{{color:red}}.delete'
'{{color:red; text-decoration:line-through}}.center'
'{{text-align:center}}.right{{text-align:right}}'
- '</style></head><body>{}</body></html>'
- ).format(self._parsed)
+ '</style></head><body>{content}</body></html>'
+ ).format(content=self._parsed)
def escape(self, text):
return xml.sax.saxutils.quoteattr(text)[1:-1]
@@ -80,4 +80,7 @@ def right_justify(self, text):
return "<div class = 'right'>" + text + '</div>'
def indent(self, text, right, left, firstLine):
- return "<div style = 'margin-left:{}pt'>{}</div>".format(left, text)
+ return "<div style = 'margin-left:{left}pt'>{text}</div>".format(
+ left=left,
+ text=text,
+ )

0 comments on commit 6a624e5

Please sign in to comment.