Skip to content

Commit

Permalink
Fix an error when converting invalid html with empty tags to text
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Oct 30, 2018
1 parent 0626088 commit 8d70476
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions src/calibre/utils/html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def name2cp(k):
return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])


unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
Expand Down Expand Up @@ -96,6 +97,7 @@ def replaceEntities(s):
else:
return entityref(s)


r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")


Expand Down Expand Up @@ -150,7 +152,7 @@ def optwrap(text):


def hn(tag):
- if tag[0] == 'h' and len(tag) == 2:
+ if tag and tag[0] == 'h' and len(tag) == 2:
try:
n = int(tag[1])
if n in range(1, 10):
Expand Down Expand Up @@ -364,7 +366,7 @@ def o(self, data, puredata=0, force=0):

if not self.quiet:
if puredata and not self.pre:
- data = re.sub('\s+', ' ', data)
+ data = re.sub(r'\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
Expand Down Expand Up @@ -435,6 +437,7 @@ def html2text_file(html, out=wrapwrite, baseurl=''):
def html2text(html, baseurl=''):
return optwrap(html2text_file(html, None, baseurl))


if __name__ == "__main__":
baseurl = ''
if sys.argv[1:]:
Expand All @@ -461,4 +464,3 @@ def html2text(html, baseurl=''):
else:
data = sys.stdin.read().decode('utf8')
wrapwrite(html2text(data, baseurl))

0 comments on commit 8d70476

Please sign in to comment.