Skip to content

Commit

Permalink
Use 3k version and patch it for python 2.x
Browse files Browse the repository at this point in the history
  • Loading branch information
jedie committed Jun 11, 2012
1 parent 7dd570e commit ddf7a20
Showing 1 changed file with 161 additions and 47 deletions.
208 changes: 161 additions & 47 deletions creole/shared/HTMLParsercompat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,28 @@
"""A parser for HTML and XHTML.
Original "v2.7.3 final" version from:
http://hg.python.org/cpython/file/70274d53c1dd/Lib/HTMLParser.py
"""
Patched version of the original from:
http://hg.python.org/cpython/file/tip/Lib/html/parser.py
compare:
http://hg.python.org/cpython/file/2.7/Lib/HTMLParser.py
http://hg.python.org/cpython/file/3.2/Lib/html/parser.py
e.g.:
cd /tmp/
wget http://hg.python.org/cpython/raw-file/2.7/Lib/HTMLParser.py
wget http://hg.python.org/cpython/raw-file/3.2/Lib/html/parser.py
meld HTMLParser.py parser.py
Make it compatible with Python 2.x and 3.x
More info see html_parser.py !
"""

# ------------------------------------------------------------------- add start
from __future__ import division, absolute_import, print_function, unicode_literals
from creole.py3compat import PY3
# --------------------------------------------------------------------- add end

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

Expand All @@ -11,7 +32,12 @@
# and CDATA (character data -- only end tags are special).


import markupbase
# --------------------------------------------------------------- changes start
try:
import _markupbase # python 3
except ImportError:
import markupbase as _markupbase # python 2
# --------------------------------------------------------------- changes end
import re

# Regular expressions used for parsing
Expand All @@ -25,16 +51,37 @@
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# Note:
# 1) the strict attrfind isn't really strict, but we can't make it
# correctly strict without breaking backward compatibility;
# 2) if you change attrfind remember to update locatestarttagend too;
# 3) if you change attrfind and/or locatestarttagend the parser will
# explode, so don't do it.
attrfind = re.compile(
r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
Expand All @@ -43,6 +90,7 @@
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
Expand Down Expand Up @@ -72,7 +120,7 @@ def __str__(self):
return result


class HTMLParser(markupbase.ParserBase):
class HTMLParser(_markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
Usage:
Expand All @@ -94,9 +142,15 @@ class HTMLParser(markupbase.ParserBase):

CDATA_CONTENT_ELEMENTS = ("script", "style")

def __init__(self, strict=True):
"""Initialize and reset this instance.
def __init__(self):
"""Initialize and reset this instance."""
If strict is set to True (the default), errors are raised when invalid
HTML is encountered. If set to False, an attempt is instead made to
continue parsing, making "best guesses" about the intended meaning, in
a fashion similar to what browsers typically do.
"""
self.strict = strict
self.reset()

def reset(self):
Expand All @@ -105,7 +159,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
markupbase.ParserBase.reset(self)
_markupbase.ParserBase.reset(self)

def feed(self, data):
r"""Feed data to the parser.
Expand Down Expand Up @@ -166,7 +220,10 @@ def goahead(self, end):
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
if self.strict:
k = self.parse_declaration(i)
else:
k = self.parse_html_declaration(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
Expand All @@ -175,6 +232,8 @@ def goahead(self, end):
if k < 0:
if not end:
break
if self.strict:
self.error("EOF in middle of construct")
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
Expand Down Expand Up @@ -213,7 +272,12 @@ def goahead(self, end):
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
self.error("EOF in middle of entity or char ref")
if self.strict:
self.error("EOF in middle of entity or char ref")
else:
if k <= i:
k = n
i = self.updatepos(i, i + 1)
# incomplete
break
elif (i + 1) < n:
Expand Down Expand Up @@ -292,10 +356,12 @@ def parse_starttag(self, i):
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()

self.lasttag = tag = match.group(1).lower()
while k < endpos:
m = attrfind.match(rawdata, k)
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
Expand All @@ -318,6 +384,9 @@ def parse_starttag(self, i):
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
Expand All @@ -333,7 +402,10 @@ def parse_starttag(self, i):
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
rawdata = self.rawdata
m = locatestarttagend.match(rawdata, i)
if self.strict:
m = locatestarttagend.match(rawdata, i)
else:
m = locatestarttagend_tolerant.match(rawdata, i)
if m:
j = m.end()
next = rawdata[j:j+1]
Expand All @@ -346,8 +418,13 @@ def check_for_whole_start_tag(self, i):
# buffer boundary
return -1
# else bogus input
self.updatepos(i, j + 1)
self.error("malformed empty start tag")
if self.strict:
self.updatepos(i, j + 1)
self.error("malformed empty start tag")
if j > i:
return j
else:
return i + 1
if next == "":
# end of input
return -1
Expand All @@ -356,6 +433,9 @@ def check_for_whole_start_tag(self, i):
# end of input in or before attribute value, or we have the
# '/' from a '/>' ending
return -1
if self.strict:
self.updatepos(i, j)
self.error("malformed start tag")
if j > i:
return j
else:
Expand All @@ -375,6 +455,8 @@ def parse_endtag(self, i):
if self.cdata_elem is not None:
self.handle_data(rawdata[i:gtpos])
return gtpos
if self.strict:
self.error("bad end tag: %r" % (rawdata[i:gtpos],))
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
namematch = tagfind_tolerant.match(rawdata, i+2)
if not namematch:
Expand All @@ -398,7 +480,7 @@ def parse_endtag(self, i):
self.handle_data(rawdata[i:gtpos])
return gtpos

self.handle_endtag(elem)
self.handle_endtag(elem.lower())
self.clear_cdata_mode()
return gtpos

Expand Down Expand Up @@ -440,36 +522,68 @@ def handle_pi(self, data):
pass

def unknown_decl(self, data):
pass
if self.strict:
self.error("unknown declaration: %r" % (data,))

# Internal -- helper to remove special character quoting
entitydefs = None
def unescape(self, s):
if '&' not in s:
return s
def replaceEntities(s):
s = s.groups()[0]
try:
if s[0] == "#":
s = s[1:]
if s[0] in ['x','X']:
c = int(s[1:], 16)
else:
c = int(s)
return unichr(c)
except ValueError:
return '&#'+s+';'
else:
# Cannot use name2codepoint directly, because HTMLParser supports apos,
# which is not part of HTML 4
import htmlentitydefs
if HTMLParser.entitydefs is None:
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
for k, v in htmlentitydefs.name2codepoint.iteritems():
entitydefs[k] = unichr(v)
# -------------------------------------------------------- change start
if PY3:
def replaceEntities(s):
s = s.groups()[0]
try:
return self.entitydefs[s]
except KeyError:
return '&'+s+';'

return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
if s[0] == "#":
s = s[1:]
if s[0] in ['x','X']:
c = int(s[1:], 16)
else:
c = int(s)
return chr(c)
except ValueError:
return '&#'+ s +';'
else:
# Cannot use name2codepoint directly, because HTMLParser
# supports apos, which is not part of HTML 4
import html.entities
if HTMLParser.entitydefs is None:
entitydefs = HTMLParser.entitydefs = {'apos':"'"}
for k, v in html.entities.name2codepoint.items():
entitydefs[k] = chr(v)
try:
return self.entitydefs[s]
except KeyError:
return '&'+s+';'

return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
replaceEntities, s, flags=re.ASCII)
else:
def replaceEntities(s):
s = s.groups()[0]
try:
if s[0] == "#":
s = s[1:]
if s[0] in ['x','X']:
c = int(s[1:], 16)
else:
c = int(s)
return unichr(c)
except ValueError:
return '&#'+s+';'
else:
# Cannot use name2codepoint directly, because HTMLParser supports apos,
# which is not part of HTML 4
import htmlentitydefs
if HTMLParser.entitydefs is None:
entitydefs = HTMLParser.entitydefs = {'apos':"'"}
for k, v in htmlentitydefs.name2codepoint.iteritems():
entitydefs[k] = unichr(v)
try:
return self.entitydefs[s]
except KeyError:
return '&'+s+';'

return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
# -------------------------------------------------------- change end

0 comments on commit ddf7a20

Please sign in to comment.