Skip to content
This repository
Browse code

Use the Python 3k version and patch it for Python 2.x

  • Loading branch information...
commit ddf7a201b5a6faf81d63cbc7f216a280a40161e3 1 parent 7dd570e
Jens Diemer authored June 11, 2012

Showing 1 changed file with 161 additions and 47 deletions. Show diff stats Hide diff stats

  1. 208  creole/shared/HTMLParsercompat.py
208  creole/shared/HTMLParsercompat.py
... ...
@@ -1,7 +1,28 @@
1  
-"""A parser for HTML and XHTML.
2  
-Original "v2.7.3 final" version from:
3  
-    http://hg.python.org/cpython/file/70274d53c1dd/Lib/HTMLParser.py
4 1
 """
  2
+Patched version of the original from:
  3
+    http://hg.python.org/cpython/file/tip/Lib/html/parser.py
  4
+    
  5
+compare:
  6
+    http://hg.python.org/cpython/file/2.7/Lib/HTMLParser.py
  7
+    http://hg.python.org/cpython/file/3.2/Lib/html/parser.py
  8
+
  9
+e.g.:
  10
+    cd /tmp/
  11
+    wget http://hg.python.org/cpython/raw-file/2.7/Lib/HTMLParser.py
  12
+    wget http://hg.python.org/cpython/raw-file/3.2/Lib/html/parser.py
  13
+    meld HTMLParser.py parser.py
  14
+
  15
+Make it compatible with Python 2.x and 3.x
  16
+    
  17
+For more info, see html_parser.py!
  18
+"""
  19
+
  20
+# ------------------------------------------------------------------- add start
  21
+from __future__ import division, absolute_import, print_function, unicode_literals
  22
+from creole.py3compat import PY3
  23
+# --------------------------------------------------------------------- add end
  24
+
  25
+"""A parser for HTML and XHTML."""
5 26
 
6 27
 # This file is based on sgmllib.py, but the API is slightly different.
7 28
 
@@ -11,7 +32,12 @@
11 32
 # and CDATA (character data -- only end tags are special).
12 33
 
13 34
 
14  
-import markupbase
  35
+# --------------------------------------------------------------- changes start
  36
+try:
  37
+    import _markupbase # python 3
  38
+except ImportError:
  39
+    import markupbase as _markupbase # python 2
  40
+# --------------------------------------------------------------- changes end
15 41
 import re
16 42
 
17 43
 # Regular expressions used for parsing
@@ -25,17 +51,38 @@
25 51
 starttagopen = re.compile('<[a-zA-Z]')
26 52
 piclose = re.compile('>')
27 53
 commentclose = re.compile(r'--\s*>')
28  
-tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  54
+tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
29 55
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
30 56
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
31 57
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
32  
-
  58
+# Note:
  59
+#  1) the strict attrfind isn't really strict, but we can't make it
  60
+#     correctly strict without breaking backward compatibility;
  61
+#  2) if you change attrfind remember to update locatestarttagend too;
  62
+#  3) if you change attrfind and/or locatestarttagend the parser will
  63
+#     explode, so don't do it.
33 64
 attrfind = re.compile(
34  
-    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
  65
+    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  66
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
  67
+attrfind_tolerant = re.compile(
  68
+    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
35 69
     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
36  
-
37 70
 locatestarttagend = re.compile(r"""
38 71
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  72
+  (?:\s+                             # whitespace before attribute name
  73
+    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  74
+      (?:\s*=\s*                     # value indicator
  75
+        (?:'[^']*'                   # LITA-enclosed value
  76
+          |\"[^\"]*\"                # LIT-enclosed value
  77
+          |[^'\">\s]+                # bare value
  78
+         )
  79
+       )?
  80
+     )
  81
+   )*
  82
+  \s*                                # trailing whitespace
  83
+""", re.VERBOSE)
  84
+locatestarttagend_tolerant = re.compile(r"""
  85
+  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
39 86
   (?:[\s/]*                          # optional whitespace before attribute name
40 87
     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
41 88
       (?:\s*=+\s*                    # value indicator
@@ -43,6 +90,7 @@
43 90
           |"[^"]*"                   # LIT-enclosed value
44 91
           |(?!['"])[^>\s]*           # bare value
45 92
          )
  93
+         (?:\s*,)*                   # possibly followed by a comma
46 94
        )?(?:\s|/(?!>))*
47 95
      )*
48 96
    )?
@@ -72,7 +120,7 @@ def __str__(self):
72 120
         return result
73 121
 
74 122
 
75  
-class HTMLParser(markupbase.ParserBase):
  123
+class HTMLParser(_markupbase.ParserBase):
76 124
     """Find tags and other markup and call handler functions.
77 125
 
78 126
     Usage:
@@ -94,9 +142,15 @@ class HTMLParser(markupbase.ParserBase):
94 142
 
95 143
     CDATA_CONTENT_ELEMENTS = ("script", "style")
96 144
 
  145
+    def __init__(self, strict=True):
  146
+        """Initialize and reset this instance.
97 147
 
98  
-    def __init__(self):
99  
-        """Initialize and reset this instance."""
  148
+        If strict is set to True (the default), errors are raised when invalid
  149
+        HTML is encountered.  If set to False, an attempt is instead made to
  150
+        continue parsing, making "best guesses" about the intended meaning, in
  151
+        a fashion similar to what browsers typically do.
  152
+        """
  153
+        self.strict = strict
100 154
         self.reset()
101 155
 
102 156
     def reset(self):
@@ -105,7 +159,7 @@ def reset(self):
105 159
         self.lasttag = '???'
106 160
         self.interesting = interesting_normal
107 161
         self.cdata_elem = None
108  
-        markupbase.ParserBase.reset(self)
  162
+        _markupbase.ParserBase.reset(self)
109 163
 
110 164
     def feed(self, data):
111 165
         r"""Feed data to the parser.
@@ -166,7 +220,10 @@ def goahead(self, end):
166 220
                 elif startswith("<?", i):
167 221
                     k = self.parse_pi(i)
168 222
                 elif startswith("<!", i):
169  
-                    k = self.parse_html_declaration(i)
  223
+                    if self.strict:
  224
+                        k = self.parse_declaration(i)
  225
+                    else:
  226
+                        k = self.parse_html_declaration(i)
170 227
                 elif (i + 1) < n:
171 228
                     self.handle_data("<")
172 229
                     k = i + 1
@@ -175,6 +232,8 @@ def goahead(self, end):
175 232
                 if k < 0:
176 233
                     if not end:
177 234
                         break
  235
+                    if self.strict:
  236
+                        self.error("EOF in middle of construct")
178 237
                     k = rawdata.find('>', i + 1)
179 238
                     if k < 0:
180 239
                         k = rawdata.find('<', i + 1)
@@ -213,7 +272,12 @@ def goahead(self, end):
213 272
                 if match:
214 273
                     # match.group() will contain at least 2 chars
215 274
                     if end and match.group() == rawdata[i:]:
216  
-                        self.error("EOF in middle of entity or char ref")
  275
+                        if self.strict:
  276
+                            self.error("EOF in middle of entity or char ref")
  277
+                        else:
  278
+                            if k <= i:
  279
+                                k = n
  280
+                            i = self.updatepos(i, i + 1)
217 281
                     # incomplete
218 282
                     break
219 283
                 elif (i + 1) < n:
@@ -292,10 +356,12 @@ def parse_starttag(self, i):
292 356
         match = tagfind.match(rawdata, i+1)
293 357
         assert match, 'unexpected call to parse_starttag()'
294 358
         k = match.end()
295  
-        self.lasttag = tag = rawdata[i+1:k].lower()
296  
-
  359
+        self.lasttag = tag = match.group(1).lower()
297 360
         while k < endpos:
298  
-            m = attrfind.match(rawdata, k)
  361
+            if self.strict:
  362
+                m = attrfind.match(rawdata, k)
  363
+            else:
  364
+                m = attrfind_tolerant.match(rawdata, k)
299 365
             if not m:
300 366
                 break
301 367
             attrname, rest, attrvalue = m.group(1, 2, 3)
@@ -318,6 +384,9 @@ def parse_starttag(self, i):
318 384
                          - self.__starttag_text.rfind("\n")
319 385
             else:
320 386
                 offset = offset + len(self.__starttag_text)
  387
+            if self.strict:
  388
+                self.error("junk characters in start tag: %r"
  389
+                           % (rawdata[k:endpos][:20],))
321 390
             self.handle_data(rawdata[i:endpos])
322 391
             return endpos
323 392
         if end.endswith('/>'):
@@ -333,7 +402,10 @@ def parse_starttag(self, i):
333 402
     # or -1 if incomplete.
334 403
     def check_for_whole_start_tag(self, i):
335 404
         rawdata = self.rawdata
336  
-        m = locatestarttagend.match(rawdata, i)
  405
+        if self.strict:
  406
+            m = locatestarttagend.match(rawdata, i)
  407
+        else:
  408
+            m = locatestarttagend_tolerant.match(rawdata, i)
337 409
         if m:
338 410
             j = m.end()
339 411
             next = rawdata[j:j+1]
@@ -346,8 +418,13 @@ def check_for_whole_start_tag(self, i):
346 418
                     # buffer boundary
347 419
                     return -1
348 420
                 # else bogus input
349  
-                self.updatepos(i, j + 1)
350  
-                self.error("malformed empty start tag")
  421
+                if self.strict:
  422
+                    self.updatepos(i, j + 1)
  423
+                    self.error("malformed empty start tag")
  424
+                if j > i:
  425
+                    return j
  426
+                else:
  427
+                    return i + 1
351 428
             if next == "":
352 429
                 # end of input
353 430
                 return -1
@@ -356,6 +433,9 @@ def check_for_whole_start_tag(self, i):
356 433
                 # end of input in or before attribute value, or we have the
357 434
                 # '/' from a '/>' ending
358 435
                 return -1
  436
+            if self.strict:
  437
+                self.updatepos(i, j)
  438
+                self.error("malformed start tag")
359 439
             if j > i:
360 440
                 return j
361 441
             else:
@@ -375,6 +455,8 @@ def parse_endtag(self, i):
375 455
             if self.cdata_elem is not None:
376 456
                 self.handle_data(rawdata[i:gtpos])
377 457
                 return gtpos
  458
+            if self.strict:
  459
+                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
378 460
             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
379 461
             namematch = tagfind_tolerant.match(rawdata, i+2)
380 462
             if not namematch:
@@ -398,7 +480,7 @@ def parse_endtag(self, i):
398 480
                 self.handle_data(rawdata[i:gtpos])
399 481
                 return gtpos
400 482
 
401  
-        self.handle_endtag(elem)
  483
+        self.handle_endtag(elem.lower())
402 484
         self.clear_cdata_mode()
403 485
         return gtpos
404 486
 
@@ -440,36 +522,68 @@ def handle_pi(self, data):
440 522
         pass
441 523
 
442 524
     def unknown_decl(self, data):
443  
-        pass
  525
+        if self.strict:
  526
+            self.error("unknown declaration: %r" % (data,))
444 527
 
445 528
     # Internal -- helper to remove special character quoting
446 529
     entitydefs = None
447 530
     def unescape(self, s):
448 531
         if '&' not in s:
449 532
             return s
450  
-        def replaceEntities(s):
451  
-            s = s.groups()[0]
452  
-            try:
453  
-                if s[0] == "#":
454  
-                    s = s[1:]
455  
-                    if s[0] in ['x','X']:
456  
-                        c = int(s[1:], 16)
457  
-                    else:
458  
-                        c = int(s)
459  
-                    return unichr(c)
460  
-            except ValueError:
461  
-                return '&#'+s+';'
462  
-            else:
463  
-                # Cannot use name2codepoint directly, because HTMLParser supports apos,
464  
-                # which is not part of HTML 4
465  
-                import htmlentitydefs
466  
-                if HTMLParser.entitydefs is None:
467  
-                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
468  
-                    for k, v in htmlentitydefs.name2codepoint.iteritems():
469  
-                        entitydefs[k] = unichr(v)
  533
+        # -------------------------------------------------------- change start
  534
+        if PY3:
  535
+            def replaceEntities(s):
  536
+                s = s.groups()[0]
470 537
                 try:
471  
-                    return self.entitydefs[s]
472  
-                except KeyError:
473  
-                    return '&'+s+';'
474  
-
475  
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
  538
+                    if s[0] == "#":
  539
+                        s = s[1:]
  540
+                        if s[0] in ['x','X']:
  541
+                            c = int(s[1:], 16)
  542
+                        else:
  543
+                            c = int(s)
  544
+                        return chr(c)
  545
+                except ValueError:
  546
+                    return '&#'+ s +';'
  547
+                else:
  548
+                    # Cannot use name2codepoint directly, because HTMLParser
  549
+                    # supports apos, which is not part of HTML 4
  550
+                    import html.entities
  551
+                    if HTMLParser.entitydefs is None:
  552
+                        entitydefs = HTMLParser.entitydefs = {'apos':"'"}
  553
+                        for k, v in html.entities.name2codepoint.items():
  554
+                            entitydefs[k] = chr(v)
  555
+                    try:
  556
+                        return self.entitydefs[s]
  557
+                    except KeyError:
  558
+                        return '&'+s+';'
  559
+    
  560
+            return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
  561
+                          replaceEntities, s, flags=re.ASCII)
  562
+        else:
  563
+            def replaceEntities(s):
  564
+                s = s.groups()[0]
  565
+                try:
  566
+                    if s[0] == "#":
  567
+                        s = s[1:]
  568
+                        if s[0] in ['x','X']:
  569
+                            c = int(s[1:], 16)
  570
+                        else:
  571
+                            c = int(s)
  572
+                        return unichr(c)
  573
+                except ValueError:
  574
+                    return '&#'+s+';'
  575
+                else:
  576
+                    # Cannot use name2codepoint directly, because HTMLParser supports apos,
  577
+                    # which is not part of HTML 4
  578
+                    import htmlentitydefs
  579
+                    if HTMLParser.entitydefs is None:
  580
+                        entitydefs = HTMLParser.entitydefs = {'apos':"'"}
  581
+                        for k, v in htmlentitydefs.name2codepoint.iteritems():
  582
+                            entitydefs[k] = unichr(v)
  583
+                    try:
  584
+                        return self.entitydefs[s]
  585
+                    except KeyError:
  586
+                        return '&'+s+';'
  587
+    
  588
+            return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
  589
+        # -------------------------------------------------------- change end        

0 notes on commit ddf7a20

Please sign in to comment.
Something went wrong with that request. Please try again.