/
BeautifulSoup.py
34 lines (24 loc) · 1.03 KB
/
BeautifulSoup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
import bs4
from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruction, SoupStrainer, Tag, __version__ # noqa
def parse_html(markup):
    """Normalise *markup* and parse it with html5_parser's soup builder.

    ``str`` input has its encoding declarations stripped and its entities
    substituted. Any other input is handed to ``xml_to_unicode`` (with
    ``strip_encoding_pats=True`` and ``resolve_entities=True``) and the first
    element of its result is used. The text is then passed through
    ``clean_xml_chars`` before parsing.

    Returns whatever ``html5_parser.soup.parse(markup, return_root=False)``
    returns (presumably the soup object rather than the root tag -- confirm
    against the html5_parser documentation).
    """
    # Imports are kept function-local, as in the original, so importing this
    # module does not pull in the calibre helpers or html5_parser.
    from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode
    from calibre.utils.cleantext import clean_xml_chars

    if isinstance(markup, str):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    # Applied on both paths, after the input has been normalised above.
    markup = clean_xml_chars(markup)

    from html5_parser.soup import parse
    return parse(markup, return_root=False)
def prettify(soup):
    """Return ``soup.prettify()`` as text.

    If the underlying ``prettify()`` call yields ``bytes``, the result is
    decoded as UTF-8; any other return value is passed through unchanged.
    """
    ans = soup.prettify()
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8')
    return ans
def BeautifulSoup(markup='', *a, **kw):
    """Parse *markup* via ``parse_html`` and return the result.

    Extra positional and keyword arguments are accepted but ignored; only
    *markup* is forwarded.
    """
    return parse_html(markup)
def BeautifulStoneSoup(markup='', *a, **kw):
    """Parse *markup* with ``bs4.BeautifulSoup`` using the ``'xml'`` parser.

    Extra positional and keyword arguments are accepted but ignored; only
    *markup* is forwarded.
    """
    return bs4.BeautifulSoup(markup, 'xml')