Skip to content

Commit

Permalink
prefer HTML declared encoding over HTTP declared encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
jaesivsm committed Nov 25, 2016
1 parent 2429b4b commit f1d90c4
Showing 1 changed file with 27 additions and 7 deletions.
34 changes: 27 additions & 7 deletions src/lib/html_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from lib.utils import jarr_get, rebuild_url

CHARSET_TAG = b'<meta charset='
FEED_MIMETYPES = ('application/atom+xml', 'application/rss+xml',
'application/rdf+xml', 'application/xml', 'text/xml')

Expand All @@ -27,14 +28,33 @@ def try_get_icon_url(url, *splits):
return None


def _meta_w_charset(elem):
return elem.name == 'meta' and 'charset' in elem.attrs


def _extract_charset(content, strainer):
parsed = BeautifulSoup(content, 'html.parser', parse_only=strainer)
for meta in parsed.find_all(_meta_w_charset):
return meta.attrs['charset']


def _try_encodings(content, *encodings):
for encoding in encodings:
try:
return content.decode(encoding)
except Exception:
pass
return content.decode('utf8', 'ignore')


@lru_cache(maxsize=None)
def get_soup(content, encoding):
stype = 'html.parser'
try:
head_end = content.find(b'</head>')
return BeautifulSoup(content[:head_end].decode(encoding), stype)
except Exception:
return BeautifulSoup(content, stype, parse_only=SoupStrainer('head'))
def get_soup(content, header_encoding='utf8'):
strainer = SoupStrainer('head')
if not isinstance(content, str):
encodings = [_extract_charset(content, strainer), header_encoding] \
if CHARSET_TAG in content else [header_encoding]
content = _try_encodings(content, encodings)
return BeautifulSoup(content, 'html.parser', parse_only=strainer)


def extract_title(response, og_prop='og;title'):
Expand Down

0 comments on commit f1d90c4

Please sign in to comment.