This repository has been archived by the owner on Mar 23, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
html.py
101 lines (80 loc) · 2.9 KB
/
html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
from bs4 import BeautifulSoup, UnicodeDammit
import lxml.html
import lxml.html.clean
from lxml import etree
def _extract_links(html):
    """Replace every ``<a href>`` with a text marker and append a URL index.

    Each link ``<a href="U">T</a>`` becomes ``<span>T [n]</span>``, and a
    trailing index of ``[n] T: U`` lines is appended to the document, so the
    URLs survive later tag stripping as copy/pastable text.

    :param html: HTML markup (str or bytes parseable by lxml's HTMLParser).
    :returns: the rewritten HTML as a str.
    """
    tree = etree.fromstring(html, parser=etree.HTMLParser())
    url_index = []
    anchors = (a for a in tree.findall('.//a') if a.get('href'))
    for i, link in enumerate(anchors, start=1):
        url = link.get('href')
        # Fall back to the URL when the anchor has no direct text (e.g. an
        # image-only or markup-only anchor) -- otherwise the literal string
        # "None" would be rendered as the title.
        title = link.text if link.text else url
        index_data = {'title': title, 'url': url, 'id': i}
        link.tag = 'span'
        link.text = '{title} [{id}]'.format(**index_data)
        url_index.append(index_data)
    html = etree.tostring(tree, encoding="UTF-8", method='html')
    html = html.decode('utf-8')
    # Build the text-only URL index appended at the end of the document.
    footer = ['<br/>\n<br/>\n']
    for index_data in url_index:
        footer.append('[{id}] {title}: \t {url} <br/>\n'.format(**index_data))
    return html + ''.join(footer)
def _create_lxml_html_cleaner():
    """Build the lxml Cleaner used to sanitize inbound HTML.

    Consult http://lxml.de/3.4/api/lxml.html.clean.Cleaner-class.html
    """
    cleaner = lxml.html.clean.Cleaner()
    # Links are flattened into a text-only index elsewhere, so <a> and <img>
    # are dropped entirely, along with the document-level tags (<html>,
    # <head>, <body>) that would make inline printing bad.
    cleaner.remove_tags = ['a', 'img', 'head', 'html', 'body']
    # Strip everything active or externally loaded (scripts, embedded
    # content, frames, forms), plus comments and processing instructions
    # (which may contain exploits for IE6), meta tags, unknown tags and
    # "annoying" presentational tags.  <style> blocks are removed too so
    # they don't propagate through to the rest of the page.
    enabled_options = (
        'javascript', 'scripts', 'links', 'embedded', 'frames',
        'comments', 'processing_instructions', 'meta', 'forms',
        'remove_unknown_tags', 'annoying_tags', 'style',
    )
    for option in enabled_options:
        setattr(cleaner, option, True)
    # Keep inline style attributes: they make the page more readable.
    cleaner.inline_style = False
    return cleaner
# Module-level singleton: built once at import time and reused by
# clean_html() for the second (stricter) sanitizing pass.
lxml_cleaner = _create_lxml_html_cleaner()
def clean_html(html):
    """Return a sanitized version of *html* safe for inline display.

    The input (str, or bytes in an unknown encoding) is decoded via
    UnicodeDammit, its links are moved into a plain-text index, and the
    markup is scrubbed twice: once with lxml's default cleaner and once
    with our stricter module-level cleaner.

    :param html: raw HTML as str or bytes.
    :returns: cleaned HTML as a str.
    """
    dammit = UnicodeDammit(html, is_html=True)
    # NOTE: the original code wrapped this in ``except ValueError``, but a
    # plain attribute read never raises ValueError; the actual failure mode
    # is ``unicode_markup`` being None when encoding detection fails, which
    # would crash later in re.sub.  Handle that case explicitly.
    markup = dammit.unicode_markup
    if markup is None:
        # latin-1 can decode any byte sequence, so it is a safe last resort.
        markup = html.decode('latin-1') if isinstance(html, bytes) else html
    # Drop any leading XML declaration, which confuses the HTML parser.
    html = re.sub(r'^<\?xml\s*[^>]*>', '', markup)
    # extract all links and move them to an index
    extracted_links = _extract_links(html)
    # clean the html using the default params
    clean = lxml.html.clean.clean_html(extracted_links)
    # paranoid: run another pass using our settings
    cleanest = lxml_cleaner.clean_html(clean)
    if not isinstance(cleanest, str):
        cleanest = lxml.html.tostring(cleanest, encoding='UTF-8')
    return cleanest
def text_from_html(html):
    """Extract readable plain text from *html*, dropping scripts and styles.

    :param html: HTML markup accepted by BeautifulSoup.
    :returns: the visible text, with blank-line runs collapsed to single
        newlines and horizontal whitespace runs collapsed to single spaces.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Remove non-textual nodes so their contents don't leak into the text.
    for tag_name in ("script", "style"):
        for node in soup.find_all(tag_name):
            node.extract()
    raw_text = soup.get_text().strip()
    collapsed = re.sub(r'\n+', '\n', raw_text)
    return re.sub(r'[ \t\r\f\v]+', ' ', collapsed)
def get_safe_html(doc):
    """Read *doc* (any object exposing ``open()``) and return its HTML,
    sanitized through :func:`clean_html`."""
    with doc.open() as handle:
        raw = handle.read()
    return clean_html(raw)