-
Notifications
You must be signed in to change notification settings - Fork 1
/
html2rst.py
182 lines (149 loc) · 4.4 KB
/
html2rst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# -*- coding: UTF-8 -*-
# Copyright 2013-2020 Rumma & Ko Ltd
# License: GNU Affero General Public License v3 (see file COPYING for details)
"""
Convert an :mod:`etgen.html` element to a reStructuredText string.
If `stripped` is `True`, output will be more concise and optimized
for console output, but possibly not valid reStructuredText.
Usage examples:
>>> from etgen.html import E
>>> e = E.p("This is a ", E.b("first"), " test.")
>>> print(html2rst(e, True))
This is a **first** test.
<BLANKLINE>
>>> e = E.p(E.b("This")," is another test.")
>>> print(html2rst(e, True))
**This** is another test.
<BLANKLINE>
>>> e = E.p(E.b("This")," is ",E.em("another")," test.")
>>> print(html2rst(e, True))
**This** is *another* test.
<BLANKLINE>
>>> url = "http://example.com"
>>> e = E.p(E.b("This")," is ",E.a("a link",href=url),".")
>>> print(html2rst(e, True))
**This** is `a link <http://example.com>`__.
<BLANKLINE>
>>> e = E.p("An empty bold text:",E.b(""))
>>> print(html2rst(e, True))
An empty bold text:
<BLANKLINE>
>>> e = E.ul(E.li("First"), E.li("Second"))
>>> print(html2rst(e, True))
<BLANKLINE>
First
Second
<BLANKLINE>
>>> e = E.h1("A header")
>>> print(html2rst(e, True))
========
A header
========
<BLANKLINE>

For images we render the ``alt`` text between brackets:

>>> e = E.img(src="http://example.com/images/1.jpg", alt="1")
>>> print(html2rst(e, True))
[img 1]

If there is no ``alt`` text, render the content of ``src``:

>>> e = E.img(src="http://example.com/images/1.jpg")
>>> print(html2rst(e, True))
[img http://example.com/images/1.jpg]
"""
import rstgen
from etgen import etree
# Tags after whose content html2rst() appends a line break.
NEWLINE_TAGS = {'p', 'thead', 'tr', 'li'}

# Tags that html2rst() accepts without emitting closing markup for them;
# any other tag it has no branch for raises UnsupportedHtmlTag.
IGNORED_TAGS = {
    'tbody', 'table', 'div', 'span', 'br', 'ul', 'ol', 'html', 'body',
}
class UnsupportedHtmlTag(Exception):
    """Raised by :func:`html2rst` for a tag it cannot render.

    The offending tag is passed as the exception's only argument.
    """
def html2rst(e, stripped=False):
    """
    Convert an element tree to reStructuredText.

    The result is built from the element's opening markup, its text,
    the recursively converted children, its closing markup and finally
    its tail text, in that order.

    :param e: the element to convert.  It is accessed through ``tag``,
        ``text``, ``tail``, ``get()`` and iteration over its children.
    :param stripped: if true, block elements are followed by a single
        newline instead of being surrounded by blank lines, and
        ``<br>`` becomes a plain newline instead of `` |br| ``.
    :returns: the generated text, including the tail of `e`.
    :raises UnsupportedHtmlTag: if the tag has no branch below and is
        not listed in ``IGNORED_TAGS``.
    """
    tag = e.tag

    # --- opening markup -------------------------------------------------
    rst = ''
    if tag in ('p', 'li'):
        if not stripped:
            rst += '\n\n'
    elif tag in ('ul', 'ol'):
        rst += '\n'
    elif tag == 'br':
        rst += '\n' if stripped else ' |br| \n'
    elif tag in ('b', 'strong'):
        # 'strong' must open with '**' as well: the closing branch below
        # appends '**' for both tags, so opening only 'b' would leave
        # '<strong>x</strong>' rendered as the unbalanced 'x**'.
        rst += '**'
    elif tag in ('em', 'i'):
        rst += '*'
    elif tag == 'a':
        rst += '`'

    # --- content: own text, then every child (with its tail) -------------
    if e.text:
        rst += e.text
    for child in e:
        rst += html2rst(child, stripped)

    # --- closing markup -------------------------------------------------
    if tag in NEWLINE_TAGS:
        rst += '\n' if stripped else '\n\n'
    elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # The digit of the tag name is the header level.
        rst = rstgen.header(int(tag[1]), rst.strip()).strip()
        if stripped:
            rst += '\n'
        else:
            rst = '\n\n' + rst + '\n\n'
    elif tag in ('b', 'strong'):
        # Only the opening marker so far means the element was empty:
        # drop it rather than emit a bare '****'.
        if rst == '**':
            rst = ''
        else:
            rst += '**'
    elif tag in ('em', 'i'):
        # Same empty-element handling as for bold text.
        if rst == '*':
            rst = ''
        else:
            rst += '*'
    elif tag == 'a':
        rst += ' <%s>`__' % e.get('href')
    elif tag == 'img':
        # Fall back to the image source when there is no alt text.
        text = e.get('alt') or e.get('src')
        rst += '[img %s]' % text
    elif tag in ('td', 'th'):
        rst += ' '
    elif tag not in IGNORED_TAGS:
        raise UnsupportedHtmlTag(tag)

    # Text that follows the element's closing tag belongs to the output.
    if e.tail:
        rst += e.tail
    return rst
# def html2rst(e):
# return _html2rst(e).strip()
class RstTable(rstgen.Table):
    """
    A table whose cells may contain elementtree HTML.

    Cells that are elements are rendered with :func:`html2rst` in
    stripped mode; every other value is formatted by the parent class.

    .. complextable::
      :header:

      Code <NEXTCELL> Result <NEXTROW>

      >>> from etgen.html import E
      >>> headers = [E.p("A ", E.b("formatted"), " header"), "A plain header"]
      >>> rows = [[1,2], [3,4]]
      >>> print(RstTable(headers).to_rst(rows))
      ======================== ================
       A **formatted** header   A plain header
      ------------------------ ----------------
       1                        2
       3                        4
      ======================== ================
      <BLANKLINE>

      <NEXTCELL>

      ======================== ================
       A **formatted** header   A plain header
      ------------------------ ----------------
       1                        2
       3                        4
      ======================== ================

    """

    def format_value(self, v):
        """Return the whitespace-stripped cell text for the value `v`."""
        if not etree.iselement(v):
            # Plain values: use the parent's formatting.
            formatted = super(RstTable, self).format_value(v)
            return formatted.strip()
        # HTML elements: render in stripped mode.
        rendered = html2rst(v, True)
        return rendered.strip()