/
encodings.py
279 lines (252 loc) · 11.9 KB
/
encodings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Character encoding routines
# Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from __future__ import absolute_import, unicode_literals
import cgi
import codecs
import collections
import re
try:
import chardet
except ImportError:
chardet = None
lazy_chardet_encoding = None
else:
def lazy_chardet_encoding():
chardet_encoding = chardet.detect(data)['encoding']
if not chardet_encoding:
chardet_encoding = ''
if isinstance(chardet_encoding, bytes_):
chardet_encoding = chardet_encoding.encode('ascii', 'ignore')
return chardet_encoding
from .exceptions import (
CharacterEncodingOverride, CharacterEncodingUnknown, NonXMLContentType,
)
bytes_ = type(b'')
unicode_ = type('')
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
UTF32BE_MARKER = b'\x00\x00\x00\x3C'
UTF32LE_MARKER = b'\x3C\x00\x00\x00'
ZERO_BYTES = '\x00\x00'
# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
# Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_PI_ENCODING = re.compile(b'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
def convert_to_utf8(http_headers, data, result):
'''Detect and convert the character encoding to UTF-8.
http_headers is a dictionary
data is a raw string (not Unicode)'''
# This is so much trickier than it sounds, it's not even funny.
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
# is application/xml, application/*+xml,
# application/xml-external-parsed-entity, or application/xml-dtd,
# the encoding given in the charset parameter of the HTTP Content-Type
# takes precedence over the encoding given in the XML prefix within the
# document, and defaults to 'utf-8' if neither are specified. But, if
# the HTTP Content-Type is text/xml, text/*+xml, or
# text/xml-external-parsed-entity, the encoding given in the XML prefix
# within the document is ALWAYS IGNORED and only the encoding given in
# the charset parameter of the HTTP Content-Type header should be
# respected, and it defaults to 'us-ascii' if not specified.
# Furthermore, discussion on the atom-syntax mailing list with the
# author of RFC 3023 leads me to the conclusion that any document
# served with a Content-Type of text/* and no charset parameter
# must be treated as us-ascii. (We now do this.) And also that it
# must always be flagged as non-well-formed. (We now do this too.)
# If Content-Type is unspecified (input was local file or non-HTTP source)
# or unrecognized (server just got it totally wrong), then go by the
# encoding given in the XML prefix of the document and default to
# 'iso-8859-1' as per the HTTP specification (RFC 2616).
# Then, assuming we didn't find a character encoding in the HTTP headers
# (and the HTTP Content-type allowed us to look in the body), we need
# to sniff the first few bytes of the XML data and try to determine
# whether the encoding is ASCII-compatible. Section F of the XML
# specification shows the way here:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# If the sniffed encoding is not ASCII-compatible, we need to make it
# ASCII compatible so that we can sniff further into the XML declaration
# to find the encoding attribute, which will tell us the true encoding.
# Of course, none of this guarantees that we will be able to parse the
# feed in the declared character encoding (assuming it was declared
# correctly, which many are not). iconv_codec can help a lot;
# you should definitely install it if you can.
# http://cjkpython.i18n.org/
bom_encoding = ''
xml_encoding = ''
rfc3023_encoding = ''
# Look at the first few bytes of the document to guess what
# its encoding may be. We only need to decode enough of the
# document that we can use an ASCII-compatible regular
# expression to search for an XML encoding declaration.
# The heuristic follows the XML specification, section F:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# Check for BOMs first.
if data[:4] == codecs.BOM_UTF32_BE:
bom_encoding = 'utf-32be'
data = data[4:]
elif data[:4] == codecs.BOM_UTF32_LE:
bom_encoding = 'utf-32le'
data = data[4:]
elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16be'
data = data[2:]
elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16le'
data = data[2:]
elif data[:3] == codecs.BOM_UTF8:
bom_encoding = 'utf-8'
data = data[3:]
# Check for the characters '<?xm' in several encodings.
elif data[:4] == EBCDIC_MARKER:
bom_encoding = 'cp037'
elif data[:4] == UTF16BE_MARKER:
bom_encoding = 'utf-16be'
elif data[:4] == UTF16LE_MARKER:
bom_encoding = 'utf-16le'
elif data[:4] == UTF32BE_MARKER:
bom_encoding = 'utf-32be'
elif data[:4] == UTF32LE_MARKER:
bom_encoding = 'utf-32le'
tempdata = data
try:
if bom_encoding:
tempdata = data.decode(bom_encoding).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# feedparser recognizes UTF-32 encodings that aren't
# available in Python 2.4 and 2.5, so it's possible to
# encounter a LookupError during decoding.
xml_encoding_match = None
else:
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
if xml_encoding_match:
xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
# Normalize the xml_encoding if necessary.
if bom_encoding and (xml_encoding in (
'u16', 'utf-16', 'utf16', 'utf_16',
'u32', 'utf-32', 'utf32', 'utf_32',
'iso-10646-ucs-2', 'iso-10646-ucs-4',
'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
)):
xml_encoding = bom_encoding
# Find the HTTP Content-Type and, hopefully, a character
# encoding provided by the server. The Content-Type is used
# to choose the "correct" encoding among the BOM encoding,
# XML declaration encoding, and HTTP encoding, following the
# heuristic defined in RFC 3023.
http_content_type = http_headers.get('content-type') or ''
http_content_type, params = cgi.parse_header(http_content_type)
http_encoding = params.get('charset', '').replace("'", "")
if isinstance(http_encoding, bytes_):
http_encoding = http_encoding.decode('utf-8', 'ignore')
acceptable_content_type = 0
application_content_types = ('application/xml', 'application/xml-dtd',
'application/xml-external-parsed-entity')
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
if (http_content_type in application_content_types) or \
(http_content_type.startswith('application/') and
http_content_type.endswith('+xml')):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
elif (http_content_type in text_content_types) or \
(http_content_type.startswith('text/') and
http_content_type.endswith('+xml')):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or 'us-ascii'
elif http_content_type.startswith('text/'):
rfc3023_encoding = http_encoding or 'us-ascii'
elif http_headers and 'content-type' not in http_headers:
rfc3023_encoding = xml_encoding or 'iso-8859-1'
else:
rfc3023_encoding = xml_encoding or 'utf-8'
# gb18030 is a superset of gb2312, so always replace gb2312
# with gb18030 for greater compatibility.
if rfc3023_encoding.lower() == 'gb2312':
rfc3023_encoding = 'gb18030'
if xml_encoding.lower() == 'gb2312':
xml_encoding = 'gb18030'
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
error = None
if http_headers and (not acceptable_content_type):
if 'content-type' in http_headers:
msg = '%s is not an XML media type' % http_headers['content-type']
else:
msg = 'no Content-type specified'
error = NonXMLContentType(msg)
# determine character encoding
known_encoding = 0
tried_encodings = []
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
if isinstance(proposed_encoding, collections.Callable):
proposed_encoding = proposed_encoding()
if not proposed_encoding:
continue
if proposed_encoding in tried_encodings:
continue
tried_encodings.append(proposed_encoding)
try:
data = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
pass
else:
known_encoding = 1
# Update the encoding in the opening XML processing instruction.
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
if RE_XML_DECLARATION.search(data):
data = RE_XML_DECLARATION.sub(new_declaration, data)
else:
data = new_declaration + '\n' + data
data = data.encode('utf-8')
break
# if still no luck, give up
if not known_encoding:
error = CharacterEncodingUnknown(
'document encoding unknown, I tried ' +
'%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
(rfc3023_encoding, xml_encoding))
rfc3023_encoding = ''
elif proposed_encoding != rfc3023_encoding:
error = CharacterEncodingOverride(
'document declared as %s, but parsed as %s' %
(rfc3023_encoding, proposed_encoding))
rfc3023_encoding = proposed_encoding
result['encoding'] = rfc3023_encoding
if error:
result['bozo'] = True
result['bozo_exception'] = error
return data