#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs, sys
from polyglot.builtins import unicode_type
# Regex sources (str) that capture the declared encoding name from the three
# kinds of in-document declarations. They are compiled lazily, and optionally
# as bytes patterns for undecoded input, by LazyEncodingPats below.
_encoding_pats = (
# XML declaration
r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
# HTML 5 charset
r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
# HTML 4 Pragma directive
r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
def compile_pats(binary):
    # Yield the compiled encoding-declaration regexes, case-insensitively,
    # as bytes patterns when binary is True so they can match raw input.
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)
class LazyEncodingPats(object):
    '''
    Compile the encoding-declaration regexes on first use and cache the
    compiled tuples (one cache for str patterns, one for bytes patterns).
    '''

    def __call__(self, binary=False):
        cache_attr = 'binary_pats' if binary else 'unicode_pats'
        cached = getattr(self, cache_attr, None)
        if cached is None:
            cached = tuple(compile_pats(binary))
            setattr(self, cache_attr, cached)
        yield from cached
# Module-wide singleton: call lazy_encoding_pats(binary) to iterate the
# compiled declaration-matching patterns
lazy_encoding_pats = LazyEncodingPats()
# Matches a single XML/HTML entity reference such as &amp; or &#160;
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    '''
    Remove XML/HTML encoding declarations from the head of a document.
    Works on both unicode and byte strings.

    :param raw: The markup, as str or bytes
    :param limit: Only the first limit characters/bytes are searched
    :param preserve_newlines: If True, each removed declaration is replaced
        by as many newlines as it spanned, keeping line numbers stable
    :return: raw with any declarations in the head removed
    '''
    prefix, suffix = raw[:limit], raw[limit:]
    is_binary = isinstance(raw, bytes)
    if preserve_newlines:
        nl = b'\n' if is_binary else '\n'

        def sub(m):
            # One newline per newline inside the removed declaration
            return nl * m.group().count(nl)
    else:
        sub = b'' if is_binary else ''
    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    '''
    Replace the encoding named in any XML/HTML encoding declarations in the
    head of the document with enc.

    :param raw: The markup, as str or bytes
    :param enc: The replacement encoding name
    :param limit: Only the first limit characters/bytes are searched
    :return: (modified raw, True iff at least one declaration was changed)
    '''
    prefix, suffix = raw[:limit], raw[limit:]
    changed = False
    is_binary = isinstance(raw, bytes)
    # Normalize enc to the same type (str/bytes) as raw so splicing works
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    else:
        if isinstance(enc, bytes):
            enc = enc.decode('ascii')

    def sub(m):
        nonlocal changed
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed = True
            # Splice enc over group(1); end is a negative offset relative to
            # the end of the match, so the declaration's tail is preserved
            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix, changed
def find_declared_encoding(raw, limit=50*1024):
    # Search the head of the document for an XML/HTML encoding declaration
    # and return the declared name as str, or None if nothing matched.
    head = raw[:limit]
    binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(binary):
        match = pat.search(head)
        if match is None:
            continue
        found = match.group(1)
        return found.decode('ascii', 'replace') if binary else found
def substitute_entites(raw):
    # Replace every &name;/&#num; entity reference in raw with its unicode
    # equivalent. NOTE: the name is a historic typo for "substitute_entities",
    # kept as-is for backward compatibility with existing callers.
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
# Map commonly mis-declared charset names to names Python's codecs understand
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
def detect(bytestring):
    # Thin wrapper over cchardet that normalizes its result dict: the
    # encoding name is lower-cased (an absent/None encoding becomes the
    # empty string) and a missing confidence becomes 0.
    from cchardet import detect as implementation
    result = implementation(bytestring)
    enc = result.get('encoding')
    result['encoding'] = enc.lower() if enc else ''
    if result.get('confidence') is None:
        result['confidence'] = 0
    return result
def force_encoding(raw, verbose, assume_utf8=False):
    # Statistically guess the encoding of the byte string raw, falling back
    # to the calibre preferred encoding when detection fails or yields
    # nothing. Returns a normalized, alias-resolved encoding name.
    from calibre.constants import preferred_encoding
    try:
        info = detect(raw[:1024*50])
    except Exception:
        info = {'encoding':preferred_encoding, 'confidence':0}
    encoding = info['encoding']
    if info['confidence'] < 1:
        if verbose:
            print(f'WARNING: Encoding detection confidence for {info["encoding"]} is {info["confidence"]}', file=sys.stderr)
        if assume_utf8:
            encoding = 'utf-8'
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    # ASCII is a strict subset of UTF-8, so prefer the more general codec
    return 'utf-8' if encoding == 'ascii' else encoding
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    # Determine the encoding of raw (bytes containing XML/HTML). Unicode or
    # empty input is returned unchanged with encoding None. A BOM, if
    # present, wins and is stripped from the returned bytes; otherwise an
    # in-document declaration is used, and failing that the encoding is
    # guessed statistically. Returns (raw, encoding-name-or-None).
    if not raw or isinstance(raw, unicode_type):
        return raw, None
    for name, bom in (
            ('utf8', codecs.BOM_UTF8),
            ('utf-16-le', codecs.BOM_UTF16_LE),
            ('utf-16-be', codecs.BOM_UTF16_BE)):
        if raw.startswith(bom):
            return raw[len(bom):], name
    encoding = None
    for pat in lazy_encoding_pats(True):
        m = pat.search(raw)
        if m:
            encoding = m.group(1).decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Declared name is not a codec Python knows about; fall back
        encoding = 'utf-8'
    return raw, encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
        resolve_entities=False, assume_utf8=False):
    '''
    Convert a byte string of XML/HTML markup to unicode. The encoding is
    taken from a BOM or an in-document declaration when available; otherwise
    it is detected statistically, with a warning printed when verbose and the
    detection confidence is below 100%.

    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, encoding = detect_xml_encoding(
        raw, verbose=verbose, assume_utf8=assume_utf8)
    if not isinstance(raw, unicode_type):
        raw = raw.decode(encoding, 'replace')
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
    return raw, encoding