#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs, sys
from polyglot.builtins import unicode_type
# Regex sources (str) that capture the declared encoding name from the three
# kinds of in-document declarations. They are compiled lazily, and optionally
# as bytes patterns for undecoded input, by LazyEncodingPats below.
_encoding_pats = (
# XML declaration
r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
# HTML 5 charset
r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
# HTML 4 Pragma directive
r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
def compile_pats(binary):
    # Yield the compiled encoding-declaration regexes, case-insensitively,
    # as bytes patterns when binary is True so they can match raw input.
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)
class LazyEncodingPats(object):
    '''
    Compile the encoding-declaration regexes on first use and cache the
    compiled tuples (one cache for str patterns, one for bytes patterns).
    '''

    def __call__(self, binary=False):
        cache_attr = 'binary_pats' if binary else 'unicode_pats'
        cached = getattr(self, cache_attr, None)
        if cached is None:
            cached = tuple(compile_pats(binary))
            setattr(self, cache_attr, cached)
        yield from cached
# Module-wide singleton: call lazy_encoding_pats(binary) to iterate the
# compiled declaration-matching patterns
lazy_encoding_pats = LazyEncodingPats()
# Matches a single XML/HTML entity reference such as &amp; or &#160;
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    '''
    Remove XML/HTML encoding declarations from the head of a document.
    Works on both unicode and byte strings.

    :param raw: The markup, as str or bytes
    :param limit: Only the first limit characters/bytes are searched
    :param preserve_newlines: If True, each removed declaration is replaced
        by as many newlines as it spanned, keeping line numbers stable
    :return: raw with any declarations in the head removed
    '''
    prefix, suffix = raw[:limit], raw[limit:]
    is_binary = isinstance(raw, bytes)
    if preserve_newlines:
        nl = b'\n' if is_binary else '\n'

        def sub(m):
            # One newline per newline inside the removed declaration
            return nl * m.group().count(nl)
    else:
        sub = b'' if is_binary else ''
    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    '''
    Replace the encoding named in any XML/HTML encoding declarations in the
    head of the document with enc.

    :param raw: The markup, as str or bytes
    :param enc: The replacement encoding name
    :param limit: Only the first limit characters/bytes are searched
    :return: (modified raw, True iff at least one declaration was changed)
    '''
    prefix, suffix = raw[:limit], raw[limit:]
    changed = False
    is_binary = isinstance(raw, bytes)
    # Normalize enc to the same type (str/bytes) as raw so splicing works
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    else:
        if isinstance(enc, bytes):
            enc = enc.decode('ascii')

    def sub(m):
        nonlocal changed
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed = True
            # Splice enc over group(1); end is a negative offset relative to
            # the end of the match, so the declaration's tail is preserved
            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix, changed
def find_declared_encoding(raw, limit=50*1024):
    # Search the head of the document for an XML/HTML encoding declaration
    # and return the declared name as str, or None if nothing matched.
    head = raw[:limit]
    binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(binary):
        match = pat.search(head)
        if match is None:
            continue
        found = match.group(1)
        return found.decode('ascii', 'replace') if binary else found
def substitute_entites(raw):
    # Replace every &name;/&#num; entity reference in raw with its unicode
    # equivalent. NOTE: the name is a historic typo for "substitute_entities",
    # kept as-is for backward compatibility with existing callers.
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
# Map commonly mis-declared charset names to names Python's codecs understand
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
def detect(bytestring):
    # Thin wrapper over cchardet that normalizes its result dict: the
    # encoding name is lower-cased (an absent/None encoding becomes the
    # empty string) and a missing confidence becomes 0.
    from cchardet import detect as implementation
    result = implementation(bytestring)
    enc = result.get('encoding')
    result['encoding'] = enc.lower() if enc else ''
    if result.get('confidence') is None:
        result['confidence'] = 0
    return result
def force_encoding(raw, verbose, assume_utf8=False):
    # Statistically guess the encoding of the byte string raw, falling back
    # to the calibre preferred encoding when detection fails or yields
    # nothing. Returns a normalized, alias-resolved encoding name.
    from calibre.constants import preferred_encoding
    try:
        info = detect(raw[:1024*50])
    except Exception:
        info = {'encoding':preferred_encoding, 'confidence':0}
    encoding = info['encoding']
    if info['confidence'] < 1:
        if verbose:
            print(f'WARNING: Encoding detection confidence for {info["encoding"]} is {info["confidence"]}', file=sys.stderr)
        if assume_utf8:
            encoding = 'utf-8'
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    # ASCII is a strict subset of UTF-8, so prefer the more general codec
    return 'utf-8' if encoding == 'ascii' else encoding
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    # Determine the encoding of raw (bytes containing XML/HTML). Unicode or
    # empty input is returned unchanged with encoding None. A BOM, if
    # present, wins and is stripped from the returned bytes; otherwise an
    # in-document declaration is used, and failing that the encoding is
    # guessed statistically. Returns (raw, encoding-name-or-None).
    if not raw or isinstance(raw, unicode_type):
        return raw, None
    for name, bom in (
            ('utf8', codecs.BOM_UTF8),
            ('utf-16-le', codecs.BOM_UTF16_LE),
            ('utf-16-be', codecs.BOM_UTF16_BE)):
        if raw.startswith(bom):
            return raw[len(bom):], name
    encoding = None
    for pat in lazy_encoding_pats(True):
        m = pat.search(raw)
        if m:
            encoding = m.group(1).decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Declared name is not a codec Python knows about; fall back
        encoding = 'utf-8'
    return raw, encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
        resolve_entities=False, assume_utf8=False):
    '''
    Convert a byte string of XML/HTML markup to unicode. The encoding is
    taken from a BOM or an in-document declaration when available; otherwise
    it is detected statistically, with a warning printed when verbose and the
    detection confidence is below 100%.

    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, encoding = detect_xml_encoding(
        raw, verbose=verbose, assume_utf8=assume_utf8)
    if not isinstance(raw, unicode_type):
        raw = raw.decode(encoding, 'replace')
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
    return raw, encoding