In [1]:
# The following topics are covered in this chapter:
## characters, code points, and byte representations
## unique features of binary sequences: `bytes`, `bytearray`, and `memoryview`
## codecs for full Unicode and legacy character sets
## avoiding and dealing with encoding errors
## best practices when handling text files
## the default encoding trap and standard I/O issues
## safe Unicode text comparisons with normalization
## utility functions for normalization, case folding, and brute-force diacritic removal
## proper sorting of Unicode text with `locale` and the PyUCA library
## character metadata in the Unicode database
## dual-mode APIs that handle `str` and `bytes`

In [2]:
# Character Issues
# The best definition of "character" we have is a Unicode character
# The items you get out of a Python `str` are Unicode characters.
# Unicode standard explicitly separates the identity of characters from specific byte representations. 
# `code point`: the identity of a character, is a number from 0 to 1,114,111 shown in the Unicode standard as 4-6 hex digits with a U+ prefix
## e.g. code point for the letter `A` is U+0041.
# `encoding`: the actual bytes that represent a character via a specific algorithm.
## The alg converts code points to byte sequences and vice-versa. 
## e.g. The code point for `A` (U+0041) is encoded as the single byte \x41 in UTF-8, but as \x41\x00 in UTF-16LE

In [7]:
# Encoding and decoding
s = 'café'
print(len(s))

# `é` is encoded as two bytes in UTF-8, so the length of b is now 5 bytes
b = s.encode('utf8')
print(b)
print(len(b))

print(b.decode('utf8'))

4
b'caf\xc3\xa9'
5
café


In [8]:
# Byte Essentials
# Two basic, built-in types for binary sequences: immutable `bytes` and mutable `bytearray`.
# Each item in `bytes` or `bytearray` is an integer from 0 to 255.
# A slice of a binary sequence always produces a binary sequence of the same type, including slices of length 1.

In [10]:
# Build bytes from a `str` by supplying an encoding
cafe = bytes('café', encoding='utf_8')
print(cafe)
# Indexing yields an integer in range(256), while slicing yields bytes --
# even when the slice is a single byte long
print(cafe[0], cafe[:1], sep='\n')

# bytearray has no literal syntax of its own: its repr is bytearray() wrapping a bytes literal
cafe_arr = bytearray(cafe)
print(cafe_arr)
# Slicing a bytearray likewise gives back a bytearray
print(cafe_arr[-1:])

b'caf\xc3\xa9'
99
b'c'
bytearray(b'caf\xc3\xa9')
bytearray(b'\xa9')


In [12]:
# Three different displays are used depending on the byte value:
## For bytes in the printable ASCII range (space to ~), the ASCII character itself is used
## For bytes corresponding to tab, newline, carriage return, and \, the escape sequences are used (e.g. \t, \n, \r, \\)
## For every other byte value, a hex escape sequence is used (e.g. \x00 is the null byte)
### This is why in the above example, you see b'caf\xc3\xa9'. The first three bytes are in the printable ASCII range, the last two (é) are not

In [14]:
# `fromhex` is a class method that binary sequences have and `str` lacks:
# it builds a binary sequence by parsing pairs of hex digits, optionally separated by spaces
hex_pairs = '31 4b CE A9'
print(bytes.fromhex(hex_pairs))

# Other ways of building bytes or bytearray are calling their constructors with:
## a str plus an encoding keyword argument
## an iterable whose items are integers from 0 to 255
## a single integer, giving a binary sequence of that size filled with null bytes
## an object that implements the buffer protocol

b'1K\xce\xa9'


In [17]:
import array
# Typecode 'h' selects short integers (16 bits each); the values are -2 through 2
numbers = array.array('h', range(-2, 3))
# Passing the array to bytes() copies the raw bytes that make up numbers
octets = bytes(numbers)
# The 5 short integers are represented by these 10 bytes
print(octets)

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


In [18]:
# Structs and memory view
# The struct module provides functions to parse packed bytes into a tuple of fields of different types and to perform the opposite conversion, from a tuple into packed bytes.
# struct is used with bytes, bytearray, and memoryview objects

In [23]:
import struct
import os

# struct format: < little-endian; 3s3s two sequences of 3 bytes; HH two 16-bit integers
fmt = "<3s3sHH"

# The assets directory sits beside the current working directory (same parent)
asset_dir = os.path.join(os.path.dirname(os.getcwd()), "assets")
gif_path = os.path.join(asset_dir, "filter.gif")

# Read the whole file and wrap its bytes in a memoryview
with open(gif_path, "rb") as fp:
    img = memoryview(fp.read())

# Slicing a memoryview yields another memoryview; no bytes are copied here
header = img[:10]
# Converts to bytes for display only; 10 bytes are copied here
print(bytes(header))
# Unpack the memoryview into a tuple of: type, version, width, and height
print(struct.unpack(fmt, header))

# Drop both references to release the memory associated with the memoryview instances
del header, img

b'GIF89a+\x02\xe6\x00'
(b'GIF', b'89a', 555, 230)


In [24]:
# Basic encoders/decoders
# Python bundles more than 100 codecs (encoder/decoder) for text to byte conversion and vice-versa.
# Each codec has a name, like 'utf_8', often with aliases ('utf8', 'utf-8', 'U8')
# Some encodings cannot represent every Unicode character. The UTF encodings however are designed to handle every Unicode code point.

In [25]:
# Encode the same text with three codecs and show each result next to the codec name
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = 'El Niño'.encode(codec)
    print(codec, encoded, sep="\t")

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [26]:
# Understanding encode/decode problems
# There is the generic UnicodeError exception, but there is usually a more specific error
# such as the UnicodeEncodeError (when converting str to binary seq) or UnicodeDecodeError (when reading binary seq into str).

In [27]:
# Coping with UnicodeEncodeError
# Most non-UTF codecs handle only a small subset of Unicode characters.
# When converting text to bytes, if a character is not defined in the target encoding, UnicodeEncodeError will be raised.

In [31]:
city = 'São Paulo'
# The UTF codecs can encode any str
print(city.encode('utf_8'))
print(city.encode('utf_16'))

# iso8859_1 happens to cover every character of 'São Paulo' as well
print(city.encode('iso8859_1'))

# cp437 has no mapping for 'ã', so a plain encode() raises UnicodeEncodeError
try:
    city.encode('cp437')
except UnicodeEncodeError as exc:
    print(exc)

# The `errors` argument selects how unencodable characters are handled:
##   'ignore'            - silently skips them; usually a very bad idea
##   'replace'           - substitutes '?'; data is lost, but users will know something is amiss
##   'xmlcharrefreplace' - substitutes an XML character reference such as &#227;
for handler in ('ignore', 'replace', 'xmlcharrefreplace'):
    print(city.encode('cp437', errors=handler))

b'S\xc3\xa3o Paulo'
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
b'S\xe3o Paulo'
'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>
b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


In [32]:
# Coping with UnicodeDecodeError
# Not every byte holds a valid ASCII character, and not every byte seq is valid UTF-8 or UTF-16.
# Therefore when you assume one of these encodings while converting a binary seq to text, you will get a UnicodeDecodeError if unexpected bytes are found.
# On the other hand, many legacy 8-bit encodings like 'cp1252', 'iso8859_1', or 'koi8_r' are able to decode any stream of bytes, including random noise, without generating errors. Therefore, if your program assumes the wrong 8-bit encoding, it will silently decode garbage.

In [37]:
# "Montréal" encoded as latin1
octets = b'Montr\xe9al'

# Legacy 8-bit codecs decode these bytes without complaint, rightly or wrongly:
##   cp1252 (Windows 1252) gives the right text: it is a superset of latin1's printable characters
##   iso8859_7 is intended for Greek, so the \xe9 byte is misinterpreted
##   koi8_r is for Russian, so \xe9 is misinterpreted again
for legacy_codec in ('cp1252', 'iso8859_7', 'koi8_r'):
    print(octets.decode(legacy_codec))

# utf_8 detects that the bytes are not valid UTF-8 and raises UnicodeDecodeError
try:
    decoded = octets.decode('utf_8')
except UnicodeDecodeError as exc:
    print(exc)
else:
    print(decoded)

# With errors='replace' the undecodable byte becomes U+FFFD, the Unicode replacement character
print(octets.decode('utf_8', errors='replace'))

Montréal
Montrιal
MontrИal
'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte
Montr�al


In [38]:
# SyntaxError when loading modules with unexpected encoding
# UTF-8 is the default source encoding for Python3.
# If you load a .py module containing non-UTF-8 data and no encoding declaration, you'll get a SyntaxError
# To fix this problem, add a magic `coding` comment at the top of the file:
# coding: cp1252

In [39]:
# How to discover the encoding of a byte sequence
# Short answer: you can't; you must be told
# Some communication protocols and file formats (HTTP, XML) contain headers that explicitly tell us how the content is encoded.
# You can be sure that some byte streams are not ASCII b/c they contain byte values over 127, and UTF-8/16 are built in such a way as to limit possible byte sequences.
# Looking at these patterns, we can determine with reasonable confidence which encoding a byte sequence uses

In [None]:
# Continued from page 109