In [3]:
# Python ships with a variety of encoders and decoders (codecs).
# Print the same text encoded with three of them, one per line.

for codec in ('latin_1', 'utf_8', 'utf_16'):
    print(f"{codec}\t{'El Niño'.encode(codec)}")

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [4]:
# Coping with UnicodeEncodeError
# Raised when a particular codec can't encode a character in your string
city = 'São Paulo'
# UTF-8 can represent 'ã' (as the two bytes \xc3\xa3), so this succeeds
city.encode('utf-8')

b'S\xc3\xa3o Paulo'

In [5]:
# UTF-16 also handles 'ã'; the result starts with the BOM bytes \xff\xfe
city.encode('utf-16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [6]:
# iso8859_1 encodes 'ã' as the single byte \xe3
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [9]:
# cp437 has no mapping for 'ã', so this call raises UnicodeEncodeError
# (the exception is the point of this cell)
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [10]:
# errors='ignore' silently drops every character that cannot be encoded
city.encode('cp437', errors='ignore')

b'So Paulo'

In [11]:
# errors='replace' substitutes '?' for each character that cannot be encoded
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [12]:
# errors='xmlcharrefreplace' substitutes an XML character reference
# (here &#227; for 'ã') for each character that cannot be encoded
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [None]:
# The codec error handling is extensible: you may register extra
# strings for the errors argument by passing a name and an error-handling
# function to the codecs.register_error function

In [None]:
# Coping with UnicodeDecodeError

In [13]:
# The same byte sequence decodes to different text depending on the codec;
# cp1252 maps byte \xe9 to 'é'
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [14]:
# iso8859_7 maps the same byte \xe9 to 'ι' -- no error, just the wrong text
octets.decode('iso8859_7')

'Montrιal'

In [15]:
# koi8_r maps the same byte \xe9 to 'И' -- again no error, just the wrong text
octets.decode('koi8_r')

'MontrИal'

In [16]:
# Byte \xe9 at position 5 is not valid UTF-8 in this sequence, so this call
# raises UnicodeDecodeError (the exception is the point of this cell)
octets.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [17]:
# errors='replace' substitutes '�' (U+FFFD, the replacement character)
# for each byte that cannot be decoded
octets.decode('utf-8', errors='replace')

'Montr�al'

In [None]:
# If you ever open or import a .py script encoded with something
# other than 'utf-8', you must declare the encoding at the top of the file,
# e.g. with a comment like "coding: cp1252"
# By far the best solution is to convert everything to 'utf-8'

In [None]:
# How to discover the encoding of a byte sequence
# You can't. You must be told.
# Or be smart: use statistics and heuristics, e.g. the chardet package

In [18]:
# Encode with plain 'utf-16' and display the raw bytes it produces
u16 = 'El Niño'.encode('utf-16')
u16

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [None]:
# First two bytes are BOM byte-order-mark
# They denote the little endian byte ordering

In [19]:
# Show the bytes as integers: 255, 254 is the BOM, then two bytes per character
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [None]:
"""
On a little endian machine E is encoded as 69 0
On a big endian it would be 0 69
"""

In [21]:
u16 = 'El Niño'.encode('utf_16le')
list(u16)  # 'le' means explicitly little endian; note that no BOM is emitted

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [22]:
# Explicit big-endian UTF-16: the high byte of each 16-bit code unit comes
# first ('E' becomes 0, 69) and no BOM is emitted
u16 = 'El Niño'.encode('utf_16be')
list(u16)  # be means specifically big endian

[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]