In [5]:
# Example 4-1. Encoding and decoding.
s = 'café'
print('len(s) =', len(s))
b = s.encode('utf8')
print('b =', b)
print('len(b) =', len(b))  #  “é” is encoded as two bytes in UTF-8
print("b.decode('utf8') =", b.decode('utf8'))

len(s) = 4
b = b'caf\xc3\xa9'
len(b) = 5
b.decode('utf8') = café


In [32]:
# Example 4-2. A five-byte sequence as bytes and as bytearray.
cafe = bytes('café', encoding='utf_8')
print('cafe =', cafe)
print('cafe[0] =', cafe[0])
print('cafe[:3] =', cafe[:3])  # !!!
print('cafe[:4] =', cafe[:4])  # !!!
print()
cafe_arr = bytearray('café', encoding='utf_8')
print('caf_arre =', cafe_arr)
print('cafe_arr[-1] =', cafe_arr[-1])
print('cafe_arr[-1:] =', cafe_arr[-1:])  # !!!
print()
print("bytes.fromhex = ", bytes.fromhex('31 4B CE A9'))

cafe = b'caf\xc3\xa9'
cafe[0] = 99
cafe[:3] = b'caf'
cafe[:4] = b'caf\xc3'

caf_arre = bytearray(b'caf\xc3\xa9')
cafe_arr[-1] = 169
cafe_arr[-1:] = bytearray(b'\xa9')

bytes.fromhex =  b'1K\xce\xa9'



In [34]:
# Example 4-3. Initializing bytes from the raw data of an array.
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octets = bytes(numbers)
print("octets =", octets)

octets = b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


In [4]:
# Example 4-4 shows the use of memoryview and struct together to extract the width and height of a GIF image.
import struct
fmt = '<3s3sHH' # struct format: < little-endian; 3s3s two sequences of 3 bytes; HH two 16-bit integers.
with open('files/EU_P-kolor.gif', 'rb') as fp:
    img = memoryview(fp.read()) # Create memoryview from file contents in memory
header = img[:10] # …then another memoryview by slicing the first one; no bytes are copied here.
print(bytes(header)) # Convert to bytes for display only; 10 bytes are copied here.
print(struct.unpack(fmt, header)) # Unpack memoryview into tuple of: type, version, width and height
del header # Delete references to release the memory associated with the memoryview instances.
del img

b'GIF89a\x03\x03a\x01'
(b'GIF', b'89a', 771, 353)


In [7]:
# Example 4-5. The string “El Niño” encoded with three codecs producing very different byte sequences.
for codec in ['latin_1', 'utf-8', 'utf16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf-8	b'El Ni\xc3\xb1o'
utf16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [10]:
# Example 4-6. Encoding to bytes: success and error handling
city = 'São Paulo'
city.encode('utf_8')

b'S\xc3\xa3o Paulo'

In [11]:
# Example 4-6.
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [14]:
# Example 4-6.
print(city.encode('cp437', errors='ignore'))
print(city.encode('cp437', errors='replace'))
print(city.encode('cp437', errors='xmlcharrefreplace'))

b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


In [18]:
# Example 4-7. Decoding
octets = b'Montr\xe9al'
print(octets.decode('cp1252'))
print(octets.decode('iso8859_7'))
print(octets.decode('utf_8', errors='replace'))
print(octets.decode('utf_8'))

Montréal
Montrιal
Montr�al


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [20]:
# coding: cp1252
print('Olá, Mundo!')

Olá, Mundo!


In [32]:
# BOM
u16 = 'El Niño'.encode('utf_16')
print(u16)
print(list(u16))

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'
[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]


In [33]:
# Little endian / Big endian
u16le = 'El Niño'.encode('utf_16le')
print(list(u16le))
u16be = 'El Niño'.encode('utf_16be')
print(list(u16be))

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]
[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]


In [37]:
# Example 4-9. A platform encoding issue. If you try this on your machine, you may or may not see the problem.
open('files/cafeW.txt', 'w').write('café')
open('files/cafeW.txt', encoding='cp1250').read()

'cafĂ©'

In [52]:
# Example 4-10. Closer inspection of Example 4-9 running on Windows reveals the bug and how to fix it.
fp = open('files/cafe.txt', 'w', encoding='utf_8')
print(fp)
print('fp.write', fp.write('café'))
fp.close()
import os
print('os.stat', os.stat('files/cafe.txt').st_size)
print()

fp2 = open('files/cafe.txt', encoding='cp1250')
print(fp2.encoding)
print(fp2.read())
print()

fp4 = open('files/cafe.txt', 'rb')
print(fp4.read())

<_io.TextIOWrapper name='files/cafe.txt' mode='w' encoding='utf_8'>
fp.write 4
os.stat 5

cp1250
cafĂ©

b'caf\xc3\xa9'


In [57]:
# Example 4-11. Exploring encoding defaults
import sys, locale
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
    """
my_file = open('files/dummy', 'w')
for expression in expressions.split():
    value = eval(expression)   # !!!
    print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [62]:
# Normalizing Unicode for saner comparisons
s1 = 'café'
s2 = 'cafe\u0301'
print('s1, s2:', s1, s2)
print('len(s1), len(s2):', len(s1), len(s2))
s1==s2

s1, s2: café café
len(s1), len(s2): 4 5


False

In [70]:
# Unicode normalisation 'NFC', 'NFD'
from unicodedata import normalize
s1 = 'café' # composed "e" with acute accent
s2 = 'cafe\u0301' # decomposed "e" and acute accent
print('len(s1), len(s2):', len(s1), len(s2))
print('NFC: len(s1), len(s2):', len(normalize('NFC', s1)), len(normalize('NFC', s2)))
print('NFD: len(s1), len(s2):', len(normalize('NFD', s1)), len(normalize('NFD', s2)))
print()
print(normalize('NFC', s1) == normalize('NFC', s2))
print(normalize('NFD', s1) == normalize('NFD', s2))

len(s1), len(s2): 4 5
NFC: len(s1), len(s2): 4 4
NFD: len(s1), len(s2): 5 5

True
True


In [78]:
# ohm normalization
from unicodedata import normalize, name
ohm = '\u2126'
print('name(ohm)=', name(ohm))
ohm_c = normalize('NFC', ohm)
name(ohm_c)
print('name(ohm_c)=', name(ohm_c))
print(ohm == ohm_c)
print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

name(ohm)= OHM SIGN
name(ohm_c)= GREEK CAPITAL LETTER OMEGA
False
True


In [88]:
# NFKC normalization
from unicodedata import normalize, name
half = '½'
print(half, normalize('NFKC', half))
four_squared = '4²'
print(four_squared, normalize('NFKC', four_squared))
micro = 'µ'
micro_kc = normalize('NFKC', micro)
print(micro, micro_kc)
print(ord(micro), ord(micro_kc))
print(name(micro),'  ', name(micro_kc))

½ 1⁄2
4² 42
µ μ
181 956
MICRO SIGN    GREEK SMALL LETTER MU


In [95]:
# Case folding
micro = 'µ'
print(name(micro))
micro_cf = micro.casefold()
print(name(micro_cf))
print('micro, micro_cf: ', micro, micro_cf)
eszett = 'ß'
print( name(eszett))
eszett_cf = eszett.casefold()
print('eszett, eszett_cf:', eszett, eszett_cf)

MICRO SIGN
GREEK SMALL LETTER MU
micro, micro_cf:  µ μ
LATIN SMALL LETTER SHARP S
eszett, eszett_cf: ß ss


In [101]:
#Example 4-13. normeq.py: normalized Unicode string comparison
from unicodedata import normalize
def nfc_equal(str1, str2):
    return normalize('NFC', str1) == normalize('NFC', str2)
def fold_equal(str1, str2):
    return (normalize('NFC', str1).casefold() == normalize('NFC', str2).casefold())

s1 = 'café'
s2 = 'cafe\u0301'
s3 = 'Straße'
s4 = 'strasse'
print('s1 == s2: ', s1 == s2)
print('nfc_equal(s1, s2): ', nfc_equal(s1, s2))
print("nfc_equal(s3, s4):", nfc_equal(s3, s4))
print("nfc_equal('A', 'a'): ", nfc_equal('A', 'a'))
print()
print("fold_equal(s1, s2):", fold_equal(s1, s2))
print("fold_equal(s3, s4):", fold_equal(s3, s4))
print("fold_equal('A', 'a'):", fold_equal('A', 'a'))

s1 == s2:  False
nfc_equal(s1, s2):  True
nfc_equal(s3, s4): False
nfc_equal('A', 'a'):  False

fold_equal(s1, s2): True
fold_equal(s3, s4): True
fold_equal('A', 'a'): True


In [105]:
# Example 4-14. Function to remove all combining marks (module sanitize.py)
import unicodedata
import string
def shave_marks(txt):
    """Remove all diacritic marks"""
    norm_txt = unicodedata.normalize('NFD', txt)
    print("norm_txt =", norm_txt)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    print('shaved =', shaved)
    return unicodedata.normalize('NFC', shaved)
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
print("shave_marks(order) =", shave_marks(order))
print()
Greek = 'Ζέφυρος, Zéfiro'
print("shave_marks(Greek) =", shave_marks(Greek))

norm_txt = “Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”
shaved = “Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”
shave_marks(order) = “Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”

norm_txt = Ζέφυρος, Zéfiro
shaved = Ζεφυρος, Zefiro
shave_marks(Greek) = Ζεφυρος, Zefiro


In [108]:
# Example 4-16. Function to remove combining marks from Latin characters. 
import unicodedata
import string
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue # ignore diacritic on Latin base char
        keepers.append(c)
        # if it isn't combining char, it's a new base char
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
print("shave_marks_latin(order) =", shave_marks_latin(order))
print()
Greek = 'Ζέφυρος, Zéfiro'
print("shave_marks_latin(Greek) =", shave_marks_latin(Greek))


shave_marks_latin(order) = “Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”

shave_marks_latin(Greek) = Ζέφυρος, Zefiro


In [114]:
# Example 4-17. Transform some Western typographical symbols into ASCII.
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""", """'f"*^<''""---~>""")
multi_map = str.maketrans({
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
    })
multi_map.update(single_map)
print("multi_map:\n",  multi_map)
def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)
def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)

order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
print("dewinize(order) =", dewinize(order))
print("asciize(order) =", asciize(order))

multi_map:
 {710: 94, 8226: 45, 338: 'OE', 339: 'oe', 8212: 45, 8216: 39, 8217: 39, 8218: 39, 732: 126, 8221: 34, 8222: 34, 8224: 42, 8225: '**', 8482: '(TM)', 8230: '...', 8220: 34, 8364: '<euro>', 402: 102, 8240: '<per mille>', 8211: 45, 8249: 60, 8250: 62}
dewinize(order) = "Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."
asciize(order) = "Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."


In [6]:
# Example 4-19. Using the locale.strxfrm function as sort key.
fruits = ['Poznań', 'Warszawa', 'Gdańsk', 'Łódź', 'Lublin']
print('sorted(fruits): ', sorted(fruits))
print()
import locale
locale.setlocale(locale.LC_COLLATE, 'pl_PL.UTF-8')
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print('sorted_fruits: ', sorted_fruits)

sorted(fruits):  ['Gdańsk', 'Lublin', 'Poznań', 'Warszawa', 'Łódź']

sorted_fruits:  ['Gdańsk', 'Lublin', 'Łódź', 'Poznań', 'Warszawa']


In [10]:
# Example 4-20. Using the pyuca.Collator.sort_key method.
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
print('sorted(fruits): ', sorted(fruits))

import pyuca
coll = pyuca.Collator()
sorted_fruits = sorted(fruits, key=coll.sort_key)
print('sorted_fruits: ', sorted_fruits)

sorted(fruits):  ['acerola', 'atemoia', 'açaí', 'caju', 'cajá']
sorted_fruits:  ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


In [2]:
#Example 4-21. Demo of Unicode database numerical character metadata. Callouts describe each column in the output
import unicodedata
import re
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
re_digit = re.compile(r'\d')
for char in sample:
    print('U+%04x' % ord(char),
        char.center(6),
        're_dig' if re_digit.match(char) else '-',
        'isdig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        format(unicodedata.numeric(char), '5.2f'),
        unicodedata.name(char),
        sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


In [17]:
# Example 4-22. ramanujan.py: compare behavior of simple str and bytes regular expressions
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef "
            "as 1729 = 1³ + 12³ = 9³ + 10³.")
print("text_str: ", text_str)
text_bytes = text_str.encode('utf_8')
print("text_bytes:", text_bytes)
print()
print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))
print(' bytes:', re_numbers_bytes.findall(text_bytes))
print('Words')
print(' str :', re_words_str.findall(text_str))
print(' bytes:', re_words_bytes.findall(text_bytes))

text_str:  Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.
text_bytes: b'Ramanujan saw \xe0\xaf\xa7\xe0\xaf\xad\xe0\xaf\xa8\xe0\xaf\xaf as 1729 = 1\xc2\xb3 + 12\xc2\xb3 = 9\xc2\xb3 + 10\xc2\xb3.'

Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
 str : ['௧௭௨௯', '1729', '1', '12', '9', '10']
 bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
 str : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
 bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


In [1]:
# Example 4-23. listdir with str and bytes arguments and results.
import os
print("os.listdir('.'):")
for elem in  os.listdir('.'):
    print(elem)
print()
print("os.listdir(b'.'):")
for elem in os.listdir(b'.'):
    print(elem)

os.listdir('.'):
FP-01 Python Data Model | str 3 - 19.ipynb
FP-02 An array of sequences | str 19 - 63.ipynb
FP-04 Text versus bytes | str 97 - 139.ipynb
files
FP-03 Dictionaries and sets | str 63 - 97.ipynb
.ipynb_checkpoints
digits-of-π.txt

os.listdir(b'.'):
b'FP-01 Python Data Model | str 3 - 19.ipynb'
b'FP-02 An array of sequences | str 19 - 63.ipynb'
b'FP-04 Text versus bytes | str 97 - 139.ipynb'
b'files'
b'FP-03 Dictionaries and sets | str 63 - 97.ipynb'
b'.ipynb_checkpoints'
b'digits-of-\xcf\x80.txt'


In [1]:
# Example 4-24. listdir with str and bytes arguments and results.
import os
pi_name_bytes = os.listdir(b'.')[4]
pi_name_str = pi_name_bytes.decode('ascii', 'surrogateescape')
print("pi_name_str: ", pi_name_str)   # 'digits-of-\udccf\udc80.txt'
# print(" pi_name_bytes:",  pi_name_bytes)  #  b'digits-of-\xcf\x80.txt

pi_name_str:  files
