# Unicode Text Versus Bytes

## Character Issues

In [6]:
s = 'café'
print(type(s))
len(s) #1

<class 'str'>


4

In [7]:
b = s.encode('utf8') #2
print(type(b))
b #3

<class 'bytes'>


b'caf\xc3\xa9'

In [4]:
len(b) #4

5

In [5]:
b.decode('utf8') #5

'café'

## Byte Essentials

In [8]:
cafe = bytes('café', encoding='utf_8') #1
cafe

b'caf\xc3\xa9'

In [9]:
cafe[0] #2

99

In [12]:
cafe[:1] #3

b'c'

In [13]:
cafe[0] = cafe[1] # the binary sequence bytes is immutable

TypeError: 'bytes' object does not support item assignment

In [11]:
cafe_arr = bytearray(cafe)
cafe_arr #4

bytearray(b'caf\xc3\xa9')

In [14]:
cafe_arr[-1:] #5

bytearray(b'\xa9')

In [17]:
# Binary sequences have a class method that str doesn't have, called fromhex, which builds a binary
# sequence by parsing pairs of hex digits optionally separated by spaces.
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

In [18]:
import array
numbers = array.array('h', [-2, -1, 0, 1, 2]) #1
octets = bytes(numbers) #2
octets #3

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

## Basic Encoders/Decoders

In [19]:
# Show how the same text is laid out in bytes under three different codecs.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    print(f"{codec}\t{'El Niño'.encode(codec)}")

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [26]:
latin_1 = b'El Ni\xf1o'
print(latin_1)
latin_1.decode('latin_1').encode('utf8')

b'El Ni\xf1o'


b'El Ni\xc3\xb1o'

## Understanding Encode/Decode Problems

### Coping with UnicodeEncodeError

In [1]:
city = "São Paulo"
city.encode('utf_8')

b'S\xc3\xa3o Paulo'

In [2]:
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [3]:
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [4]:
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [5]:
city.encode('cp437', errors='ignore')

b'So Paulo'

In [6]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [7]:
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

### Coping with UnicodeDecodeError

In [8]:
octets = b'Montr\xe9a1'
octets.decode('cp1252')

'Montréa1'

In [9]:
octets.decode('iso8859_7')

'Montrιa1'

In [10]:
octets.decode('koi8_r')

'MontrИa1'

In [11]:
octets.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [12]:
octets.decode('utf_8', errors='replace')

'Montr�a1'

### SyntaxError When Loading Modules with Unexpected Encoding

In [13]:
# If you load a .py module containing non-UTF-8 data and no encoding declaration,
# you get a message like this:
#
# SyntaxError: Non-UTF-8 code starting with '\xe1' in file ola.py on line
# 1, but no encoding declared; see https://python.org/dev/peps/pep-0263/ for details

## See how to fix the problem when the file was created on Windows

In [14]:
# coding: cp1252

print('Olá, Mundo!')

Olá, Mundo!


### How to Discover the Encoding of a Byte Sequence

__Short answer: you can't. You must be told.__

### BOM: A Useful Gremlin

In [15]:
u16 = 'El Niño'.encode('utf_16')
u16

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [16]:
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [17]:
u16le = 'El Niño'.encode('utf_16le') # le -> little-endian
list(u16le)

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [18]:
# Encode with the explicit big-endian variant and bind it to its own name,
# so the little-endian bytes in u16le are left untouched.
u16be = 'El Niño'.encode('utf_16be')  # be -> big-endian
list(u16be)

[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

## Handling Text Files

In [1]:
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file object.
with open('cafe.txt', 'w', encoding='utf_8') as cafe_file:
    chars_written = cafe_file.write('café')  # write() returns the number of characters written
chars_written

4

In [3]:
# Decode with cp1252 on purpose (Linux defaults to UTF-8) to reproduce the mojibake.
# The context manager closes the file instead of leaving that to garbage collection.
with open('cafe.txt', encoding='cp1252') as cp1252_file:
    mojibake = cp1252_file.read()
mojibake

'cafÃ©'

In [4]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp #1

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>

In [5]:
fp.write('café') #2

4

In [6]:
fp.close()

In [7]:
import os
os.stat('cafe.txt').st_size #3

5

In [8]:
fp2 = open('cafe.txt', encoding='cp1252')
fp2 #4

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>

In [9]:
fp2.encoding #5

'cp1252'

In [10]:
fp2.read() #6

'cafÃ©'

In [11]:
fp2.close()
fp3 = open('cafe.txt', encoding='utf_8') #7
fp3

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>

In [12]:
fp3.read() #8

'café'

In [13]:
fp3.close()
fp4 = open('cafe.txt', 'rb') #9
fp4 #10

<_io.BufferedReader name='cafe.txt'>

In [14]:
fp4.read() #11

b'caf\xc3\xa9'

In [15]:
fp4.close()

### Beware of Encoding Defaults

In [16]:
import locale
import sys

# One expression per line; each one is evaluated and printed next to its result.
expressions = """
        locale.getpreferredencoding()
        type(my_file)
        my_file.encoding
        sys.stdout.isatty()
        sys.stdout.encoding
        sys.stdin.isatty()
        sys.stdin.encoding
        sys.stderr.isatty()
        sys.stderr.encoding
        sys.getdefaultencoding()
        sys.getfilesystemencoding()
"""

# Open the probe file without an explicit encoding so the report shows the
# default one. The `with` block guarantees the file is closed once the report
# is printed, even if one of the expressions raises.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval is fine here: the inputs are the fixed literals listed above.
        value = eval(expression)
        print(f'{expression:>30} -> {value!r}')

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [18]:
import sys
from unicodedata import name

print(sys.version)
print()
print('sys.stdout.isatty():', sys.stdout.isatty())
print('sys.stdout.encoding:', sys.stdout.encoding)
print()

test_chars = [
    '\N{HORIZONTAL ELLIPSIS}',  # exists in cp1252, not in cp437
    '\N{INFINITY}',  # exists in cp437, not in cp1252
    '\N{CIRCLED NUMBER FORTY TWO}',  # not in cp437 or in cp1252
]

for char in test_chars:
    print(f'Trying to output {name(char)}:')
    print(char)

3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]

sys.stdout.isatty(): False
sys.stdout.encoding: UTF-8

Trying to output HORIZONTAL ELLIPSIS:
…
Trying to output INFINITY:
∞
Trying to output CIRCLED NUMBER FORTY TWO:
㊷


## Normalizing Unicode for Reliable Comparisons

In [1]:
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'
s1, s2

('café', 'café')

In [2]:
len(s1), len(s2)

(4, 5)

In [3]:
s1 == s2

False

In [5]:
# How to normalize: unicodedata.normalize()
from unicodedata import normalize

len(normalize('NFC', s1)), len(normalize('NFC', s2))


(4, 4)

In [6]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [7]:
normalize('NFC', s1) == normalize('NFC', s2)

True

In [8]:
normalize('NFD', s1) == normalize('NFD', s2)

True

In [9]:
from unicodedata import normalize, name
ohm = '\u2126'
name(ohm)

'OHM SIGN'

In [10]:
ohm_c = normalize('NFC', ohm)
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [11]:
ohm == ohm_c

False

In [12]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [2]:
from unicodedata import normalize, name

half = '\N{VULGAR FRACTION ONE HALF}'
print(half)

½


In [4]:
normalize('NFKC', half)

'1⁄2'

In [5]:
for char in normalize('NFKC', half):
    print(char, name(char), sep='\t')

1	DIGIT ONE
⁄	FRACTION SLASH
2	DIGIT TWO


In [6]:
four_squared = '4²'
normalize('NFKC', four_squared)

'42'

In [7]:
micro = 'µ'
micro_kc = normalize('NFKC', micro)
micro, micro_kc

('µ', 'μ')

In [8]:
ord(micro), ord(micro_kc)

(181, 956)

In [10]:
name(micro), name(micro_kc)

('MICRO SIGN', 'GREEK SMALL LETTER MU')

### Case Folding

In [11]:
micro = 'µ'
name(micro)

'MICRO SIGN'

In [12]:
micro_cf = micro.casefold()
name(micro_cf)

'GREEK SMALL LETTER MU'

In [13]:
micro, micro_cf

('µ', 'μ')

In [14]:
eszett = 'ß'
name(eszett)

'LATIN SMALL LETTER SHARP S'

In [15]:
eszett_cf = eszett.casefold()
eszett, eszett_cf

('ß', 'ss')

### Utility Functions for Normalized Text Matching

In [1]:
"""
Utility functions for normalized Unicode string comparison.

Using Normal Form C, case sensitive:
    >>> s1 = 'café'
    >>> s2 = 'cafe\u0301'
    >>> s1 == s2
    False
    >>> nfc_equal(s1, s2)
    True
    >>> nfc_equal('A', 'a')
    False

Using Normal Form C with case folding:
    >>> s3 = 'Straße'
    >>> s4 = 'strasse'
    >>> s3 == s4
    False
    >>> nfc_equal(s3, s4)
    False
    >>> fold_equal(s3, s4)
    True
    >>> fold_equal('A', 'a')
    True
"""

from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization (case sensitive)."""
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

def fold_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization and case folding."""
    folded = [normalize('NFC', text).casefold() for text in (str1, str2)]
    return folded[0] == folded[1]

In [3]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2

False

In [4]:
nfc_equal(s1, s2)

True

In [5]:
nfc_equal('A', 'a')

False

In [6]:
s3 = 'Straße'
s4 = 'strasse'
s3 == s4

False

In [7]:
nfc_equal(s3, s4)

False

In [8]:
fold_equal(s3, s4)

True

In [9]:
fold_equal('A', 'a')

True

### Extreme "Normalization": Taking Out Diacritics

In [None]:
import unicodedata
import string


# Remove all diacritics from a str
def shave_marks(txt):
    """Return txt with every combining (diacritic) mark stripped.

    The text is decomposed (NFD) so base characters and marks become separate
    code points, the marks are dropped, and the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() returns 0 for characters that are not combining marks
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(kept))

In [None]:
# Remove combining marks from Latin characters
def shave_marks_latin(txt):
    """Return txt with combining marks removed only where they follow a Latin base letter.

    Marks attached to non-Latin base characters (and marks that appear before
    any base character) are kept. The result is recomposed with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin = False  # True while the most recent base char is an ASCII letter

    for ch in decomposed:
        if unicodedata.combining(ch):
            # a mark: keep it unless it decorates a Latin base character
            if not after_latin:
                kept.append(ch)
        else:
            # a new base character: always kept, and it decides the fate of following marks
            kept.append(ch)
            after_latin = ch in string.ascii_letters

    return unicodedata.normalize('NFC', ''.join(kept))

# check page 142 for more details

## Sorting Unicode Text

In [1]:
fruits = ['cajú', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)

['acerola', 'atemoia', 'açaí', 'cajá', 'cajú']

In [2]:
# The standard way to sort non-ASCII text in Python is to use locale.strxfrm which, according
# to the docs, transforms a string to one that can be used in locale-aware comparisons.
import locale

my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print(my_locale)

fruits = ['cajú', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print(sorted_fruits)

pt_BR.UTF-8
['açaí', 'acerola', 'atemoia', 'cajá', 'cajú']


### Sorting with the Unicode Collation Algorithm

In [1]:
# pip install pyuca: a pure-Python implementation of the Unicode Collation Algorithm
import pyuca

coll = pyuca.Collator()
fruits = ['cajú', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
print(sorted_fruits)

['açaí', 'acerola', 'atemoia', 'cajá', 'cajú']


## The Unicode Database

### Finding Characters by Name

In [5]:
from unicodedata import name

print(name('🨁'))
print(name('😺'))

NEUTRAL CHESS QUEEN
SMILING CAT FACE WITH OPEN MOUTH


In [10]:
import sys
import unicodedata

START, END = ord(' '), sys.maxunicode + 1  # default scan: from the space character through the last code point

def find(*query_words, start=START, end=END):
    """Print every character in range(start, end) whose name contains all query words.

    Query words are upper-cased and compared against the whitespace-separated
    words of each character's Unicode name. Each hit is printed as one
    tab-separated line: code point, the character, and its name.
    """
    wanted = {word.upper() for word in query_words}
    for codepoint in range(start, end):
        glyph = chr(codepoint)
        label = unicodedata.name(glyph, None)  # None when the code point has no name
        if not label:
            continue
        if wanted <= set(label.split()):
            print(f'U+{codepoint:04X}\t{glyph}\t{label}')

def main(words):
    """Search for characters matching all of `words`, or print a hint when none are given."""
    if not words:
        print('Please provide words to find.')
        return
    find(*words)

main(('o', 'rato', '🐀', 'roel', 'a', 'roupa', 'do', 'rei', '👑', 'açaí', 'ß'))

### Numeric Meaning of Characters

In [1]:
import unicodedata
import re

re_digit = re.compile(r'\d')
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print(f'U+{ord(char):04x}',  #1
          char.center(6),  #2
          're_dig' if re_digit.match(char) else '-',  #3
          'isdig' if char.isdigit() else '-',  #4
          'isnum' if char.isnumeric() else '-',  #5
          f'{unicodedata.numeric(char):5.2f}',  #6
          unicodedata.name(char),  #7
          sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


## Dual-Mode str and bytes APIs

### str Versus bytes in Regular Expressions

In [3]:
import re

re_numbers_str = re.compile(r'\d+')  #1
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')  #2
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"  #3
            " as 1729 = 1³ + 12³ = 9³ + 10³.")  #4

text_bytes = text_str.encode('utf_8')  #5

print(f'Text\n {text_str!r}')
print('Numbers')
print('  str  :', re_numbers_str.findall(text_str))  #6
print('  bytes:', re_numbers_bytes.findall(text_bytes))  #7
print('Words')
print('  str  :', re_words_str.findall(text_str))  #8
print('  bytes:', re_words_bytes.findall(text_bytes))  #9


Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


### str Versus bytes in os Functions

In [4]:
import os

os.listdir('.')

['cafe.txt', 'dummy', 'unicode_text_vs_bytes.ipynb']

In [5]:
os.listdir(b'.')

[b'cafe.txt', b'dummy', b'unicode_text_vs_bytes.ipynb']