## 4.1 문자 문제

In [5]:
s = 'café'
len(s)

4

In [8]:
b = s.encode('utf8')
b

b'caf\xc3\xa9'

In [9]:
len(b)

5

In [10]:
b.decode('utf8')

'café'

## 4.2 바이트에 대한 기본 지식

In [13]:
cafe = bytes('café', encoding='utf8')
cafe

b'caf\xc3\xa9'

In [14]:
cafe[0]

99

In [15]:
cafe[:1]

b'c'

In [16]:
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'caf\xc3\xa9')

In [19]:
cafe_arr[0]=98
cafe_arr, cafe

(bytearray(b'baf\xc3\xa9'), b'caf\xc3\xa9')

In [23]:
bytes.fromhex('31 4b CD A9')

b'1K\xcd\xa9'

In [24]:
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octest = bytes(numbers)
octest

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

## 4.3 기본 인코더/디코더

In [31]:
# Encode one sample string with three codecs and print each byte sequence
# next to the codec name, separated by a tab.
sample_text = 'El Niño'
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = sample_text.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


## 4.4 인코딩/디코딩 문제 이해하기

In [34]:
city = 'São Paulo'
city.encode('utf8')

b'S\xc3\xa3o Paulo'

In [36]:
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [37]:
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [38]:
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [40]:
city.encode('cp437', errors='ignore')

b'So Paulo'

In [41]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [42]:
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [44]:
octest = b'Montr\xe9al'
octest.decode('cp1252')

'Montréal'

In [45]:
octest.decode('iso8859_7')

'Montrιal'

In [46]:
octest.decode('koi8_r')

'MontrИal'

In [47]:
octest.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [48]:
octest.decode('utf_8', errors='replace')

'Montr�al'

In [50]:
ε = 10**-6
ε

1e-06

In [54]:
u16 = 'El Niño'.encode('utf-16')
u16, list(u16)

(b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00',
 [255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0])

In [56]:
u16le = 'El Niño'.encode('utf-16le')
list(u16le)

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [57]:
u16be = 'El Niño'.encode('utf-16be')
list(u16be)

[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

## 4.5 텍스트 파일 다루기

In [58]:
# Write the sample text as UTF-8. The context manager flushes and closes the
# handle deterministically instead of leaving that to garbage collection.
with open('cafe.txt', 'w', encoding='utf-8') as out_file:
    n_chars = out_file.write('café')
# write() returns the number of characters written (not bytes); show it as the cell result.
n_chars

4

In [59]:
# No encoding argument is passed here, so open() falls back to its default
# encoding (the same call later in this notebook reports encoding='UTF-8').
# NOTE(review): the result matches the written text only when that default is UTF-8.
open('cafe.txt').read()

'café'

In [60]:
fp = open('cafe.txt', 'w', encoding='utf-8')
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf-8'>

In [62]:
fp.write('café')

4

In [63]:
fp.close()

In [64]:
import os
os.stat('cafe.txt').st_size

5

In [65]:
fp = open('cafe.txt')
fp

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='UTF-8'>

In [66]:
fp.encoding

'UTF-8'

In [67]:
fp.read()

'café'

In [68]:
fp4= open('cafe.txt', 'rb')
fp4

<_io.BufferedReader name='cafe.txt'>

In [69]:
fp4.read()

b'caf\xc3\xa9'

In [91]:
import locale
import sys

# Expressions to probe. Each whitespace-separated item is evaluated verbatim
# and printed right-aligned next to its result.
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# 'dummy' is opened without an explicit encoding on purpose: it is the object
# inspected by the `type(my_file)` expression. The context manager makes sure
# the handle is closed once the report has been printed.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is acceptable here only because it runs the fixed literal
        # above; never feed it external input.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


## 4.6 제대로 비교하기 위해 유니코드 정규화하기

In [94]:
s1 = 'café'
s2 = 'cafe\u0301'
s1, s2

('café', 'café')

In [95]:
len(s1), len(s2)

(4, 5)

In [96]:
s1 == s2

False

In [98]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
len(s1), len(s2)

(4, 5)

In [99]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [100]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [101]:
normalize('NFC', s1) == normalize('NFC', s2)

True

In [102]:
normalize('NFD', s1) == normalize('NFD', s2)

True

In [109]:
normalize('NFD', s1).encode(), normalize('NFC', s1).encode()

(b'cafe\xcc\x81', b'caf\xc3\xa9')

In [110]:
help(normalize)

Help on built-in function normalize in module unicodedata:

normalize(form, unistr, /)
    Return the normal form 'form' for the Unicode string unistr.
    
    Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.



In [112]:
from unicodedata import normalize, name
ohm = '\u2126'
name(ohm)

'OHM SIGN'

In [113]:
ohm_c = normalize('NFC', ohm)
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [114]:
ohm == ohm_c

False

In [115]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [122]:
from unicodedata import normalize, name
half = '½'
normalize('NFC', half)

'½'

In [126]:
# '4²' ends with SUPERSCRIPT TWO (U+00B2); NFKC compatibility normalization
# replaces it with the plain digit '2', which is what this cell demonstrates.
four_squared = '4²'
normalize('NFKC', four_squared)

'42'

In [134]:
micro = 'µ'
micro_kc = normalize('NFKC', micro)
micro, micro_kc

('µ', 'μ')

In [135]:
ord(micro), ord(micro_kc)

(181, 956)

In [136]:
name(micro), name(micro_kc)

('MICRO SIGN', 'GREEK SMALL LETTER MU')

In [137]:
micro = 'µ'
name(micro)

'MICRO SIGN'

In [140]:
micro_cf = micro.casefold()
name(micro_cf)

'GREEK SMALL LETTER MU'

In [141]:
micro, micro_cf

('µ', 'μ')

In [142]:
eszett = 'ß'
name(eszett)

'LATIN SMALL LETTER SHARP S'

In [144]:
eszett_cf = eszett.casefold()
eszett, eszett_cf

('ß', 'ss')

In [147]:
from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization.

    The comparison is case-sensitive.
    """
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

def fold_equal(str1, str2):
    """Return True when both strings match after NFC normalization and case folding."""
    folded = [normalize('NFC', text).casefold() for text in (str1, str2)]
    return folded[0] == folded[1]

s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2

False

In [149]:
nfc_equal(s1, s2)

True

In [150]:
nfc_equal('A', 'a')

False

In [152]:
s3 = 'straße'
s4 = 'strasse'
s3 == s4

False

In [153]:
nfc_equal(s3, s4)

False

In [154]:
fold_equal(s3, s4)

True

In [155]:
fold_equal(s1, s2)

True

In [156]:
fold_equal('A', 'a')

True

In [162]:
import unicodedata
import string

def shave_marks(txt):
    """Return *txt* with every combining mark removed.

    The text is decomposed (NFD), characters whose combining class is
    non-zero are dropped, and the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
shave_marks(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [167]:
Greek = 'Ζέφυρος, Zéfiro'
shave_marks(Greek)

'Ζεφυρος, Zefiro'

In [168]:
def shave_marks_latin(txt):
    """Return *txt* with combining marks removed from ASCII-letter bases only.

    Marks that follow any other base character (for example a Greek letter)
    are kept. The text is processed in NFD form and returned in NFC form.
    """
    kept = []
    after_latin = False  # True while the most recent base char is an ASCII letter
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            if after_latin:
                # Mark attached to a Latin base: drop it.
                continue
            kept.append(ch)
        else:
            # A new base character: keep it and remember whether it is Latin.
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))

shave_marks_latin(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [169]:
shave_marks_latin(Greek)

'Ζέφυρος, Zefiro'

In [170]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [176]:
# Character-to-character replacements: each symbol in the first string maps
# to the ASCII character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„†ˆ‹‘’“”•–—˜›""",
    """'f"*^<''""---~>""",
)

# Character-to-string replacements, merged with the single-character table
# into one translation table.
multi_map = {
    **str.maketrans({
        '€': '<euro>',
        '…': '...',
        'Œ': 'OE',
        '™': '(TM)',
        'œ': 'oe',
        '‰': '<per mille>',
        '‡': '**',
    }),
    **single_map,
}


def dewinize(txt):
    """Return *txt* with the symbols listed in ``multi_map`` replaced by ASCII text."""
    translated = txt.translate(multi_map)
    return translated

def asciize(txt):
    """Return *txt* with symbols replaced, Latin diacritics removed and 'ß' expanded.

    Steps, in order: ``dewinize``, ``shave_marks_latin``, replacement of
    'ß' by 'ss', then NFC normalization of the result.
    """
    cleaned = dewinize(txt)
    stripped = shave_marks_latin(cleaned)
    expanded = stripped.replace('ß', 'ss')
    return unicodedata.normalize('NFC', expanded)

dewinize(order)

'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'

In [177]:
asciize(order)

'"Herr Voss: - ½ cup of OEtker(TM) caffe latte - bowl of acai."'

## 4.7 유니코드 텍스트 정렬하기

In [178]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'aceroal']
sorted(fruits)

['aceroal', 'atemoia', 'açaí', 'caju', 'cajá']

In [184]:
import locale

locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')

'pt_BR.UTF-8'

In [185]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'aceroal']
sorted_fruits = sorted(fruits, key=locale.strxfrm)
sorted_fruits

['aceroal', 'atemoia', 'açaí', 'caju', 'cajá']

In [189]:
import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'aceroal']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits

['açaí', 'aceroal', 'atemoia', 'cajá', 'caju']

## 4.8 유니코드 데이터베이스

In [213]:
import unicodedata
import re

re_digit = re.compile(r'\d')

# One character per row: an ASCII digit plus several other numeric characters.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# For every character print its code point, the character itself, three
# "is it a number?" checks and its numeric value, tab-separated.
for char in sample:
    code_point = f'U+{ord(char):04x}'
    padded = char.center(6)
    flags = [
        're_dig' if re_digit.match(char) else '-',
        'isdig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
    ]
    numeric_value = f'{unicodedata.numeric(char):5.2f}'
    print(code_point, padded, *flags, numeric_value, sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00
U+00bc	  ¼   	-	-	isnum	 0.25
U+00b2	  ²   	-	isdig	isnum	 2.00
U+0969	  ३   	re_dig	isdig	isnum	 3.00
U+136b	  ፫   	-	isdig	isnum	 3.00
U+216b	  Ⅻ   	-	-	isnum	12.00
U+2466	  ⑦   	-	isdig	isnum	 7.00
U+2480	  ⒀   	-	-	isnum	13.00
U+3285	  ㊅   	-	-	isnum	 6.00


## 4.9 이중 모드 str 및 bytes API

In [226]:
import re

# The same two patterns, compiled once for str input and once for bytes input.
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Sample text: the number 1729 written with four escaped non-ASCII digit
# characters, followed by the same value in ASCII digits and superscripts.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf-8')

print('Text', repr(text_str), sep='\n  ')
# Print the str and bytes matches side by side for each pattern pair.
for heading, str_pattern, bytes_pattern in (
    ('Numbers', re_numbers_str, re_numbers_bytes),
    ('Words', re_words_str, re_words_bytes),
):
    print(heading)
    print('   str   :', str_pattern.findall(text_str))
    print('   bytes :', bytes_pattern.findall(text_bytes))

Text
  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
   str   : ['௧௭௨௯', '1729', '1', '12', '9', '10']
   bytes : [b'1729', b'1', b'12', b'9', b'10']
Words
   str   : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
   bytes : [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


In [227]:
os.listdir('.')

['floats-10M.npy',
 'dummy',
 'Chapter2_시퀀스.ipynb',
 'cafe.txt',
 'Chapter1_파이썬데이터모델.ipynb',
 'Chapter4_텍스트와 바이트.ipynb',
 'zen.txt',
 '.ipynb_checkpoints',
 'floats.bin',
 'Chapter3_딕셔너리와 집합.ipynb']

In [235]:
os.listdir(b'.')

[b'floats-10M.npy',
 b'dummy',
 b'Chapter2_\xec\x8b\x9c\xed\x80\x80\xec\x8a\xa4.ipynb',
 b'cafe.txt',
 b'Chapter1_\xed\x8c\x8c\xec\x9d\xb4\xec\x8d\xac\xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0\xeb\xaa\xa8\xeb\x8d\xb8.ipynb',
 b'Chapter4_\xed\x85\x8d\xec\x8a\xa4\xed\x8a\xb8\xec\x99\x80 \xeb\xb0\x94\xec\x9d\xb4\xed\x8a\xb8.ipynb',
 b'zen.txt',
 b'.ipynb_checkpoints',
 b'floats.bin',
 b'Chapter3_\xeb\x94\x95\xec\x85\x94\xeb\x84\x88\xeb\xa6\xac\xec\x99\x80 \xec\xa7\x91\xed\x95\xa9.ipynb']

In [240]:
# os.listdir() returns entries in arbitrary order, so select the notebook by
# its name prefix instead of relying on its position in the listing.
kor_name_bytes = next(
    entry for entry in os.listdir(b'.') if entry.startswith(b'Chapter2_')
)
# Decode as ASCII; 'surrogateescape' turns each non-ASCII byte into a lone
# surrogate code point so the original bytes can be recovered on re-encoding.
kor_name_str = kor_name_bytes.decode('ascii', 'surrogateescape')
kor_name_str

'Chapter2_\udcec\udc8b\udc9c\udced\udc80\udc80\udcec\udc8a\udca4.ipynb'

In [241]:
kor_name_str.encode('ascii', 'surrogateescape')

b'Chapter2_\xec\x8b\x9c\xed\x80\x80\xec\x8a\xa4.ipynb'