### 예제10. 플랫폼 인코딩 문제 (리눅스)

In [1]:
# Write 'café' to disk encoded as UTF-8; write() returns the number of
# characters written.
# NOTE(review): the file object is never closed explicitly; a `with` block
# would guarantee the data is flushed before the next cell reads the file.
open('cafe.text', 'w', encoding='utf8').write('café')

4

In [2]:
# Read the file back without an explicit encoding, so open() falls back to the
# platform-dependent default encoding instead of the 'utf8' used for writing.
open('cafe.text').read()

'café'

### 예제11. 인코딩 기본값 알아보기

In [3]:
import sys, locale

# Expressions whose values reveal the default encodings in effect for files,
# the standard streams, and the interpreter itself. One expression per line,
# with no internal whitespace, so str.split() yields exactly one item each.
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# The file is opened without an explicit encoding on purpose: the point is to
# see which default open() picks. The `with` block closes the handle once the
# report has been printed instead of leaking it.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is acceptable here only because every expression comes from
        # the hard-coded literal above, never from external input.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'utf-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'utf-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


### 예제12. 정규화 방식 비교

In [11]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'

len(normalize('NFC',s1)), len(normalize('NFC',s2))

(4, 4)

In [12]:
len(normalize('NFD',s1)), len(normalize('NFD',s2))

(5, 5)

In [13]:
normalize('NFC',s1), normalize('NFC',s2)

('café', 'café')

In [14]:
normalize('NFD',s1), normalize('NFD',s2)

('café', 'café')

In [15]:
normalize('NFC',s1) == normalize('NFC',s2)

True

In [16]:
normalize('NFD',s1) == normalize('NFD',s2)

True

### 예제13. NFC에 의해 다른 문자로 정규화되는 경우

In [5]:
from unicodedata import normalize, name

ohm = '\u2126'
name(ohm)

'OHM SIGN'

In [6]:
ohm_c = normalize('NFC', ohm)
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [7]:
ohm, ohm_c, ohm == ohm_c

('Ω', 'Ω', False)

In [8]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

### 예제14. 케이스 폴딩 예

In [9]:
eszett = 'ß'
name(eszett)

'LATIN SMALL LETTER SHARP S'

In [10]:
eszett_cf = eszett.casefold()
eszett_cf2 = eszett.lower()
eszett, eszett_cf, eszett_cf2

('ß', 'ss', 'ß')

### 예제15. 정규화된 유니코드 문자열 비교

In [17]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2

False

In [19]:
from unicodedata import normalize


def nfc_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization.

    The comparison is case-sensitive: only differences between composed and
    decomposed forms of the same text are ignored.
    """
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

def fold_equal(str1, str2):
    """Return True when both strings match after NFC normalization and case folding.

    Each argument is NFC-normalized first and then passed through
    str.casefold(), so the comparison ignores case as well as the
    composed/decomposed distinction.
    """
    folded_first = normalize('NFC', str1).casefold()
    folded_second = normalize('NFC', str2).casefold()
    return folded_first == folded_second
    
nfc_equal(s1, s2)

True

In [20]:
nfc_equal('A', 'a')

False

In [21]:
s3 = 'Straße'
s4 = 'strasse'

s3 == s4

False

In [22]:
nfc_equal(s3, s4)

False

In [23]:
fold_equal(s3, s4)

True

In [24]:
fold_equal(s1, s2)

True

In [25]:
fold_equal('A', 'a')

True

### 예제16. 결합 표시를 모두 제거하는 함수 및 예제

In [32]:
import unicodedata
import string

def shave_marks(txt):
    """Remove all diacritic marks."""
    # Decompose so that every diacritic becomes a separate combining character.
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() returns 0 for non-combining characters; keep only those.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    # Recompose whatever is left into NFC form.
    return unicodedata.normalize('NFC', ''.join(base_chars))

order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
shave_marks(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [33]:
Greek = 'Ζέφυρος, Zéfiro'
shave_marks(Greek)

'Ζεφυρος, Zefiro'

In [29]:
shave_marks('Straße')

'Straße'

In [28]:
shave_marks('café')

'cafe'

### 예제17. 라틴 문자에서 결합 표시를 모두 제거하는 함수

In [30]:
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters.

    Combining marks that follow a non-Latin base character (for example a
    Greek letter) are kept, so only text built on ASCII letters is changed.
    """
    decomposed = unicodedata.normalize('NFD', txt)

    kept = []
    after_latin = False  # True while the most recent base char is an ASCII letter
    for ch in decomposed:
        if not unicodedata.combining(ch):
            # A new base character: always kept, and it decides the fate of
            # the combining marks that follow it.
            after_latin = ch in string.ascii_letters
            kept.append(ch)
        elif not after_latin:
            # Combining mark attached to a non-Latin base: preserve it.
            kept.append(ch)

    return unicodedata.normalize('NFC', ''.join(kept))

### 예제18. 서양 활자 기호를 아스키 코드로 변환

In [34]:
# One-to-one replacements: each typographic symbol in the first string maps
# to the ASCII character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„†ˆ‹‘’“”•–—˜›""",
    """'f"*^<''""---~>""",
)

# Full translation table: the one-to-many replacements (a single symbol
# expanding to a multi-character ASCII string) followed by every entry of
# single_map.
multi_map = {
    **str.maketrans({
        '€': '<euro>',
        '…': '...',
        'Œ': 'OE',
        '™': '(TM)',
        'œ': 'oe',
        '‰': '<per mille>',
        '‡': '**',
    }),
    **single_map,
}

def dewinize(txt):
    """Replace the typographic symbols listed in multi_map with ASCII text.

    Characters that have no entry in multi_map are returned unchanged.
    """
    translated = txt.translate(multi_map)
    return translated

def asciize(txt):
    """Convert text to a mostly-ASCII approximation.

    Steps, in order: replace typographic symbols via dewinize(), strip
    diacritics from Latin letters via shave_marks_latin(), expand 'ß' to
    'ss', and finally apply NFKC normalization to compose characters with
    their compatibility equivalents.
    """
    shaved = shave_marks_latin(dewinize(txt))
    without_eszett = shaved.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', without_eszett)

In [35]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
dewinize(order)

'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'

In [36]:
asciize(order)

'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

### 예제19. 비아스키 문자 정렬 예제

In [31]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)

['acerola', 'atemoia', 'açaí', 'caju', 'cajá']

### 예제20. pyuca.Collator.sort_key() 메서드 활용

In [38]:
!pip install pyuca


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [39]:
import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

### 예제21. 유니코드 데이터베이스 수치형 문자 메타데이터 사용 예

In [40]:
import unicodedata
import re

# Pattern matching a single character that the `re` module treats as a digit.
re_digit = re.compile(r'\d')
# Characters that all carry a numeric value in the Unicode database.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# Print one tab-separated row per character: code point, the character
# itself, three "is it a number?" verdicts, its numeric value, and its name.
for char in sample:
    code_point = 'U+%04x' % ord(char)
    regex_flag = 're_dig' if re_digit.match(char) else '-'
    digit_flag = 'isdig' if char.isdigit() else '-'
    numeric_flag = 'isnum' if char.isnumeric() else '-'
    numeric_value = format(unicodedata.numeric(char), '5.2f')
    print(code_point, char.center(6), regex_flag, digit_flag, numeric_flag,
          numeric_value, unicodedata.name(char), sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


### 예제22. ramanujan.py: 간단한 str과 bytes 정규 표현식의 동작 비교

In [41]:
import re

# The same two patterns compiled twice: once as str patterns (Unicode-aware)
# and once as bytes patterns (ASCII-only matching).
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# NOTE(review): the two adjacent literals are concatenated with no space
# between the Tamil digits and "as", so the str pattern \w+ reports them as
# a single word. Confirm whether a leading space was intended.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            "as 1729 = 1³ + 12³ = 9³ + 10³.")

# The bytes patterns need bytes input, so search the UTF-8 encoded text.
text_bytes = text_str.encode('utf_8')

print('Text', repr(text_str), sep='\n  ')
# Report the str and bytes matches side by side for each kind of pattern.
for heading, str_pattern, bytes_pattern in (
        ('Numbers', re_numbers_str, re_numbers_bytes),
        ('Words', re_words_str, re_words_bytes)):
    print(heading)
    print('  str  :', str_pattern.findall(text_str))
    print('  bytes:', bytes_pattern.findall(text_bytes))

Text
  'Ramanujan saw ௧௭௨௯as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


### 예제23. str과 bytes 인수로 호출한 listdir() 메서드와 결과

In [42]:
import os
os.listdir('.')

['project2.ipynb',
 'sejong.txt',
 'dummy',
 'project4.ipynb',
 'project1.ipynb',
 'cafe.text',
 'project3.ipynb',
 'project5.ipynb',
 '6주차_실습.ipynb',
 'README.md',
 '.gitignore',
 'simpsons.gif',
 'floats.bin',
 '.git',
 '.vscode']

In [43]:
os.listdir(b'.')

[b'project2.ipynb',
 b'sejong.txt',
 b'dummy',
 b'project4.ipynb',
 b'project1.ipynb',
 b'cafe.text',
 b'project3.ipynb',
 b'project5.ipynb',
 b'6\xe1\x84\x8c\xe1\x85\xae\xe1\x84\x8e\xe1\x85\xa1_\xe1\x84\x89\xe1\x85\xb5\xe1\x86\xaf\xe1\x84\x89\xe1\x85\xb3\xe1\x86\xb8.ipynb',
 b'README.md',
 b'.gitignore',
 b'simpsons.gif',
 b'floats.bin',
 b'.git',
 b'.vscode']

### 예제24. surrogateescape 에러처리

In [47]:
os.listdir('./fluent_python')

['digits-of-π.txt']

In [48]:
os.listdir(b'./fluent_python')

[b'digits-of-\xcf\x80.txt']

In [49]:
# Take one file name as bytes and decode it as ASCII; the 'surrogateescape'
# handler maps any non-ASCII byte to a lone surrogate instead of raising.
# NOTE(review): index [1] of os.listdir(b'.') depends on directory order and,
# per the recorded output, selects the pure-ASCII name 'sejong.txt', so the
# surrogateescape handler is never exercised. The non-ASCII name listed above
# lives in './fluent_python'; os.listdir(b'./fluent_python')[0] was probably
# intended. Confirm before changing.
pi_name_bytes = os.listdir(b'.')[1]
pi_name_str = pi_name_bytes.decode('ascii', 'surrogateescape')
pi_name_str

'sejong.txt'