In [1]:
# A plain ASCII str; len() reports its number of characters.
s = 'cafe'
len(s)

4

In [4]:
# Encode the str into a bytes object using the UTF-8 codec.
b = s.encode('utf8')
b

b'cafe'

In [5]:
# Decode the UTF-8 bytes back into a str (round trip of the encode above).
b.decode('utf8')

'cafe'

In [6]:
# bytes can also be built directly from a str plus an encoding name.
cafe = bytes('cafe', encoding='utf_8')
cafe

b'cafe'

In [7]:
# Indexing a bytes object yields an int (the byte value), not a 1-byte bytes.
cafe[0]

99

In [8]:
# Slicing a bytes object yields bytes, even when the slice holds a single byte.
cafe[:1]

b'c'

In [9]:
# Build a bytearray from the contents of the bytes object.
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'cafe')

In [10]:
# A slice of a bytearray is itself a bytearray.
cafe_arr[-1:]

bytearray(b'e')

In [11]:
# Parse space-separated pairs of hex digits into bytes; bytes in the printable
# ASCII range are shown as characters (0x31 -> '1', 0x4B -> 'K') in the repr.
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

In [12]:
import array

# Typecode 'h' creates an array of signed short integers.
numbers = array.array('h', [-2, -1, 0, 1, 2])
# bytes() copies the raw buffer of the array: 5 items become 10 bytes.
# The byte order of each item follows the machine that ran the cell
# (the recorded output is little-endian).
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [13]:
# Encode the same ASCII-only text with three codecs and print the raw bytes.
# latin_1 and utf_8 give identical output for this text; utf_16 prefixes a
# byte-order mark and uses two bytes per character.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = 'dedade'.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'dedade'
utf_8	b'dedade'
utf_16	b'\xff\xfed\x00e\x00d\x00a\x00d\x00e\x00'


In [14]:
# casefold() returns a copy of the text suitable for caseless comparison;
# for this ASCII name the result is simply the lowercase form.
name = 'Lee Jong Heon'
name.casefold()

'lee jong heon'

In [19]:
from unicodedata import normalize


def nfc_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization.

    Strings that differ only in how a character is encoded (precomposed
    versus base character plus combining mark) compare equal; case is
    still significant.
    """
    composed1 = normalize('NFC', str1)
    composed2 = normalize('NFC', str2)
    return composed1 == composed2

def fold_equal(str1, str2):
    """Return True when both strings match after NFC normalization and case folding.

    Each argument is normalized to NFC and then passed through
    str.casefold() before the comparison, so the check ignores both
    composition differences and case.
    """
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2

# NOTE(review): both literals render identically here; if they were meant to
# be two different code points that normalize to the same character, confirm
# against the original notebook.
s1 = 'k'
s2 = 'k'

# Compare the two strings after NFC normalization.
nfc_equal(s1, s2)

True

In [21]:
import unicodedata, string

def shave_marks(txt):
    """Return txt with every combining mark removed.

    The text is decomposed (NFD) so that diacritics become separate
    combining characters, those characters are dropped, and the remainder
    is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    # unicodedata.combining() returns 0 for characters that are not combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

# Sample text mixing accented Latin letters with symbols that are not
# combining marks. Only the diacritics are removed (è -> e, ç -> c, í -> i);
# ß, •, ½, Œ and ™ pass through unchanged.
order = "Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí."
shave_marks(order)
    

'Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.'

In [22]:
# shave_marks is not limited to Latin letters: the Greek έ loses its accent too.
greek = 'Ζέφυρος, Zéfiro'
shave_marks(greek)

'Ζεφυρος, Zefiro'

In [31]:
# 라틴 문자에서 결합 표시 기호를 제거하는 함수
"""
unicodedata.normalize()?

- 유니코드 문자열을 특정 형태로 정규화하는데 사용
- 유니코드 정규화는 문자열의 일관성을 보장하고, 비교나 검색 작업을 수행할 때 유용함
    - (1) NFC : 가능한 한 유니코드 문자들을 조합 문자(composed character)로 표현
    - (2) NFD : 가능한 한 유니코드 문자들을 분해 문자(decomposed character)로 표현
    - (3) NFKC : 호환성 문자(compatibility character)를 포함하여 가능한 한 조합 문자로 표현
    - (4) NFKD : 호환성 문자를 포함하여 가능한 한 분해 문자로 표현
    
    
unicodedata.combining
- 특정 유니코드 문자가 결합 문자(combining character)인지 여부를 확인하는 데 사용
"""

def shave_marks_latin(txt):
    """Return txt with combining marks removed from ASCII Latin base letters only.

    The text is decomposed (NFD) and scanned character by character. A
    combining mark is dropped only when the most recent non-combining
    character is in string.ascii_letters; marks attached to any other base
    character are kept. The surviving characters are recomposed (NFC).

    Side effect: prints the decomposed text (prefixed with "> ") and then
    every decomposed character on its own line (prefixed with ">> ").
    """
    decomposed = unicodedata.normalize('NFD', txt)
    print("> " + decomposed)

    kept = []
    after_latin_base = False  # True while the current base character is an ASCII letter
    for ch in decomposed:
        print(">> " + ch)
        if unicodedata.combining(ch):
            # A mark survives only when it does not follow a Latin base letter.
            if not after_latin_base:
                kept.append(ch)
        else:
            # A new base character: always kept, and it decides the fate of
            # the marks that follow it.
            kept.append(ch)
            after_latin_base = ch in string.ascii_letters

    return unicodedata.normalize('NFC', "".join(kept))

# Same sample text as before. The call prints its trace (the NFD text, then one
# line per decomposed character, where accents appear as separate combining
# characters) before returning the shaved string.
order = "Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí."
shave_marks_latin(order)

> Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.
>> H
>> e
>> r
>> r
>>  
>> V
>> o
>> ß
>> :
>>  
>> •
>>  
>> ½
>>  
>> c
>> u
>> p
>>  
>> o
>> f
>>  
>> Œ
>> t
>> k
>> e
>> r
>> ™
>>  
>> c
>> a
>> f
>> f
>> e
>> ̀
>>  
>> l
>> a
>> t
>> t
>> e
>>  
>> •
>>  
>> b
>> o
>> w
>> l
>>  
>> o
>> f
>>  
>> a
>> c
>> ̧
>> a
>> i
>> ́
>> .


'Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.'

In [33]:
# Translation table, character -> character: each symbol in the first string
# is replaced by the character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„†ˆ‹‘’“”•–—˜›""",
    """'f"*^<''""---~>""",
)

# Translation table, character -> string, for symbols that expand to several
# characters, merged with the single-character table above into one table.
multi_map = {
    **str.maketrans({
        '€': '<euro>',
        '…': '...',
        'Œ': 'OE',
        '™': '(TM)',
        'œ': 'oe',
        '‰': '<per mille>',
        '‡': '**',
    }),
    **single_map,
}


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences.

    Applies the module-level ``multi_map`` translation table to ``txt`` and
    returns the translated string; characters without an entry in the table
    are left as they are.
    """
    return txt.translate(multi_map)  # single pass over txt using the merged table


def asciize(txt):
    """Return txt reduced towards plain ASCII.

    Steps, in order: replace Win1252 symbols (dewinize), strip combining
    marks from Latin letters (shave_marks_latin), replace 'ß' with 'ss',
    then apply NFKC normalization to compose compatibility characters.

    The result is not guaranteed to be pure ASCII: for example NFKC turns
    '½' into '1⁄2', which contains a fraction-slash character. Because
    shave_marks_latin prints a trace, calling this function prints it too.
    """
    dewinized = dewinize(txt)                  # Win1252 symbols -> ASCII sequences
    without_marks = shave_marks_latin(dewinized)  # drop diacritics on Latin letters
    eszett_expanded = without_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', eszett_expanded)

In [34]:
# Only the symbols listed in the translation table change (• -> -, Œ -> OE,
# ™ -> (TM)); accented letters, ß and ½ are left alone.
dewinize(order)


'Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí.'

In [35]:
# Full pipeline: symbols replaced, accents removed, ß -> ss, ½ -> 1⁄2.
# The lines printed before the result are the trace from shave_marks_latin.
asciize(order)

> Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí.
>> H
>> e
>> r
>> r
>>  
>> V
>> o
>> ß
>> :
>>  
>> -
>>  
>> ½
>>  
>> c
>> u
>> p
>>  
>> o
>> f
>>  
>> O
>> E
>> t
>> k
>> e
>> r
>> (
>> T
>> M
>> )
>>  
>> c
>> a
>> f
>> f
>> e
>> ̀
>>  
>> l
>> a
>> t
>> t
>> e
>>  
>> -
>>  
>> b
>> o
>> w
>> l
>>  
>> o
>> f
>>  
>> a
>> c
>> ̧
>> a
>> i
>> ́
>> .


'Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai.'

In [36]:
# NOTE(review): `fruist` looks like a typo for `fruits`; the name is kept
# because the locale-sorting cell further down refers to it.
# All items are plain ASCII, so the default sort (by code point) already gives
# alphabetical order here.
fruist = ['caju', 'atemoia', 'caja', 'acai', 'acerola']
sorted(fruist)

['acai', 'acerola', 'atemoia', 'caja', 'caju']

In [38]:
import locale

"""
locale.strxfrm() 함수를 키로 사용하기 전에 setlocale(LC_COLLATE, <지역 언어>) 먼저 설정
"""



# locale.strxfrm only produces language-aware sort keys after LC_COLLATE has
# been set, so the locale is selected first. Note that setlocale changes the
# collation for the whole process, not just for this cell.
try:
    locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
except locale.Error:
    # The pt_BR.UTF-8 locale is not installed on every system; without this
    # guard the cell aborts and the notebook cannot be re-run from the top.
    print("Locale 'pt_BR.UTF-8' is not available; "
          "sorting with the current LC_COLLATE setting instead.")

# Every item in the list is plain ASCII, so the locale-aware order matches
# the plain sorted() order.
sorted_fruits = sorted(fruist, key=locale.strxfrm)
sorted_fruits

['acai', 'acerola', 'atemoia', 'caja', 'caju']

In [40]:
import re

# Each pattern is compiled twice. With a str pattern, \d and \w match Unicode
# digits and word characters; with a bytes pattern they match ASCII only.
re_numbers_str = re.compile(r'\d+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_str = re.compile(r'\w+')
re_words_bytes = re.compile(rb'\w+')

# Sample text: the number 1729 written in Tamil digits (given as \u escapes),
# followed by ASCII digits and superscript-three characters.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

# The bytes patterns need bytes to search, so encode the same text as UTF-8.
text_bytes = text_str.encode('utf_8')

print('Text', repr(text_str), sep='\n  ')
# For each pattern pair, show the str matches and then the bytes matches.
for heading, str_pattern, bytes_pattern in (
    ('Numbers', re_numbers_str, re_numbers_bytes),
    ('Words', re_words_str, re_words_bytes),
):
    print(heading)
    print('  str  :', str_pattern.findall(text_str))
    print('  bytes:', bytes_pattern.findall(text_bytes))

Text
  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']
