In [2]:
# Example 1. Encoding and decoding

s = 'café'
len(s)  # length of a str is counted in Unicode characters, not bytes

4

In [3]:
# Encode the str to bytes using UTF-8; the bare name on the last line displays the result.
b = s.encode('utf8')
b

b'caf\xc3\xa9'

In [4]:
# Length of a bytes object is a byte count: 'é' takes two bytes in UTF-8,
# so this is one more than len(s).
len(b)

5

In [5]:
# Decode the UTF-8 bytes back into a str.
b.decode('utf8')

'café'

In [6]:
# Example 2. A five-byte sequence stored as bytes and as bytearray

cafe = bytes('café', encoding='utf_8')  # bytes built from a str plus an encoding
cafe

b'caf\xc3\xa9'

In [7]:
# Indexing a bytes object yields a single item as an int.
cafe[0]

99

In [8]:
# Slicing a bytes object yields bytes, even when the slice is one byte long.
cafe[:1]

b'c'

In [9]:
# Build a bytearray (the mutable counterpart of bytes) from the bytes object.
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'caf\xc3\xa9')

In [10]:
# Slicing a bytearray yields another bytearray.
cafe_arr[-1:]

bytearray(b'\xa9')

In [11]:
# Example 3. Building a binary sequence by parsing pairs of hex digits

# Spaces between the hex pairs are skipped; 0x31 and 0x4B are shown as the
# ASCII characters '1' and 'K' in the bytes repr.
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

In [12]:
# Example 4. Initializing bytes from the raw data of an array

import array

# Typecode 'h' makes an array of signed short integers.
numbers = array.array('h', [-2,-1,0,1,2])
numbers

array('h', [-2, -1, 0, 1, 2])

In [13]:
# Copy of the bytes that make up the array: 5 items, 2 bytes each in the
# recorded output. Byte order is that of the machine that ran the cell
# (little-endian in the output shown).
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [16]:
# Example 5. Inspecting a GIF image header with memoryview and struct

import struct

# '<'  little-endian
# 3s3s two sequences of 3 bytes (file type and version)
# HH   two 16-bit unsigned integers (width and height)
fmt = '<3s3sHH'

# Path is relative to the notebook's working directory.
with open('./simpsons.gif', 'rb') as fp:
    img = memoryview(fp.read())

# Slice exactly as many bytes as fmt describes (10) rather than hard-coding
# the count, so the slice stays in sync with fmt. Slicing a memoryview
# creates another memoryview over the same buffer without copying bytes.
header = img[:struct.calcsize(fmt)]
bytes(header)

b'GIF89a\xf3\x01k\x01'

In [17]:
# Unpack the header bytes according to fmt into a tuple:
# (type, version, width, height).
struct.unpack(fmt, header)

(b'GIF', b'89a', 499, 363)

In [18]:
# Drop the references to both memoryview objects so the underlying buffer
# can be released.
del header
del img

In [19]:
# Example 6. The string 'El Niño' encoded with three codecs, producing
# three very different byte sequences

# One line per codec: codec name, a tab, then the repr of the encoded bytes.
for codec in ('latin_1', 'utf8', 'utf16'):
    print(f"{codec}\t{'El Niño'.encode(codec)}")

latin_1	b'El Ni\xf1o'
utf8	b'El Ni\xc3\xb1o'
utf16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [20]:
# Encode the single character 'A' with several codecs to compare the bytes
# each one produces; output is "codec<TAB>bytes" per line.
for codec in ('ascii', 'latin_1', 'cp1252', 'cp437', 'utf8', 'utf-16le', 'gb2312'):
    print(f"{codec}\t{'A'.encode(codec)}")

ascii	b'A'
latin_1	b'A'
cp1252	b'A'
cp437	b'A'
utf8	b'A'
utf-16le	b'A\x00'
gb2312	b'A'


In [21]:
# Example 7. Encoding to bytes: success and error handling

city = 'São Paulo'
city.encode('utf-8')  # UTF-8 can encode this text; 'ã' becomes two bytes

b'S\xc3\xa3o Paulo'

In [22]:
# UTF-16 also succeeds; the output starts with a byte order mark
# (\xff\xfe in the recorded output).
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [23]:
# iso8859_1 has a single-byte code for 'ã' (\xe3), so this succeeds too.
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [24]:
# cp437 cannot represent 'ã', so a strict encode raises UnicodeEncodeError.
# The error is the point of this cell: catch it and print it as
# "ExceptionName: message" so that "Restart & Run All" does not stop here.
try:
    city.encode('cp437')
except UnicodeEncodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [25]:
# errors='ignore' silently drops characters that cannot be encoded,
# so data is lost without any warning.
city.encode('cp437', errors='ignore')

b'So Paulo'

In [26]:
# errors='replace' substitutes '?' for each character that cannot be encoded.
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [27]:
# errors='xmlcharrefreplace' substitutes an XML character reference
# (&#227; for 'ã') for each character that cannot be encoded.
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [29]:
# Example 8. Decoding from bytes to str: success and error handling

# Note: this rebinds `octets`, which an earlier cell used for the array bytes.
octets = b'Montr\xe9al'
octets.decode('cp1252')  # cp1252 maps byte 0xe9 to 'é'

'Montréal'

In [30]:
# Same bytes decoded with iso8859_7: 0xe9 maps to 'ι'. No error is raised,
# but the resulting text is wrong.
octets.decode('iso8859_7')

'Montrιal'

In [31]:
# Same bytes decoded with koi8_r: 0xe9 maps to 'И'. Again no error, wrong text.
octets.decode('koi8_r')

'MontrИal'

In [32]:
# The bytes are not valid UTF-8, so a strict decode raises UnicodeDecodeError.
# The error is the point of this cell: catch it and print it as
# "ExceptionName: message" so that "Restart & Run All" does not stop here.
try:
    octets.decode('utf8')
except UnicodeDecodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [33]:
# errors='replace' substitutes U+FFFD (the Unicode replacement character)
# for each byte sequence that cannot be decoded.
octets.decode('utf8', errors='replace')

'Montr�al'

In [34]:
# For comparison: in UTF-8 'é' is the two-byte sequence \xc3\xa9,
# not the single byte \xe9 found in `octets`.
'é'.encode('utf8')

b'\xc3\xa9'

In [35]:
# Example 9. BOM: garbled characters

# The plain utf16 codec prepends a byte order mark (b'\xff\xfe' in the
# recorded output) ahead of the encoded text.
u16 = 'El Niño'.encode('utf16')
u16

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [36]:
# As integers: the first two items (255, 254) are the BOM; after that each
# character takes two bytes, low byte first in the recorded output.
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [37]:
# utf_16le fixes the byte order explicitly (little-endian), so no BOM is emitted.
u16le = 'El Niño'.encode('utf_16le')
list(u16le)

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [38]:
# utf_16be fixes the byte order explicitly (big-endian): high byte first, no BOM.
u16be = 'El Niño'.encode('utf_16be')
list(u16be)

[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]