## 1. Pythonic Thinking

### 03 Know the Differences Between `bytes` and `str`

In [1]:
import logging

In [2]:
# A bytes literal holds raw 8-bit values; the \x65 escape is the byte for 'e'.
a = b'h\x65llo'
# Show the integer view of the bytes first, then the bytes repr.
print(list(a), a, sep='\n')

[104, 101, 108, 108, 111]
b'hello'


In [3]:
# A str holds Unicode code points; U+0300 is a combining grave accent, so the
# accented letter is stored as two separate code points.
a = 'a\u0300 propos'
print(list(a), a, sep='\n')

['a', '̀', ' ', 'p', 'r', 'o', 'p', 'o', 's']
à propos


In [4]:
def to_str(bytes_or_str):
    """Return the input as ``str``, decoding ``bytes`` as UTF-8.

    Any value that is not a ``bytes`` instance is returned unchanged.
    """
    if not isinstance(bytes_or_str, bytes):
        return bytes_or_str
    return bytes_or_str.decode('utf-8')

# bytes and str inputs alike come back as str (note the missing b prefix).
print(
    repr(to_str(b'foo')),
    repr(to_str('bar')),
    repr(to_str(b'\xed\x95\x9c\xea\xb8\x80')),
    sep='\n',
)

'foo'
'bar'
'한글'


In [5]:
def to_bytes(bytes_or_str):
    """Return the input as ``bytes``, encoding ``str`` as UTF-8.

    A value that is not a ``str`` instance is handed back untouched.
    """
    if not isinstance(bytes_or_str, str):
        return bytes_or_str
    return bytes_or_str.encode('utf-8')

# bytes and str inputs alike come back as bytes (note the b prefix).
print(
    repr(to_bytes(b'foo')),
    repr(to_bytes('bar')),
    repr(to_bytes('한글')),
    sep='\n',
)

b'foo'
b'bar'
b'\xed\x95\x9c\xea\xb8\x80'


In [6]:
# Operands of the same type concatenate with +: bytes with bytes, str with str.
print(b'one' + b'two', 'one' + 'two', sep='\n')

b'onetwo'
onetwo


In [7]:
# Adding a str to bytes is not allowed. Catch only the TypeError this
# demonstration expects, so any other failure surfaces instead of being
# logged as 'Expected'.
try:
    b'one' + 'two'
except TypeError:
    logging.exception('Expected')
else:
    assert False  # The mixed concatenation unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-7-ab17154e36ca>", line 2, in <module>
    b'one' + 'two'
TypeError: can't concat str to bytes


In [8]:
# Adding bytes to a str is not allowed either. Catch only the TypeError this
# demonstration expects, so any other failure surfaces instead of being
# logged as 'Expected'.
try:
    'one' + b'two'
except TypeError:
    logging.exception('Expected')
else:
    assert False  # The mixed concatenation unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-8-c75c54f97817>", line 2, in <module>
    'one' + b'two'
TypeError: can only concatenate str (not "bytes") to str


In [9]:
# Ordering comparisons work when both sides share a type.
assert b'red' > b'blue' and 'red' > 'blue'

In [10]:
# Ordering a str against bytes is not supported. Catch only the TypeError
# this demonstration expects, so any other failure surfaces instead of being
# logged as 'Expected'.
try:
    assert 'red' > b'blue'
except TypeError:
    logging.exception('Expected')
else:
    assert False  # The mixed comparison unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-10-bd080907f0d1>", line 2, in <module>
    assert 'red' > b'blue'
TypeError: '>' not supported between instances of 'str' and 'bytes'


In [11]:
# Ordering bytes against a str is not supported. Catch only the TypeError
# this demonstration expects, so any other failure surfaces instead of being
# logged as 'Expected'.
try:
    assert b'blue' < 'red'
except TypeError:
    logging.exception('Expected')
else:
    assert False  # The mixed comparison unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-11-1944c3059cc6>", line 2, in <module>
    assert b'blue' < 'red'
TypeError: '<' not supported between instances of 'bytes' and 'str'


In [12]:
# Equality between bytes and str does not raise; it just evaluates to False,
# even when the characters look identical.
mixed_equal = b'foo' == 'foo'
print(mixed_equal)

False


In [13]:
# The % operator works when the template and the argument share a type.
print(b'red %s' % b'blue', 'red %s' % 'blue', sep='\n')

b'red blue'
red blue


In [14]:
# A bytes template cannot take a str argument. Catch only the TypeError this
# demonstration expects, so any other failure surfaces instead of being
# logged as 'Expected'.
try:
    print(b'red %s' % 'blue')
except TypeError:
    logging.exception('Expected')
else:
    assert False  # The mixed formatting unexpectedly succeeded


ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-14-6635a8d5c366>", line 2, in <module>
    print(b'red %s' % 'blue')
TypeError: %b requires a bytes-like object, or an object that implements __bytes__, not 'str'


In [15]:
# The reverse direction does not raise: the bytes value is inserted as its
# repr (b'blue'), which is rarely what was intended.
print('red %s' % (b'blue',))

red b'blue'


In [16]:
# A file opened in text mode ('w') accepts only str, so writing bytes raises
# TypeError. Catch only that error so unrelated failures (e.g. an unwritable
# directory) are not logged as 'Expected'.
try:
    with open('data.bin', 'w') as f:
        f.write(b'\xf1\xf2\xf3\xf4\xf5')
except TypeError:
    logging.exception('Expected')
else:
    assert False  # Writing bytes in text mode unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-16-78254cecedd5>", line 3, in <module>
    f.write(b'\xf1\xf2\xf3\xf4\xf5')
TypeError: write() argument must be str, not bytes


In [17]:
# Binary mode ('wb') takes bytes directly.
with open('data.bin', 'wb') as binary_file:
    binary_file.write(b'\xf1\xf2\xf3\xf4\xf5')

In [18]:
try:
    # Request UTF-8 explicitly so this read fails on all platforms;
    # cp1252 considers these bytes valid on Windows. Passing encoding=
    # directly avoids shadowing the built-in open(), which previously was
    # restored only when the read failed.
    with open('data.bin', 'r', encoding='utf-8') as f:
        data = f.read()
except UnicodeDecodeError:
    # Only the decoding failure is expected; a missing file or any other
    # error propagates instead of being logged as 'Expected'.
    logging.exception('Expected')
else:
    assert False  # Decoding these bytes as UTF-8 unexpectedly succeeded

ERROR:root:Expected
Traceback (most recent call last):
  File "<ipython-input-18-4db810e6b6dd>", line 10, in <module>
    data = f.read()
  File "/usr/lib/python3.7/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 0: invalid continuation byte


In [19]:
# Binary mode ('rb') returns the file contents as bytes, with no decoding.
with open('data.bin', 'rb') as binary_file:
    data = binary_file.read()

assert data == b'\xf1\xf2\xf3\xf4\xf5'

In [20]:
# Naming the encoding explicitly removes any dependence on the platform
# default; read as cp1252, the file contents decode to five accented letters.
with open('data.bin', 'r', encoding='cp1252') as text_file:
    data = text_file.read()

assert data == 'ñòóôõ'

> - `bytes`에는 8비트 값의 시퀀스가 들어 있고, `str`에는 유니코드 코드 포인트의 시퀀스가 들어 있다.
> - 처리할 입력이 원하는 문자 시퀀스(8비트 값, UTF-8로 인코딩된 문자열, 유니코드 코드 포인트들)인지 확실히 하려면 헬퍼 함수를 사용하라.
> - `bytes`와 `str` 인스턴스를 (`>`, `==`, `+`, `%`와 같은) 연산자에 섞어서 사용할 수 없다.
> - 이진 데이터를 파일에서 읽거나 파일에 쓰고 싶으면 항상 이진 모드(`'rb'`나 `'wb'`)로 파일을 열어라.
> - 유니코드 데이터를 파일에서 읽거나 파일에 쓰고 싶을 때는 시스템 디폴트 인코딩에 주의하라. 인코딩 차이로 놀라고 싶지 않으면 `open`에 `encoding` 파라미터를 명시적으로 전달하라.