In [4]:
# The one-character string '0' is the digit zero, not the NUL character chr(0).
'0'

'0'

In [1]:
# chr(0) is the character with code point 0 (NUL); its repr is the escape '\x00'.
chr(0)

'\x00'

In [2]:
# ord() inverts chr(), so the round trip gives back the code point 0.
ord(chr(0))

0

In [3]:
# The digit character '0' has code point 48, which is why it differs from chr(0).
ord('0')

48

In [5]:
# Calling __repr__() explicitly returns a str that already contains the quotes;
# the notebook then echoes that str with its own repr, hence the doubled quoting.
'0'.__repr__()

"'0'"

In [6]:
# print() writes the raw NUL character instead of its escaped repr, so the
# rendered output looks empty.
print(chr(0))

 


In [7]:
# Echoing the expression shows the repr, in which the embedded NUL appears as \x00.
"this is a test" + chr(0) + "string"

'this is a test\x00string'

In [8]:
# print() emits the NUL character itself; in the rendered output it shows up
# only as a gap between "test" and "string".
print("this is a test" + chr(0) + "string")

this is a test string


In [9]:
# Sample text mixing ASCII with Japanese characters, used to inspect UTF-8 encoding.
test_string = "hello! こんにちは!"

In [10]:
# Encode the str into its UTF-8 representation (a bytes object).
utf8_encoded = test_string.encode("utf-8")

In [11]:
# Printable ASCII bytes are shown as themselves; every other byte is shown as a \xNN escape.
print(utf8_encoded)

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'


In [12]:
# Confirm that encode() produced a bytes object rather than a str.
print(type(utf8_encoded))

<class 'bytes'>


In [15]:
# Iterating over a bytes object yields one integer (0-255) per byte.
print(list(utf8_encoded))

[104, 101, 108, 108, 111, 33, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 33]


In [16]:
# len() of a str counts code points: 7 for "hello! ", 5 Japanese characters, 1 for "!".
len(test_string)

13

In [17]:
# len() of bytes counts bytes: 8 single-byte ASCII characters plus 5 Japanese
# characters at 3 bytes each gives 23.
len(utf8_encoded)

23

In [18]:
# Decoding the whole byte string in one call round-trips back to the original text.
utf8_encoded.decode('utf-8')

'hello! こんにちは!'

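The bug in the original function (`decode_utf8_bytes_to_str_wrong`, defined in the next cell) is that it decodes one byte at a time, assuming each byte represents a complete character, but UTF-8 is a variable-length encoding designed so that: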
1. ASCII characters (0-127) use 1 byte each ✓
2. Non-ASCII characters use 2-4 bytes that must be decoded together ✗
3. Lead bytes (110xxxxx, 1110xxxx, 11110xxx) and continuation bytes (10xxxxxx) are both invalid when decoded alone ✗

The function works fine for pure ASCII text but raises `UnicodeDecodeError` for any non-ASCII text (accented letters, CJK characters, emojis, and so on) because it tries to decode each multi-byte sequence one byte at a time instead of as a complete unit.

In [19]:
def decode_utf8_bytes_to_str_wrong(bytestring: bytes):
    """Deliberately flawed UTF-8 decoder that treats every byte as a whole character.

    Each byte of ``bytestring`` is wrapped in its own one-byte ``bytes`` object
    and decoded independently; the decoded pieces are then concatenated.

    Returns:
        The decoded text when every byte decodes on its own (e.g. ASCII input).

    Raises:
        UnicodeDecodeError: as soon as a byte that cannot be decoded in
            isolation is reached (any byte of a multi-byte sequence).
    """
    decoded_pieces = []
    for byte_value in bytestring:
        # Intentionally decode a single byte at a time: this is the bug the
        # notebook demonstrates, so it must not be "fixed" here.
        decoded_pieces.append(bytes((byte_value,)).decode("utf-8"))
    return "".join(decoded_pieces)

In [22]:
def decode_utf8_bytes_to_str_correct(bytestring: bytes):
    """Decode a complete UTF-8 byte string in a single call.

    Handing the whole buffer to the codec lets multi-byte sequences be
    decoded as units instead of byte by byte.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: if ``bytestring`` is not valid UTF-8.
    """
    # "strict" is the codec's default error policy; spelled out for clarity.
    decoded_text = bytestring.decode(encoding="utf-8", errors="strict")
    return decoded_text

In [20]:
decode_utf8_bytes_to_str_wrong("hello".encode("utf-8"))

'hello'

In [21]:
# Run the byte-at-a-time decoder against inputs of increasing UTF-8 width
test_cases = [
    "Hello",           # pure ASCII: one byte per character
    "café",            # accented Latin: mix of one- and two-byte characters
    "こんにちは",        # Japanese: three bytes per character
    "🌍",              # emoji: four bytes per character
    "Hello 世界 🌍",    # all of the above mixed together
]

for text in test_cases:
    utf8_bytes = text.encode("utf-8")
    print(f"\nOriginal text: {text}")
    print(f"UTF-8 bytes: {list(utf8_bytes)}")
    print(f"Correct decode: {utf8_bytes.decode('utf-8')}")

    # The flawed decoder is expected to fail on non-ASCII input; the error is
    # caught and printed so the cell runs to completion and shows the message.
    try:
        wrong_result = decode_utf8_bytes_to_str_wrong(utf8_bytes)
    except UnicodeDecodeError as e:
        print(f"Wrong function: ERROR - {e}")
    else:
        print(f"Wrong function: {wrong_result}")
        print(f"Match? {wrong_result == text}")


Original text: Hello
UTF-8 bytes: [72, 101, 108, 108, 111]
Correct decode: Hello
Wrong function: Hello
Match? True

Original text: café
UTF-8 bytes: [99, 97, 102, 195, 169]
Correct decode: café
Wrong function: ERROR - 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data

Original text: こんにちは
UTF-8 bytes: [227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175]
Correct decode: こんにちは
Wrong function: ERROR - 'utf-8' codec can't decode byte 0xe3 in position 0: unexpected end of data

Original text: 🌍
UTF-8 bytes: [240, 159, 140, 141]
Correct decode: 🌍
Wrong function: ERROR - 'utf-8' codec can't decode byte 0xf0 in position 0: unexpected end of data

Original text: Hello 世界 🌍
UTF-8 bytes: [72, 101, 108, 108, 111, 32, 228, 184, 150, 231, 149, 140, 32, 240, 159, 140, 141]
Correct decode: Hello 世界 🌍
Wrong function: ERROR - 'utf-8' codec can't decode byte 0xe4 in position 0: unexpected end of data


In [23]:
# Two-byte inputs that a strict UTF-8 decoder rejects, each paired with the reason.
# Every entry is (list of byte values, human-readable description).
invalid_sequences = [
    ([0xC0, 0x80], "Overlong encoding of NULL (should be just 0x00)"),
    ([0xC1, 0x80], "Overlong encoding of 0x40 (should be just 0x40)"),
    ([0xFF, 0xFE], "Invalid UTF-8 bytes (0xFF never appears in UTF-8)"),
    ([0x80, 0x80], "Two continuation bytes with no start byte"),
    ([0xC2, 0x40], "Start byte + non-continuation byte"),
    ([0xC2, 0xFF], "Start byte + invalid continuation byte"),
    # NOTE: ED A0 is only the first two bytes of the three-byte form ED A0 80
    # (U+D800); the pair is rejected because 0xA0 may not follow 0xED.
    ([0xED, 0xA0], "High surrogate (reserved for UTF-16, invalid in UTF-8)"),
]

print("Invalid 2-byte UTF-8 sequences:")
for sequence, description in invalid_sequences:
    # Show each sequence in decimal, hexadecimal and binary before decoding it.
    print(f"\nSequence: {sequence} ({[hex(b) for b in sequence]})")
    print(f"Binary: {[f'{b:08b}' for b in sequence]}")
    print(f"Description: {description}")

    # Decoding is expected to fail; report the codec's message instead of raising.
    try:
        decoded = bytes(sequence).decode('utf-8')
    except UnicodeDecodeError as e:
        print(f"ERROR: {e}")
    else:
        print(f"Result: {decoded}")

Invalid 2-byte UTF-8 sequences:

Sequence: [192, 128] (['0xc0', '0x80'])
Binary: ['11000000', '10000000']
Description: Overlong encoding of NULL (should be just 0x00)
ERROR: 'utf-8' codec can't decode byte 0xc0 in position 0: invalid start byte

Sequence: [193, 128] (['0xc1', '0x80'])
Binary: ['11000001', '10000000']
Description: Overlong encoding of 0x40 (should be just 0x40)
ERROR: 'utf-8' codec can't decode byte 0xc1 in position 0: invalid start byte

Sequence: [255, 254] (['0xff', '0xfe'])
Binary: ['11111111', '11111110']
Description: Invalid UTF-8 bytes (0xFF never appears in UTF-8)
ERROR: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

Sequence: [128, 128] (['0x80', '0x80'])
Binary: ['10000000', '10000000']
Description: Two continuation bytes with no start byte
ERROR: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

Sequence: [194, 64] (['0xc2', '0x40'])
Binary: ['11000010', '01000000']
Description: Start byte + non-continuation byte