In [13]:
import os
import re
import sys
import array
import locale
import unicodedata
from unicodedata import name, normalize


In [14]:
# Round-trip a mixed ASCII/CJK string through UTF-8: str -> bytes -> str.
s = 'cafe咖啡'
b = bytes(s, encoding="utf8")
# One print call, one value per line: the raw bytes, then the decoded text.
print(b, str(b, encoding="utf8"), sep='\n')


b'cafe\xe5\x92\x96\xe5\x95\xa1'
cafe咖啡


In [15]:
# bytes.fromhex parses pairs of hex digits; the spaces between pairs are skipped.
print(bytes.fromhex('31 4b CE A9'))

# Typecode 'h' stores signed 16-bit integers. Dumping the array as bytes
# exposes its raw machine representation (two bytes per item, native byte order).
numbers = array.array('h', range(-2, 3))
print(numbers.tobytes())

b'1K\xce\xa9'
b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


In [16]:
# Encode the same text with two codecs to compare the resulting byte sequences.
for codec in ['utf-8', 'utf-16']:
    print(codec, 'nihao 你好'.encode(codec), sep='\t')

print()
city = '佛山:foshan'
print(city)
print(city.encode('utf_8'))
print(city.encode('utf_16'))

# cp437 has no mapping for the two CJK characters, so strict encoding raises.
# Catch only UnicodeEncodeError: a bare `except:` would also hide unrelated
# failures (and even KeyboardInterrupt) behind the same message.
try:
    print(city.encode('cp437'))
except UnicodeEncodeError:
    print(f"UnicodeError: \"{city}\" encode cp437")

# The same encode call with each non-strict error handler.
print(city.encode('cp437', errors='ignore'))             # skip unencodable characters
print(city.encode('cp437', errors='replace'))            # substitute unencodable characters with '?'
print(city.encode('cp437', errors='xmlcharrefreplace'))  # substitute XML numeric character references
print("isascii:", city.isascii())


utf-8	b'nihao \xe4\xbd\xa0\xe5\xa5\xbd'
utf-16	b'\xff\xfen\x00i\x00h\x00a\x00o\x00 \x00`O}Y'

佛山:foshan
b'\xe4\xbd\x9b\xe5\xb1\xb1:foshan'
b'\xff\xfe[Oq\\:\x00f\x00o\x00s\x00h\x00a\x00n\x00'
UnicodeError: "佛山:foshan" encode cp437
b':foshan'
b'??:foshan'
b'&#20315;&#23665;:foshan'
isascii: False


In [17]:
# One byte string, three interpretations: byte 0xE9 is 'é' in both cp1252 and
# iso8859_1, but it is not a valid UTF-8 sequence on its own, so the 'replace'
# handler substitutes U+FFFD for it.
octets = b'Motr\xe9al'
print(octets)
print(str(octets, encoding='cp1252'))
print(str(octets, encoding='iso8859_1'))
print(str(octets, encoding='utf8', errors='replace'))

b'Motr\xe9al'
Motréal
Motréal
Motr�al


In [18]:
# Report the interpreter's encoding-related defaults for this session.
print(locale.getpreferredencoding(), sys.version, sep='\n')
print(f"sys.stdout.isatty(): {sys.stdout.isatty()}")
print(f"sys.stdout.encoding: {sys.stdout.encoding}")

# Characters outside ASCII, used to check that stdout can render them.
test_char = ['\N{HORIZONTAL ELLIPSIS}', '\N{INFINITY}', '\N{CIRCLED NUMBER FORTY TWO}']
for char in test_char:
    # Official Unicode name on one line, the glyph itself on the next.
    print(name(char), char, sep=':\n')

cp65001
3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]
sys.stdout.isatty(): False
sys.stdout.encoding: UTF-8
HORIZONTAL ELLIPSIS:
…
INFINITY:
∞
CIRCLED NUMBER FORTY TWO:
㊷


In [27]:
print('============= NFC NFD =============')
# U+2126 OHM SIGN is canonically equivalent to GREEK CAPITAL LETTER OMEGA,
# so both canonical normalization forms replace it with the Greek letter.
ohm = '\u2126'
ohm_c = normalize('NFC', ohm)
ohm_d = normalize('NFD', ohm)
print(f'{ohm} {name(ohm)}')
print(f'{ohm_c} {name(ohm_c)}')
print(f'{ohm_d} {name(ohm_d)}')
print(f'len(ohm): {len(ohm)} \tlen(ohm_c): {len(ohm_c)}')
# The glyphs look the same, but the code points differ until normalized.
print(f'ohm_c == ohm: {ohm == ohm_c}')
print(f'ohm_c == ohm_d: {ohm_c == ohm_d}')
print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

print('\n============ NFKC NFKD =============')
# Compatibility forms expand the single-character fraction into three characters.
half = '\N{VULGAR FRACTION ONE HALF}'
print(half, normalize('NFKC', half), normalize('NFKD', half), sep='\n')

print('\n=========== Case Folding ===========')
# casefold() maps the micro sign to Greek small mu and sharp s to 'ss'.
mu = '\N{MICRO SIGN}'
es = '\N{LATIN SMALL LETTER SHARP S}'
print(f'{mu} {mu.casefold()}')

print(f'{es} {es.casefold()}')

Ω OHM SIGN
Ω GREEK CAPITAL LETTER OMEGA
Ω GREEK CAPITAL LETTER OMEGA
len(ohm): 1 	len(ohm_c): 1
ohm_c == ohm: False
ohm_c == ohm_d: True
True

½
1⁄2
1⁄2

µ μ
ß ss


In [20]:
# \N{...} escapes select characters by their official Unicode names.
# A single print call emits one glyph per line.
print(
    '\N{BLACK CHESS QUEEN}',
    '\N{GRINNING CAT FACE WITH SMILING EYES}',
    '\N{SMILING CAT FACE WITH OPEN MOUTH}',
    '\N{SMILING CAT FACE WITH HEART-SHAPED EYES}',
    sep='\n',
)

♛
😸
😺
😻


In [21]:
# Compare three notions of "digit": the regex \d class, str.isdigit() and
# str.isnumeric(), across characters drawn from several scripts.
re_digit = re.compile(r'\d')
# NOTE(review): '\xcb' (LATIN CAPITAL LETTER E WITH DIAERESIS) is the only
# non-numeric character in this sample; it may be a transposition of '\xbc'
# (VULGAR FRACTION ONE QUARTER) -- confirm before changing, since the saved
# output reflects the current value.
sample = '1\xcb\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    # One tab-separated row per character:
    # code point, glyph, three classification flags, Unicode name.
    print('\t'.join((
        f'U+{ord(char):04X}',
        char.center(6),
        're_digit' if re_digit.match(char) is not None else '-',
        'isdig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        name(char),
    )))

U+0031	  1   	re_digit	isdig	isnum	DIGIT ONE
U+00CB	  Ë   	-	-	-	LATIN CAPITAL LETTER E WITH DIAERESIS
U+00B2	  ²   	-	isdig	isnum	SUPERSCRIPT TWO
U+0969	  ३   	re_digit	isdig	isnum	DEVANAGARI DIGIT THREE
U+136B	  ፫   	-	isdig	isnum	ETHIOPIC DIGIT THREE
U+216B	  Ⅻ   	-	-	isnum	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	CIRCLED IDEOGRAPH SIX


In [25]:
pattern = re.compile(r'\d')

# search() stops at the first digit and returns a Match object.
result = re.search(pattern, "Hello, 123 World!")
print(f'{result} \n')

# findall() collects every non-overlapping match as a list of strings.
result = re.findall(pattern, "Hello, 123 World!")
print(result)

<re.Match object; span=(7, 8), match='1'> 

['1', '2', '3']


In [23]:
# The same two patterns compiled as str and as bytes. The outputs below show
# the str versions matching non-ASCII digits and superscripts, while the bytes
# versions match ASCII digits and word characters only.
re_numbers_str, re_words_str = re.compile(r'\d+'), re.compile(r'\w+')
re_numbers_bytes, re_words_bytes = re.compile(rb'\d+'), re.compile(rb'\w+')

# The escaped code points are the Tamil digits for 1729.
text_str = (
    "Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
    " as 1729 = 1³ + 12³ = 9³ + 10³."
)
text_bytes = bytes(text_str, 'utf_8')

print('Text', f'  {text_str!r}', f'  {text_bytes!r}', sep='\n')

print('\nNumbers')
# str pattern: Tamil and ASCII digits; bytes pattern: ASCII digits only.
print(f'  str  : {re_numbers_str.findall(text_str)}')
print(f'  bytes: {re_numbers_bytes.findall(text_bytes)}')

print('\nWords')
# str pattern: letters, superscripts, Tamil and ASCII digits;
# bytes pattern: only the ASCII bytes for letters and digits.
print(f'  str  : {re_words_str.findall(text_str)}')
print(f'  bytes: {re_words_bytes.findall(text_bytes)}')

Text
  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
  b'Ramanujan saw \xe0\xaf\xa7\xe0\xaf\xad\xe0\xaf\xa8\xe0\xaf\xaf as 1729 = 1\xc2\xb3 + 12\xc2\xb3 = 9\xc2\xb3 + 10\xc2\xb3.'

Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']

Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


In [24]:
# os.listdir mirrors the type of its argument: a str path yields str names,
# a bytes path yields bytes names.

print(os.listdir('.'), end='\n\n')
print(os.listdir(b'.'))

['TextBytes.ipynb']

[b'TextBytes.ipynb']
