Handling Text Files Examples

In [6]:
# Write the text as UTF-8; the context manager closes the file even on error.
with open('cafe.txt', 'w', encoding='utf_8') as fp:
    fp.write('café')

# Read it back WITHOUT an explicit encoding, i.e. with the platform default.
# On a Windows machine the default may not be UTF-8, so the text can come back
# garbled (or raise a decoding error for some byte sequences).
with open('cafe.txt') as fp:
    text = fp.read()
text

'café'

In [8]:
# The context manager guarantees the file is closed even if write() raises.
with open('cafe.txt', 'w', encoding='utf_8') as fp:
    print(fp)  # shows the TextIOWrapper with its name, mode and encoding
    fp.write('café')

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>


In [18]:
import os

# Size on disk in bytes (the file holds the UTF-8 encoding of 'café').
print(os.stat('cafe.txt').st_size)

# Note: a bare open('cafe.txt') would default to cp1252 on Windows.
# Decoding the UTF-8 bytes as cp1252 on purpose shows the garbled result.
with open('cafe.txt', encoding='cp1252') as fp2:
    print(fp2.read())
    print(fp2.encoding)

# Binary mode: read() returns the raw bytes, no decoding is applied.
with open('cafe.txt', 'rb') as fp3:
    print(fp3)
    print(fp3.read())

5
cafÃ©
cp1252
<_io.BufferedReader name='cafe.txt'>
b'caf\xc3\xa9'


Normalizing Unicode Examples

In [19]:
# Two strings that render the same but use different code point sequences:
# s1 ends with a single accented letter, s2 with 'e' plus a combining accent.
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'
print(f'{s1} {s2}')
print(f'{len(s1)} {len(s2)}')
# Without normalization the two strings do not compare equal.
print(s1 == s2)

café café
4 5
False


Extreme Normalization Examples 

In [20]:
import unicodedata
import string

def shave_marks(txt):
    """Return ``txt`` with all diacritic (combining) marks removed.

    The text is decomposed with NFD so each mark becomes its own character,
    the combining characters are dropped, and the rest is recomposed with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

order = '“Herr Voß: • 1⁄2 cup of ŒtkerTM caffè latte • bowl of açaí.”'
greek = 'Ζέφυρος, Zéfiro'

# Only the combining marks are removed: the accented e, c and i in `order`
# and both accented e's in `greek` (Greek and Latin) come back unaccented.
print(shave_marks(order), shave_marks(greek), sep='\n')

“Herr Voß: • 1⁄2 cup of ŒtkerTM caffe latte • bowl of acai.”
Ζεφυρος, Zefiro


In [None]:
def shave_marks_latin(txt):
    """Strip diacritic marks, but only those attached to Latin base letters.

    Works on the NFD decomposition of ``txt``: a combining character is
    dropped when the most recent base character is an ASCII letter and kept
    otherwise. The surviving characters are recomposed with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin = False  # True while the current base character is an ASCII letter
    for ch in decomposed:
        if unicodedata.combining(ch):
            # A mark: keep it only when it does not belong to a Latin base.
            if not after_latin:
                kept.append(ch)
        else:
            # A new base character: always kept, and it resets the state.
            kept.append(ch)
            after_latin = ch in string.ascii_letters

    return unicodedata.normalize('NFC', ''.join(kept))

In [35]:
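# Example disabled: the Unicode text copied from the book was mangled.
# str.maketrans() only accepts single-character keys in its dict form, but the
# ellipsis and trademark keys below arrived as the multi-character strings
# '...' and 'TM'. The two-argument form also requires both strings to have the
# same length, which the mangled text may no longer satisfy.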
# single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–— ̃›""", """'ƒ"^<''""---~>""")
# multi_map = str.maketrans({
#     '€': 'EUR',
#     '...': '...',
#     'Æ': 'AE',
#     'æ': 'ae',
#     'Œ': 'OE',
#     'œ': 'oe',
#     'TM': '(TM)',
#     '‰': '<per mille>',
#     '†': '**',
#     '‡': '***',
# })

# multi_map.update(single_map)


Sorting Unicode Text Examples

In [38]:
# Plain sorted() compares code points, so the accented words do not land
# where a dictionary would put them.
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
by_code_point = sorted(fruits)
print(by_code_point)

['acerola', 'atemoia', 'açaí', 'caju', 'cajá']


In [40]:
import pyuca
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']

# Sort with the collator's sort keys instead of raw code points.
coll = pyuca.Collator()
sorted_fruits = list(fruits)
sorted_fruits.sort(key=coll.sort_key)
print(sorted_fruits)

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


Finding Characters by Name Examples

In [42]:
from unicodedata import name
# Look up the official Unicode name of a single character.
letter_name = name('A')
print(letter_name)

LATIN CAPITAL LETTER A


In [45]:
import sys
import unicodedata  # imported here so find() does not depend on an earlier cell

# Default search range: from the space character up to the last code point.
START, END = ord(' '), sys.maxunicode + 1

def find(*query_words, start=START, end=END):
    """Print every character whose Unicode name contains all the query words.

    Args:
        *query_words: Words to look for; matching is case-insensitive and is
            done against the whitespace-separated words of the character name.
        start: First code point to examine (inclusive).
        end: Code point at which to stop (exclusive).

    Each match is printed as ``U+XXXX<TAB>char<TAB>NAME``. Nothing is returned.
    """
    query = {word.upper() for word in query_words}  # Unicode names are uppercase
    for code in range(start, end):
        char = chr(code)
        # Unnamed code points yield None instead of raising ValueError.
        char_name = unicodedata.name(char, None)
        # Match when every query word appears among the words of the name.
        if char_name and query.issubset(char_name.split()):
            print(f'U+{code:04X}\t{char}\t{char_name}')

def main(words):
    """Search for ``words`` with ``find``, or print a hint when none are given."""
    if not words:
        print("Please provide words to find.")
        return
    find(*words)

if __name__ == '__main__':
    # Everything after the program name is treated as a query word.
    # NOTE(review): inside a notebook kernel sys.argv presumably holds the
    # kernel's own launch arguments rather than user-supplied words - confirm.
    cli_words = sys.argv[1:]
    main(cli_words)

Numeric Meaning of Characters Examples

In [48]:
import unicodedata
import re

re_digit = re.compile(r'\d')
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# One tab-separated row per character: code point, the character itself,
# three "is it a digit/number?" checks, its numeric value and its name.
for char in sample:
    columns = [
        f'U+{ord(char):04x}',
        char.center(6),
        're_dig' if re_digit.match(char) else '-',  # matched by the r'\d' regex
        'is_dig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        f'{unicodedata.numeric(char):5.2f}',
        unicodedata.name(char),
    ]
    print('\t'.join(columns))

U+0031	  1   	re_dig	is_dig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	is_dig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	is_dig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	is_dig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	is_dig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX
