- date: 2021-08-20 10:17:17
- author: Jerry Su
- slug: Python-Unicode
- title: Python Unicode
- category: 
- tags: Python

In [71]:
# http://www.unicode.org/reports/tr44/#GC_Values_Table
import unicodedata

In [26]:
# Version of the Unicode database that this build of the unicodedata module uses.
unicodedata.unidata_version

'12.1.0'

In [72]:
# Print the Unicode general category of a few sample characters.
# Category reference: https://www.fileformat.info/info/unicode/category/index.htm

for symbol in ('.', '-', ',', ' '):
    print(unicodedata.category(symbol))

# unicodedata.category() accepts exactly one character, so the sample string
# (a base letter followed by combining marks) is classified one code point at a time.
for general_category in map(unicodedata.category, 'a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎'):
    print(general_category)

Po
Pd
Po
Zs
Ll
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn
Mn


In [54]:
# ASCII table reference: http://c.biancheng.net/c/ascii/
# ord() returns the code point of a character as a (decimal) integer.
for whitespace_char in (' ', '\n', '\r', '\t'):
    print(ord(whitespace_char))

32
10
13
9


In [59]:
# Whitespace detection
def is_space(ch):
    r"""Return True if ``ch`` is a whitespace character.

    Whitespace here means one of ' ', '\t', '\r', '\n', or any character whose
    Unicode general category is "Zs" (Separator, Space).

    Args:
        ch: A single-character string.

    Returns:
        bool: True for whitespace, False otherwise.
    """
    # The common ASCII whitespace characters are matched directly; tab, CR and
    # LF are not in category "Zs", so they need the explicit check.
    if ch in (' ', '\t', '\r', '\n'):
        return True
    return unicodedata.category(ch) == 'Zs'  # [Zs] Separator, Space


for sample in (' ', '\n', '\r', '\t', 'A'):
    print(is_space(sample))

True
True
True
True
False


In [61]:
# Punctuation detection
def is_punctuation(ch):
    r"""Return True if ``ch`` is a punctuation character (half- or full-width).

    Every printable ASCII character that is not a letter, digit or space is
    treated as punctuation, selected by code point range:

        33-47     ! " # $ % & ' ( ) * + , - . /
        58-64     : ; < = > ? @
        91-96     [ \ ] ^ _ `
        123-126   { | } ~

    Outside those ranges, a character qualifies when its Unicode general
    category starts with "P".

    Args:
        ch: A single-character string.

    Returns:
        bool: True for punctuation, False otherwise.
    """
    codepoint = ord(ch)
    ascii_punctuation_ranges = ((33, 47), (58, 64), (91, 96), (123, 126))
    if any(low <= codepoint <= high for low, high in ascii_punctuation_ranges):
        return True
    return unicodedata.category(ch).startswith('P')


is_punctuation('?')

True

In [95]:
def is_control(ch):
    """Return True if ``ch`` is a control-type character.

    A character qualifies when its Unicode general category is "Cc" or "Cf".

    Args:
        ch: A single-character string.

    Returns:
        bool: True when the category is "Cc" or "Cf", False otherwise.
    """
    general_category = unicodedata.category(ch)
    return general_category == 'Cc' or general_category == 'Cf'

In [101]:
def _is_cjk_character(ch):
    """CJK类字符判断（包括中文字符也在此列）
    参考：https://en.wikipedia.org/wiki/Unicode_block
    """
    code = ord(ch)
    return 0x4E00 <= code <= 0x9FFF or \      # CJK Unified Ideographs, HAN
           0x3400 <= code <= 0x4DBF or \      # CJK Unified Ideographs Extension A, HAN
           0x20000 <= code <= 0x2A6DF or \    # General Punctuation
           0x2A700 <= code <= 0x2B73F or \    # Supplemental Mathematical Operators
           0x2B740 <= code <= 0x2B81F or \    # Miscellaneous Symbols and Arrows
           0x2B820 <= code <= 0x2CEAF or \    # Miscellaneous Symbols and Arrows
           0xF900 <= code <= 0xFAFF or \      # CJK Compatibility Ideographs, HAN
           0x2F800 <= code <= 0x2FA1F         # CJK Compatibility Ideographs Supplement, HAN

SyntaxError: unexpected character after line continuation character (<ipython-input-101-c87af0d1a94e>, line 6)

In [93]:
"""Tokenizer
"""

# Lower-case the sample text up front; the "Origin" line below therefore
# shows the already lower-cased string.
text = "a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎是特殊Mn字符～ jerry 中国USTC".lower()
print(f'Origin: {text}')

# The text may contain Mn (nonspacing mark) and other special characters, so
# normalize it first using NFD, the canonical decomposition form.
text = unicodedata.normalize('NFD', text)
print(f'Normalize: {text}')

# Drop every character whose general category is Mn.
text = ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', text))
print(f'Text: {text}')

Origin: a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎是特殊mn字符～ jerry 中国ustc
Normalize: a๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎๎是特殊mn字符～ jerry 中国ustc
Text: a是特殊mn字符～ jerry 中国ustc


In [97]:
# Check that the code point of '万' does not exceed 0x9FFF, the upper bound
# used for the CJK Unified Ideographs range.
ord('万') <= 0x9fff

True