forked from tongchangD/bert_for_corrector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_utils.py
190 lines (153 loc) · 5.13 KB
/
text_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-
# Brief: 汉字处理的工具:判断unicode是否是汉字,数字,英文,或者其他字符。以及全角符号转半角符号。
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import pypinyin
import six
from pypinyin import pinyin
from langconv import Converter
def convert_to_unicode(text):
    """Return `text` as a Unicode string, decoding byte strings as UTF-8.

    Undecodable bytes are dropped (the "ignore" error handler).

    :param text: a text string or a byte string
    :return: the Unicode text
    :raises ValueError: if `text` is neither a text nor a byte string, or if
        the interpreter is reported as neither Python 2 nor Python 3
    """
    if not (six.PY3 or six.PY2):
        raise ValueError("Not running on Python2 or Python 3?")

    # Pick the (already-unicode, needs-decoding) type pair for this interpreter.
    if six.PY3:
        unicode_type, encoded_type = str, bytes
    else:
        unicode_type, encoded_type = unicode, str  # noqa: F821 (Python 2 only)

    if isinstance(text, unicode_type):
        return text
    if isinstance(text, encoded_type):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
def is_chinese(uchar):
    """Return True if `uchar` lies in the range U+4E00..U+9FA5 (inclusive).

    :param uchar: the character to test
    :return: True when the character falls inside that range, else False
    """
    lower_bound, upper_bound = '\u4e00', '\u9fa5'
    return lower_bound <= uchar <= upper_bound
def is_chinese_string(string):
    """Return True if every character of `string` passes `is_chinese`.

    An empty string yields True because there is no failing character.

    :param string: the text to test
    :return: False as soon as one character is not Chinese, else True
    """
    return all(is_chinese(char) for char in string)
def is_number(uchar):
    """Return True if `uchar` is an ASCII digit (U+0030..U+0039).

    :param uchar: the character to test
    :return: True for '0'..'9', otherwise False
    """
    # The bounds must be real escape sequences: without the backslash the
    # comparison is made against the five-character strings "u0030"/"u0039",
    # which no digit can ever satisfy.
    return u'\u0030' <= uchar <= u'\u0039'
def is_alphabet(uchar):
"""判断一个unicode是否是英文字母"""
if (u'u0041' <= uchar <= u'u005a') or (u'u0061' <= uchar <= u'u007a'):
return True
else:
return False
def is_alphabet_string(string):
    """Return True if every character of `string` is a lowercase letter a-z.

    Only the lowercase range is accepted, so any uppercase letter makes the
    result False. An empty string yields True.

    :param string: the text to test
    :return: True when all characters are within 'a'..'z', else False
    """
    return all('a' <= char <= 'z' for char in string)
def is_other(uchar):
    """Return True if `uchar` is neither Chinese, a digit, nor a letter.

    The decision is delegated to `is_chinese`, `is_number` and `is_alphabet`,
    tried in that order.

    :param uchar: the character to test
    :return: True when none of the three predicates accepts the character
    """
    predicates = (is_chinese, is_number, is_alphabet)
    return not any(predicate(uchar) for predicate in predicates)
def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters whose code point is outside 0x20..0x7e are returned unchanged.

    :param uchar: a single character
    :return: the full-width counterpart, or `uchar` itself if it is not a
        half-width character
    """
    code_point = ord(uchar)
    if not 0x0020 <= code_point <= 0x7e:
        return uchar
    # Space maps to the ideographic space U+3000; every other half-width
    # character sits exactly 0xfee0 below its full-width counterpart.
    if code_point == 0x0020:
        return chr(0x3000)
    return chr(code_point + 0xfee0)
def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    :param uchar: a single character
    :return: the half-width counterpart, or `uchar` itself when the converted
        code point does not land in 0x20..0x7e
    """
    code_point = ord(uchar)
    # The ideographic space U+3000 is the one exception to the fixed
    # 0xfee0 offset between full-width and half-width characters.
    half_width = 0x0020 if code_point == 0x3000 else code_point - 0xfee0
    if 0x0020 <= half_width <= 0x7e:
        return chr(half_width)
    return uchar
def stringQ2B(ustring):
    """Convert every full-width character in `ustring` to half-width.

    :param ustring: the text to convert
    :return: a new string with `Q2B` applied to each character
    """
    return "".join(map(Q2B, ustring))
def uniform(ustring):
    """Normalise a string: full-width to half-width, then lowercase.

    :param ustring: the text to normalise
    :return: the half-width, lowercased text
    """
    half_width = stringQ2B(ustring)
    return half_width.lower()
def remove_punctuation(strs):
    """
    Strip surrounding whitespace and delete punctuation and whitespace.

    Every run of characters listed in the two character classes below
    (whitespace plus a set of ASCII and CJK punctuation marks) is removed.

    :param strs: the text to clean
    :return: the text with those characters removed
    """
    # A raw string keeps the regex escapes intact; in a normal string literal
    # sequences such as backslash-s are invalid string escapes and trigger a
    # DeprecationWarning / SyntaxWarning on modern interpreters.
    pattern = r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
    return re.sub(pattern, "", strs.strip())
def traditional2simplified(sentence):
    """
    Convert the traditional Chinese characters in `sentence` to simplified.

    The work is delegated to a `Converter` built for the 'zh-hans' target.

    :param sentence: the sentence to convert
    :return: the sentence returned by the converter
    """
    to_simplified = Converter('zh-hans')
    return to_simplified.convert(sentence)
def simplified2traditional(sentence):
    """
    Convert the simplified Chinese characters in `sentence` to traditional.

    The work is delegated to a `Converter` built for the 'zh-hant' target.

    :param sentence: the sentence to convert
    :return: the sentence returned by the converter
    """
    to_traditional = Converter('zh-hant')
    return to_traditional.convert(sentence)
def get_homophones_by_char(input_char):
    """
    Collect the Chinese characters that share the pinyin of `input_char`.

    Pinyin is compared in the toneless NORMAL style, so characters that
    differ only by tone are treated as homophones.

    :param input_char: the character whose homophones are wanted
    :return: list of characters in U+4E00..U+9FA5 with the same pinyin
    """
    # The target reading does not change during the scan, so compute it once
    # instead of once per candidate (20902 candidates).
    target_pinyin = pinyin(input_char, style=pypinyin.NORMAL)[0][0]
    # The CJK Unified Ideographs range 0x4E00-0x9FA5 holds the 20902
    # commonly referenced Chinese characters.
    candidates = map(chr, range(0x4e00, 0x9fa6))
    return [
        candidate
        for candidate in candidates
        if pinyin([candidate], style=pypinyin.NORMAL)[0][0] == target_pinyin
    ]
def get_homophones_by_pinyin(input_pinyin):
    """
    Collect the Chinese characters whose pinyin equals `input_pinyin`.

    :param input_pinyin: the pinyin to match, written in TONE2 style
        (for example, the character 中 is rendered as zho1ng)
    :return: list of characters in U+4E00..U+9FA5 with that pinyin
    """
    # The CJK Unified Ideographs range 0x4E00-0x9FA5 holds the 20902
    # commonly referenced Chinese characters.
    candidates = (chr(code_point) for code_point in range(0x4e00, 0x9fa6))
    return [
        candidate
        for candidate in candidates
        if pinyin([candidate], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]
if __name__ == "__main__":
    # Quick manual demo of the helpers above; output goes to stdout.
    sample = 'nihao'
    print(sample, is_alphabet_string(sample))

    # Round-trip each code point in 0x20..0x7e through B2Q and back via Q2B.
    for code_point in range(0x0020, 0x007F):
        full_width = B2Q(chr(code_point))
        print(Q2B(full_width), full_width)

    # Normalisation: full-width to half-width plus lowercasing.
    normalised = uniform('中国 人名a高频A 扇')
    print(normalised)

    print(is_other(','))
    print(uniform('你干么!d7&888学英 语ABC?nz'))

    # Chinese-character predicates, single character and whole string.
    print(is_chinese('喜'))
    for text in ('喜,', '丽,'):
        print(is_chinese_string(text))