-
Notifications
You must be signed in to change notification settings - Fork 1
/
util_text.py
133 lines (78 loc) · 2.94 KB
/
util_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Module-level metadata: author of this file.
__author__ = 'jwchung'
import re
from xml.sax.saxutils import escape as xml_escape, quoteattr as xml_quoteattr
def gen_quote_text(text):
    """Return `text` as a quoted, XML-escaped attribute value.

    ``quoteattr`` escapes ``&``, ``<`` and ``>`` itself before wrapping the
    value in quote characters, so it is applied to the raw text only.
    Running ``escape`` first would escape the ampersand of every generated
    entity a second time (``&`` would end up as ``&amp;amp;``).

    :param text: raw text to be used as an attribute value
    :return: the escaped text, including the surrounding quote characters
    """
    return xml_quoteattr(text)
def get_unicode_text(text):
    """Return `text` as a unicode (text) string, decoding UTF-8 bytes.

    Text that is already decoded is returned unchanged instead of raising,
    and the ``unicode`` builtin is avoided so the helper does not depend on
    a Python-2-only name.

    :param text: UTF-8 encoded byte string, or an already decoded string
    :return: the decoded text
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if isinstance(text, text_type):
        return text
    return text.decode('utf8')
def get_alternative_text_for_letter_counting(text):
    """Return the decoded form of `text` used for letter counting.

    Every horizontal-ellipsis character is replaced by a single period
    before the text is decoded with ``get_unicode_text``.

    :param text: raw text
    :return: decoded text with ellipses replaced by periods
    """
    normalized = text.replace('…', '.')
    return get_unicode_text(normalized)
def get_text_fragment_with_letter_idx(text, begin=0, end=0):
    """Return the UTF-8 encoded slice ``[begin:end]`` of `text`.

    Indices refer to letters of the decoded text (as produced by
    ``get_alternative_text_for_letter_counting``), not to bytes.

    :param text: raw text
    :param begin: index of the first letter of the fragment
    :param end: index one past the last letter; when it is 0 and `begin`
        is positive, the fragment runs to the end of the text
    :return: the selected fragment, encoded as UTF-8
    """
    letters = get_alternative_text_for_letter_counting(text)
    # Note: with both begin and end left at 0 the slice is empty.
    stop = len(letters) if (begin > 0 and end == 0) else end
    fragment = letters[begin:stop]
    return fragment.encode('utf8')
def count_num_letters(text):
    """Return the number of letters in `text`.

    The count is the length of the decoded text returned by
    ``get_alternative_text_for_letter_counting``.

    :param text: raw text
    :return: number of letters
    """
    letters = get_alternative_text_for_letter_counting(text)
    return len(letters)
def count_num_words(text):
    """Return the number of words in `text`.

    Leading and trailing whitespace is stripped, then the words are
    obtained from ``get_word_list``.

    :param text: raw text
    :return: number of words
    """
    words = get_word_list(text.strip())
    return len(words)
def get_letter_list(raw_text):
    """Split `raw_text` into its individual letters.

    The text is decoded with ``get_unicode_text`` and every resulting
    character is re-encoded as UTF-8.

    :param raw_text: raw text
    :return: list with one UTF-8 encoded string per letter
    """
    return [ch.encode('utf8') for ch in get_unicode_text(raw_text)]
def get_word_list(raw_text):
    """Split `raw_text` into words on runs of whitespace.

    :param raw_text: raw text
    :return: list of words; empty when the text has no non-whitespace
    """
    words = raw_text.split()
    return words
def get_sent_list(raw_text):
    """Split `raw_text` into sentences, one per line.

    Any run of carriage returns and/or newlines counts as one separator,
    so blank lines between sentences produce no empty entries. A separator
    at the very start or end of the text still yields an empty string there.

    :param raw_text: raw text
    :return: list of line strings
    """
    return re.split(r'[\r\n]+', raw_text)
def remove_all_tags_from_anno_text(anno_text):
    """Strip every ``<...>`` tag from `anno_text`.

    A tag is the shortest span that starts with ``<``, contains at least
    one character and ends with ``>``; it may span several lines.

    :param anno_text: annotated text
    :return: the text with all tags removed
    """
    tag_pattern = re.compile(r'<.+?>', flags=re.DOTALL)
    return tag_pattern.sub(r'', anno_text)
def get_num_of_preceding_letters(text_part, whole_text):
    """Count the letters that precede `text_part` in `whole_text`.

    Tags in the preceding text are removed before counting. Only the first
    occurrence of `text_part` is considered.

    :param text_part: fragment to locate inside `whole_text`
    :param whole_text: annotated text that contains `text_part`
    :return: number of letters before the fragment, tags excluded
    :raises ValueError: if `text_part` does not occur in `whole_text`
    """
    # index() rather than find(): find() reports a miss as -1, which would
    # silently slice off only the last character and yield a bogus count.
    char_idx = whole_text.index(text_part)
    raw_prec_text = remove_all_tags_from_anno_text(whole_text[:char_idx])
    return count_num_letters(raw_prec_text)
def get_num_of_preceding_words(text_part, whole_text):
    """Count the complete words that precede `text_part` in `whole_text`.

    Tags in the preceding text are removed before counting. When the
    preceding text does not end in whitespace, its last word is still open
    at the point where `text_part` starts and is therefore not counted.
    Only the first occurrence of `text_part` is considered.

    :param text_part: fragment to locate inside `whole_text`
    :param whole_text: annotated text that contains `text_part`
    :return: number of complete words before the fragment
    :raises ValueError: if `text_part` does not occur in `whole_text`
    """
    # index() rather than find(): find() reports a miss as -1, which would
    # silently slice off only the last character and yield a bogus count.
    char_idx = whole_text.index(text_part)
    raw_prec_text = remove_all_tags_from_anno_text(whole_text[:char_idx])
    if not raw_prec_text:
        return 0
    num_words = count_num_words(raw_prec_text)
    if re.match(r'\s', raw_prec_text[-1]):
        return num_words
    # The trailing word is unfinished, so it does not count as preceding.
    return num_words - 1
def get_num_of_preceding_sents(text_part, whole_text):
    """Count the sentence breaks that precede `text_part` in `whole_text`.

    Tags in the preceding text are removed, every run of carriage returns
    and/or newlines is collapsed into a single newline, and the remaining
    newlines are counted. Only the first occurrence of `text_part` is
    considered.

    :param text_part: fragment to locate inside `whole_text`
    :param whole_text: annotated text that contains `text_part`
    :return: number of line breaks before the fragment
    :raises ValueError: if `text_part` does not occur in `whole_text`
    """
    # index() rather than find(): find() reports a miss as -1, which would
    # silently slice off only the last character and yield a bogus count.
    char_idx = whole_text.index(text_part)
    raw_prec_text = remove_all_tags_from_anno_text(whole_text[:char_idx])
    raw_prec_text = re.sub(r'[\r\n]+', '\n', raw_prec_text)
    return raw_prec_text.count('\n')
def get_num_of_preceding_words_in_sent(text_part, whole_text):
    """Count the complete words before `text_part` within its own line.

    Tags in the preceding text are removed and only the text after the last
    newline is considered, with its leading whitespace dropped. When that
    text does not end in whitespace, its last word is still open at the
    point where `text_part` starts and is therefore not counted. Only the
    first occurrence of `text_part` is considered.

    :param text_part: fragment to locate inside `whole_text`
    :param whole_text: annotated text that contains `text_part`
    :return: number of complete words before the fragment on its line
    :raises ValueError: if `text_part` does not occur in `whole_text`
    """
    # index() rather than find(): find() reports a miss as -1, which would
    # silently slice off only the last character and yield a bogus count.
    char_idx = whole_text.index(text_part)
    raw_prec_text = remove_all_tags_from_anno_text(whole_text[:char_idx])
    leading_text_in_sent = raw_prec_text.split('\n')[-1].lstrip()
    if not leading_text_in_sent:
        return 0
    num_words = count_num_words(leading_text_in_sent)
    if re.match(r'\s', leading_text_in_sent[-1]):
        return num_words
    # The trailing word is unfinished, so it does not count as preceding.
    return num_words - 1
def extract_field_value_pairs_from_attr_span(attr_span):
    """Extract ``field=value`` pairs from an attribute string.

    Both double-quoted values (``name="some value"``) and bare values
    (``name=value``) are recognised. Quoted pairs are returned first,
    followed by the bare pairs, each group in order of appearance.

    :param attr_span: attribute text, e.g. ``type="x" id=3``
    :return: list of ``(field, value)`` tuples; quotes are not included
    """
    quoted_pat = r'(\S+)\s*=\s*"(.*?)"'
    bare_pat = r'(\S+)\s*=\s*([^">=\s]+)'
    quoted_pairs = re.findall(quoted_pat, attr_span)
    # Blank out the quoted pairs before looking for bare values; otherwise
    # an "=" inside a quoted value (e.g. href="x?a=1") is picked up as a
    # spurious extra pair.
    remainder = re.sub(quoted_pat, ' ', attr_span)
    bare_pairs = re.findall(bare_pat, remainder)
    return quoted_pairs + bare_pairs