forked from JoeSham/text_summarizer_czech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
separator.py
186 lines (141 loc) · 8.25 KB
/
separator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Separates text into sentences.
Written by Petr Machovec
"""
import os
def _load_symbol_set(path):
    """Return the whitespace-stripped lines of the file at *path* as a frozenset."""
    with open(path, 'r') as data_file:
        return frozenset(line.strip() for line in data_file)


def separate(input_string):
    """Split a text into a list of sentences.

    The decision rules are driven by four data files located in the
    ``separator_data`` directory next to this module (one entry per line):

    * ``abbreviations.txt`` - words (compared lower-cased, without the dot)
      after which a dot does not end a sentence,
    * ``separators.txt`` - symbols that separate sentences,
    * ``starters.txt`` - symbols that can appear at the beginning of a sentence,
    * ``terminators.txt`` - symbols that can appear at the end of a sentence,
      after a separator.

    A new line always ends the current sentence (end of a paragraph).
    A separator ends a sentence only when the next significant character is
    an upper-case letter or a digit; a dot additionally must not follow an
    abbreviation, a single-letter word, or be part of an order number.

    Args:
        input_string: The text to split. Leading and trailing whitespace
            is ignored.

    Returns:
        A list of non-empty sentence strings in their order of appearance.

    Raises:
        IOError: If any of the data files cannot be opened. The message
            names every file that could not be opened.

    Side effects:
        Changes the process working directory to the directory of this module.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    # NOTE(review): changing the working directory is a process-wide side
    # effect. It is kept only because existing callers may rely on it; the
    # data files below are opened by absolute path and do not depend on it.
    os.chdir(base_dir)

    symbol_sets = {}
    missing = []
    for file_name in ("abbreviations.txt", "separators.txt",
                      "starters.txt", "terminators.txt"):
        try:
            symbol_sets[file_name] = _load_symbol_set(
                os.path.join(base_dir, "separator_data", file_name))
        except IOError:
            missing.append("Soubor " + file_name + " nenalezen")
    if missing:
        raise IOError("; ".join(missing))

    abbreviations = symbol_sets["abbreviations.txt"]
    separators = symbol_sets["separators.txt"]
    starters = symbol_sets["starters.txt"]
    terminators = symbol_sets["terminators.txt"]

    text = input_string.strip()
    length = len(text)
    sentences = []
    begin = 0  # start of the sentence currently being collected
    end = 0    # position of the character currently being examined

    # Main loop: reads the text character by character.
    while end < length:
        if text[end] == '\n':
            # New line - end of a paragraph.
            sentence = text[begin:end].strip()
            begin = end + 1
            if sentence:
                sentences.append(sentence)
                # The last word of the paragraph can be a sign (one word
                # starting with a lower-case letter) that was not split off
                # because a lower-case letter never starts a new sentence.
                last = end - 1
                while text[last].isspace():
                    last -= 1
                if text[last] not in separators:
                    # Paragraph is not finished by a separator: find the
                    # beginning of its last word.
                    word_start = last
                    while word_start >= 0 and not text[word_start].isspace():
                        word_start -= 1
                    sign = text[word_start + 1:last + 1]
                    # The sign must be strictly shorter than the paragraph
                    # text. Otherwise it is the whole paragraph (or reaches
                    # back before its start) and splitting it off would emit
                    # an empty sentence and index the text out of range.
                    if sign[0].islower() and len(sign) < len(sentence):
                        before = word_start
                        while text[before].isspace():
                            before -= 1
                        if text[before] in separators:
                            # A separator precedes the word: it really is a
                            # sign and becomes a sentence of its own.
                            sentences[-1] = sentence[:len(sentence) - len(sign)].strip()
                            sentences.append(sign)
        elif text[end] in separators:
            # A separator was found; what follows decides whether it ends
            # a sentence.
            sep_pos = end
            while end < length - 1 and text[end + 1] in terminators:
                end += 1  # terminators stay attached to the sentence
            # Move to the first 'sentence-begin-deciding' character behind the
            # separator. Starters act like whitespace here, but they are not
            # trimmed when they begin the next sentence.
            follow = end + 1
            while (follow < length and
                   (text[follow].isspace() or text[follow] in starters) and
                   text[follow] != '\n'):
                follow += 1

            make_sentence = False
            if follow >= length:
                # Only whitespace or starters remain - end of the text.
                sentence = text[begin:end + 1].strip()
                if sentence:
                    sentences.append(sentence)
                end = follow - 1
            elif text[follow] == '\n':
                # The new line is handled in the next iteration.
                end = follow - 1
            elif text[follow].isupper() or text[follow].isdigit():
                upper = text[follow].isupper()
                if text[end] != '.':
                    # Not a dot: it is the end of the sentence.
                    make_sentence = True
                else:
                    # A dot can also end an abbreviation or belong to an
                    # order number, so the word before it must be inspected.
                    word_end = sep_pos - 1
                    while word_end >= 0 and text[word_end].isspace():
                        word_end -= 1
                    word_start = word_end
                    while (word_start >= 0 and
                           not text[word_start].isspace() and
                           text[word_start] != '.'):
                        word_start -= 1
                    word_start += 1
                    # Starters in front of the word are not part of it.
                    while word_start < word_end and text[word_start] in starters:
                        word_start += 1
                    word = text[word_start:word_end + 1]

                    # Single characters are treated like abbreviations unless
                    # they are a digit or a terminator.
                    not_abbreviation = (
                        (len(word) != 1 or word.isdigit() or word in terminators) and
                        word.lower() not in abbreviations)
                    if not_abbreviation:
                        # Before an upper-case letter the dot ends the sentence.
                        # Before a digit it does so only when the word itself
                        # does not end with a digit.
                        make_sentence = upper or (
                            bool(word) and not word[-1].isdigit())

            if make_sentence:
                sentence = text[begin:end + 1].strip()
                if sentence:
                    sentences.append(sentence)
                begin = end + 1
        end += 1

    # When the text does not end with a separator, the last sentence has not
    # been stored yet and must be added separately.
    tail = end - 1
    while tail >= 0 and (text[tail] in terminators or text[tail].isspace()):
        tail -= 1
    if tail >= 0 and text[tail] not in separators:
        sentence = text[begin:end].strip()
        if sentence:
            sentences.append(sentence)
    return sentences