-
Notifications
You must be signed in to change notification settings - Fork 0
/
kurdish-words-corrector.py
188 lines (151 loc) · 8.15 KB
/
kurdish-words-corrector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import json
from itertools import product
import argparse
import re
from multiprocessing.pool import ThreadPool
# Command-line interface definition. Every option is optional; the entry
# point at the bottom of the file decides what to do based on which of
# --word / --text / --file was supplied.
parser = argparse.ArgumentParser(
    description='Correct Kurdish words especially the ones that went through wrong '
                'unicode settings')
parser.add_argument('-w', '--word', dest='word', type=str,
                    help='The word that you want to correct it')
parser.add_argument('-t', '--text', dest='text', type=str,
                    help='The text to correct its words')
parser.add_argument('-f', '--file', dest='file', type=str,
                    help='The file path to correct words from its text')
parser.add_argument('-o', '--output', dest='output', type=str,
                    help='The output file, if you want to save the results')
parser.add_argument('-d', '--depth', dest='depth', type=int,
                    help='values from 1-3 depend on how depth (then slow) '
                         'you want to correct, 1 is the fastest and the lest'
                         ' depth, 3 is the slowest the most depth')
parser.add_argument('-p', '--parser', dest='parser', type=str,
                    help='Parse the output file (json or yaml), default=yaml')
parser.add_argument('-wr', '--workers', dest='workers', type=int,
                    help='The number of workers (threads), default=100')

# Only consume the real command line when this file is executed as a script.
# When it is imported as a library, parsing the host program's sys.argv could
# abort the import with SystemExit on an unknown option, so fall back to an
# empty argument list (every attribute of ``args`` is then None).
if __name__ == "__main__":
    args = parser.parse_args()
else:
    args = parser.parse_args([])
# Letter-confusion tables used by correct_word(), one per search depth.
# Each key is a letter that may have been mistyped; its list holds the letters
# to try in its place. Key order matters: replacements are applied in this
# order, so it must not be rearranged.

# Depth 1: diacritic pairs plus the w/v swap.
common_mistakes_dict_1 = {
    'e': ['ê'],
    'u': ['û'],
    'i': ['î'],
    's': ['ş'],
    'c': ['ç'],
    'w': ['v'],
    'ê': ['e'],
    'û': ['u'],
    'î': ['i'],
    'ş': ['s'],
    'ç': ['c'],
    'v': ['w'],
}

# Depth 2: depth 1 plus the u/û/o/w confusions.
common_mistakes_dict_2 = {
    'e': ['ê'],
    'u': ['û', 'o', 'w'],
    'i': ['î'],
    's': ['ş'],
    'c': ['ç'],
    'w': ['v', 'o'],
    'ê': ['e'],
    'û': ['u', 'o', 'w'],
    'î': ['i'],
    'ş': ['s'],
    'ç': ['c'],
    'v': ['w'],
    'o': ['u', 'û', 'w'],
}

# Depth 3: depth 2 plus the e/i and ê/î confusions.
common_mistakes_dict_3 = {
    'e': ['ê', 'i'],
    'u': ['û', 'o', 'w'],
    'i': ['î', 'e'],
    's': ['ş'],
    'c': ['ç'],
    'w': ['v', 'o'],
    'ê': ['e', 'î'],
    'û': ['u', 'o', 'w'],
    'î': ['i', 'ê'],
    'ş': ['s'],
    'ç': ['c'],
    'v': ['w'],
    'o': ['u', 'û', 'w'],
}
# Load the reference lexicon: one known-good word per line. The path is
# relative, so it is resolved against the current working directory.
# The context manager guarantees the handle is closed even if reading or
# decoding fails part-way through.
with open('correct_words.txt', 'r', encoding='utf-8') as f:
    correct_words = f.read().split('\n')
def correct_word(word, depth=1):
    """Look *word* up in the lexicon and, if absent, search for near misses.

    The lower-cased word is first checked against ``correct_words``. If it is
    not found, every combination of letter substitutions from the confusion
    table for *depth* is applied and the variants that exist in the lexicon
    are reported.

    Each substitution uses ``str.replace`` and therefore rewrites *all*
    occurrences of a letter at once; substitutions are applied in the table's
    key order, so a later key can act on the output of an earlier one.

    Args:
        word: The word to check.
        depth: 2 or 3 select the wider confusion tables; any other value
            uses the depth-1 table.

    Returns:
        A dict with ``word`` (as passed in), ``message`` and ``status``:
        0 = word is in the lexicon, 1 = not found but similar words exist
        (listed, sorted, under ``possibilities``), 2 = nothing similar found.
    """
    if depth == 3:
        mistakes_table = common_mistakes_dict_3
    elif depth == 2:
        mistakes_table = common_mistakes_dict_2
    else:
        mistakes_table = common_mistakes_dict_1

    lowered = word.lower()
    if lowered in correct_words:
        return {'word': word, 'message': 'Is a correct word', 'status': 0}

    # Every letter may also stay as it is, so add the letter itself to its own
    # alternatives. This is done on per-call copies: the module-level tables
    # are shared between worker threads and must not be mutated here.
    letters = list(mistakes_table)
    choices = [
        alternatives if letter in alternatives else alternatives + [letter]
        for letter, alternatives in mistakes_table.items()
    ]

    # One candidate per element of the cartesian product of the choices.
    candidates = set()
    for replacements in product(*choices):
        variant = lowered
        for old, new in zip(letters, replacements):
            variant = variant.replace(old, new)
        candidates.add(variant)

    # A single pass over the lexicon instead of one list scan per candidate.
    # Sorting makes the order (and hence the first suggestion) reproducible;
    # plain set iteration order changes between interpreter runs.
    possibilities = sorted(candidates.intersection(correct_words))

    if not possibilities:
        return {'word': word, 'message': 'Is not in our database, and we didn\'t find a similar word',
                'status': 2}
    return {'word': word, 'message': 'Is not in our database, and we found similar word/s', 'status': 1,
            'possibilities': possibilities}
def correct_text(text, output_path=None, depth=1, parser="yaml", workers=100):
    """Check every word of *text* and optionally save a corrected copy.

    Args:
        text: The text to analyse.
        output_path: If None, nothing is written and the report is returned.
            Otherwise the corrected text is written to this path and the
            report to a sibling file whose name has ``-states`` inserted
            before the first ``.`` of the file name (``out.txt`` ->
            ``out-states.txt``; ``out`` -> ``out-states``).
        depth: Search depth forwarded to the word checker.
        parser: ``"json"`` serialises the saved report with ``json.dumps``;
            any other value (including the default ``"yaml"``) saves the
            ``str()`` of the report dict.
        workers: Number of worker threads used for checking.

    Returns:
        ``str()`` of the report dict when *output_path* is None, else None.
    """
    import os  # function-scope import: only needed to derive the report path

    all_words = [token for token in __split_text(text).strip().split("\n") if token]
    output = __get_output(all_words, depth, workers)
    if output_path is None:
        return str(output)

    # Insert "-states" into the file name only. Splitting the whole path on
    # its first "." mangles paths such as "./out.txt", and for a name without
    # any "." it yields the output path itself, so the report would overwrite
    # the corrected text.
    file_name = os.path.basename(output_path)
    stem, dot, rest = file_name.partition('.')
    directory_prefix = output_path[:len(output_path) - len(file_name)]
    output_states_path = directory_prefix + stem + '-states' + dot + rest

    # Must be the same alphabet the tokenizer uses, so that "whole word" here
    # means exactly one token.
    kurdish_letters = "ABCÇDEÊFGHIÎJKLMNOPQRSŞTUÛVWXYZabcçdeêfghiîjklmnopqrsştuûvwxyz"
    not_after_letter = "(?<![" + kurdish_letters + "])"
    not_before_letter = "(?![" + kurdish_letters + "])"

    for item in output['incorrect_words_with_possible_corrections']:
        word = item['word']
        replacer = item['possibilities'][0]
        if word[0].isupper():
            replacer = replacer.capitalize()
        # Replace the first *whole-word* occurrence only; a plain
        # str.replace could hit the same letters inside an earlier,
        # longer word.
        whole_word = re.compile(not_after_letter + re.escape(word) + not_before_letter)
        text = whole_word.sub(lambda _match, value=replacer: value, text, count=1)

    if parser == "json":
        output = json.dumps(output)
    __save_to_file(text, output_path)
    __save_to_file(str(output), output_states_path)
def correct_file(file_path, output_path=None, depth=1, parser="yaml", workers=100):
    """Read the file at *file_path* and run correct_text() on its contents.

    All other arguments are forwarded unchanged; the return value is whatever
    correct_text() returns.
    """
    return correct_text(
        __read_from_file(file_path),
        output_path,
        depth,
        parser,
        workers,
    )
def __get_output(all_words, depth=1, workers=100):
    """Check every word in *all_words* and build the summary report.

    Args:
        all_words: List of words to check (duplicates are checked again).
        depth: Search depth forwarded to correct_word().
        workers: Upper bound on the number of worker threads.

    Returns:
        A dict with the per-status result lists (``correct_words``,
        ``incorrect_words_with_possible_corrections``,
        ``incorrect_words_without_possible_corrections``) followed by the
        counters ``total_words``, ``total_incorrect``,
        ``total_incorrect_with_corrections``,
        ``total_incorrect_without_corrections`` and ``incorrect_percentage``.
        ``total_incorrect`` counts a word without suggestions as 0.5.
    """
    if all_words:
        # Never start more threads than there are words to check. The
        # context manager releases the pool even if a worker raises;
        # starmap keeps the results in input order.
        with ThreadPool(min(workers, len(all_words))) as pool:
            results = pool.starmap(correct_word, [(word, depth) for word in all_words])
    else:
        results = []

    # Bucket the results by status code; unknown codes are ignored.
    by_status = {0: [], 1: [], 2: []}
    for item in results:
        bucket = by_status.get(item['status'])
        if bucket is not None:
            bucket.append(item)
    current_correct_words = by_status[0]
    incorrect_words_with_possible_corrections = by_status[1]
    incorrect_words_without_possible_corrections = by_status[2]

    output = dict()
    output['correct_words'] = current_correct_words
    output['incorrect_words_with_possible_corrections'] = incorrect_words_with_possible_corrections
    output['incorrect_words_without_possible_corrections'] = incorrect_words_without_possible_corrections
    output['total_words'] = len(all_words)
    output['total_incorrect'] = len(incorrect_words_with_possible_corrections) + 0.5 * len(
        incorrect_words_without_possible_corrections)
    output['total_incorrect_with_corrections'] = len(incorrect_words_with_possible_corrections)
    output['total_incorrect_without_corrections'] = len(incorrect_words_without_possible_corrections)
    # A text without any word would otherwise divide by zero.
    if output['total_words']:
        output['incorrect_percentage'] = (output['total_incorrect'] * 100) / (output['total_words'])
    else:
        output['incorrect_percentage'] = 0.0
    return output
def __save_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing file.

    The context manager closes the handle even if the write raises.
    """
    with open(file_path, "w", encoding="UTF-8") as handle:
        handle.write(text)
def __read_from_file(file_path):
    """Return the whole content of the UTF-8 text file at *file_path*.

    The context manager closes the handle even if reading or decoding raises.
    """
    with open(file_path, 'r', encoding='UTF-8') as handle:
        return handle.read()
def __split_text(text, delimiter="\n"):
    """Return the words of *text* joined by *delimiter*.

    A word is a maximal run of Kurdish Latin letters; every run of any other
    characters acts as a single separator. Text that starts or ends with a
    separator yields an empty first or last piece, so the result may begin or
    end with *delimiter*.
    """
    kurdish_letters = "ABCÇDEÊFGHIÎJKLMNOPQRSŞTUÛVWXYZabcçdeêfghiîjklmnopqrsştuûvwxyz"
    separator_run = "[^" + kurdish_letters + "]+"
    pieces = re.split(separator_run, text)
    return delimiter.join(pieces)
if __name__ == "__main__":
    # Script entry point: fill in defaults for the options that were not
    # given, then dispatch on --word, --text or --file (in that priority).
    depth = 1 if args.depth is None else args.depth
    workers = 100 if args.workers is None else args.workers

    # NOTE: from here on ``parser`` is the output format name, no longer the
    # ArgumentParser. The value is normalised to lower case because the
    # callee compares it with "json" exactly; passing "JSON" through
    # unchanged would silently disable JSON output.
    parser = "yaml"
    if args.parser is not None and args.parser.lower() in ("yaml", "json"):
        parser = args.parser.lower()

    if args.word is not None and args.word.strip():
        print(correct_word(args.word, depth=depth))
    elif args.text is not None and args.text.strip():
        if args.output is None:
            print(correct_text(args.text, depth=depth, parser=parser, workers=workers))
        else:
            correct_text(args.text, output_path=args.output, depth=depth, parser=parser, workers=workers)
            print("Success:", "Corrected text has been saved saved to", args.output)
    elif args.file is not None and args.file.strip():
        if args.output is None:
            print(correct_file(args.file, depth=depth, parser=parser, workers=workers))
        else:
            correct_file(args.file, output_path=args.output, depth=depth, parser=parser, workers=workers)
            print("Success:", "Corrected text has been saved saved to", args.output)
    else:
        # Nothing usable was supplied: point the user at the help text.
        print("Run 'python kurdish-words-corrector.py -h to find the usage'")