-
Notifications
You must be signed in to change notification settings - Fork 14
/
trans.py
151 lines (118 loc) · 3.74 KB
/
trans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gc,re, csv, pickle, enchant
from transNNP import handleNewToken, freqTokens
import pandas as pd
# from anuvaad import Anuvaad
# anu =Anuvaad('english-telugu')
from deeptranslit import DeepTranslit
# Transliteration callable: the bound `transliterate` method of a DeepTranslit
# object constructed with 'telugu'. It is created once, when the module is imported.
translit = DeepTranslit('telugu').transliterate
# Confidence cut-off for the transliteration's `prob` value.
# NOTE(review): the fallback this was meant to gate is commented out in
# transTelugu, so the value currently does not change any result.
threshold = 0.6
# String forms that mark an attribute value as missing / not applicable.
_INVALID_STRINGS = frozenset([
    "[]", '', "None", 'none', 'N/A', 'n/a', 'Not Applicable', 'not applicable', 'nan',
    'Others', 'others', 'No Boundary Wall', 'no boundary wall', 'No Building', 'no building',
    'Unrecognised', 'unrecognised',
])


# Checks if an attribute value is valid
def is_valid(value):
    """Return True when *value* carries usable data.

    Rules, applied in order:
      * list   -> valid when non-empty
      * bool   -> the value itself
      * None, anything pandas reports as null, or a value whose ``str()``
        is one of the placeholder strings in ``_INVALID_STRINGS`` -> invalid
      * int / float -> valid only when strictly positive
      * anything else -> valid unless it equals '', 'nan' or '-1'
    """
    if isinstance(value, list):
        return len(value) > 0
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return value
    if value is None or pd.isnull(value) or str(value) in _INVALID_STRINGS:
        return False
    if isinstance(value, (float, int)):
        # NaN was already rejected by pd.isnull above.
        return bool(value > 0)
    return value not in ('', 'nan', '-1')
# Translation table that deletes square brackets, parentheses and commas.
_STRIP_PUNCT = str.maketrans('', '', '[](),')


def clean(token):
    """Return *token* with every '[', ']', '(', ')' and ',' removed.

    All other characters (including '.') are kept, in their original order.
    """
    return token.translate(_STRIP_PUNCT)
def masterHandleTitle(title):
    """Build the Telugu form of *title*, one token at a time.

    The title is upper-cased and spaced out (around brackets, after a leading
    school-type prefix such as ZPHS/MPPS/APMS, and between letters and
    digits), then split on commas and spaces. Each token is stripped of
    brackets with ``clean`` and resolved in this order:

      1. the cleaned token is looked up in ``freqTokens``;
      2. the cleaned token without dots is looked up in ``freqTokens``;
      3. otherwise it is passed to ``handleNewToken`` together with the
         module-level ``translit`` callable.

    Brackets removed by ``clean`` are put back around the result.

    Returns the assembled string with whitespace collapsed and trimmed.
    A *title* rejected by ``is_valid`` is returned unchanged.
    """
    # Pre process
    if not is_valid(title):
        return title
    title = title.upper()
    # NOTE(review): as written this replaces '&' with itself (a no-op); the
    # pattern was presumably an HTML entity such as '&amp;' -- confirm.
    title = re.sub('&', '&', title)
    # Pad brackets with a space so they split away from the neighbouring word.
    # Character classes are used so that a single '(' or '[' matches, mirroring
    # the post-processing patterns below.
    title = re.sub(r'([\(\[])', r' \g<1>', title)
    title = re.sub(r'([\)\]])', r'\g<1> ', title)
    # Separate a leading school-type prefix from the rest of the name.
    title = re.sub(r'^((MP|ZP|G|GA)(PS|PPS|HS|UPS)|APMS)', r'\g<1> ', title)
    # Separate letters from adjacent digits.
    title = re.sub(r'([a-zA-Z])([0-9])', r'\g<1> \g<2>', title)
    title = re.sub(r'([0-9])([a-zA-Z])', r'\g<1> \g<2>', title)

    # Translation/Transliteration
    teTokens = []
    for token in re.split(r',+| +', title):
        if not token:
            # Adjacent separators (e.g. ", ") produce empty pieces; skip them.
            continue
        cleanToken = clean(token)
        withoutDot = cleanToken.replace('.', '')
        # Most Frequent & Imp Tokens
        if cleanToken in freqTokens:
            teToken = freqTokens[cleanToken]
        elif withoutDot in freqTokens:
            teToken = freqTokens[withoutDot]
        # New Token
        else:
            teToken = handleNewToken(cleanToken, translit)
        # Put back the brackets that clean() removed. A literal replace is
        # used so dots or other regex metacharacters in the token are safe.
        if cleanToken != token and cleanToken in token:
            teToken = token.replace(cleanToken, teToken)
        teTokens.append(teToken)
    teTitle = ' '.join(teTokens)

    # Post Processing
    # NOTE(review): same no-op '&' replacement as in pre-processing -- confirm.
    teTitle = re.sub('&', '&', teTitle)
    # Drop the space just inside opening / closing brackets.
    teTitle = re.sub(r'([\(\[]) ', r'\g<1>', teTitle)
    teTitle = re.sub(r' ([\)\]])', r'\g<1>', teTitle)
    teTitle = re.sub(r'\s+', ' ', teTitle)
    return teTitle.strip()
def process(phrase):
    """Prepare *phrase* for transliteration.

    For every whitespace-separated word:
      * underscores become spaces (and a space is appended to that word);
      * '(' becomes '( ' and ')' becomes ') '.

    The words are re-joined with single spaces and the result is trimmed.
    Spaces introduced by the steps above are kept, so the output may contain
    runs of two spaces (e.g. after a word that contained an underscore).
    """
    pieces = []
    for word in phrase.split():
        if '_' in word:
            word = word.replace('_', ' ') + ' '
        word = word.replace('(', '( ').replace(')', ') ')
        pieces.append(word)
    return ' '.join(pieces).strip()
def handleExceptions(pred, anuTelugu):
    """Apply fixed Telugu word substitutions to both candidate strings.

    Every occurrence of the words for "boys" / "girls" listed below is
    replaced by its preferred alternative, in *pred* and in *anuTelugu*.

    Returns the pair ``(pred, anuTelugu)`` after substitution.
    """
    replacements = (
        ('అబ్బాయిలు', 'బాలురు'),
        ('అమ్మాయిలు', 'బాలికలు'),
    )
    for old, new in replacements:
        pred = pred.replace(old, new)
        anuTelugu = anuTelugu.replace(old, new)
    return pred, anuTelugu
def translate(phrase, translator=None):
    """Translate *phrase* word by word and return the joined result.

    A word containing '.' is treated as an abbreviation: it is split on the
    dots, each piece is translated separately, and the pieces are re-joined
    with dots. Any other word is translated as a whole. Translated words are
    joined with single spaces and the result is trimmed.

    Args:
        phrase: text to translate; split on whitespace.
        translator: callable taking one string and returning its translation.
            Defaults to ``anu.anuvaad`` from module scope.

    Raises:
        NameError: if *translator* is omitted and the module-level ``anu``
            object does not exist (its setup is commented out in this file).
    """
    if translator is None:
        try:
            translator = anu.anuvaad
        except NameError:
            raise NameError(
                "translate() needs the module-level 'anu' translator, whose "
                "setup is commented out; pass translator=... instead"
            ) from None

    words = []
    for word in phrase.split():
        if '.' in word:
            words.append('.'.join(translator(part) for part in word.split('.')))
        else:
            words.append(translator(word))
    return ' '.join(words).strip()
def transTelugu(phrase):
    """Transliterate *phrase* into Telugu.

    Returns the string 'nan' when ``is_valid`` rejects *phrase* (this already
    covers the literal 'others'). Otherwise the phrase is normalised with
    ``process``, passed to the module-level ``translit`` callable, and the
    'pred' field of the first result is run through ``handleExceptions``.

    The returned text is trimmed and followed by exactly one trailing space.

    NOTE(review): the original compared the result's 'prob' against
    ``threshold``, but both branches returned the same transliteration because
    the alternative translation path was commented out. That comparison is
    dropped here; reinstate it if a fallback is added back.
    """
    if not is_valid(phrase):
        return 'nan'
    best = translit(process(phrase))[0]
    pred, _ = handleExceptions(best['pred'], '')
    return pred.strip() + ' '
def main():
    """Smoke test: transliterate one sample place name and print the raw result."""
    sample = 'anantapur'
    result = translit(sample)
    print(result)
    # The equivalent Anuvaad call is disabled along with its import:
    # print(anu.anuvaad('anantapur'))


# Run the smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    main()