/
encoding.py
409 lines (356 loc) · 15.2 KB
/
encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Hive Administration Scripts
# Copyright (c) 2008-2019 Hive Solutions Lda.
#
# This file is part of Hive Administration Scripts.
#
# Hive Administration Scripts is free software: you can redistribute it and/or modify
# it under the terms of the Apache License as published by the Apache
# Foundation, either version 2.0 of the License, or (at your option) any
# later version.
#
# Hive Administration Scripts is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache License for more details.
#
# You should have received a copy of the Apache License along with
# Hive Administration Scripts. If not, see <http://www.apache.org/licenses/>.
# module level metadata, each value is followed by a bare string describing it;
# the "$LastChangedRevision$" and "$LastChangedDate$" values look like version
# control keyword placeholders (presumably expanded by the VCS, confirm before
# relying on their contents at runtime)
__author__ = "Luís Martinho <lmartinho@hive.pt> & João Magalhães <joamag@hive.pt>"
""" The author(s) of the module """
__version__ = "1.0.0"
""" The version of the module """
__revision__ = "$LastChangedRevision$"
""" The revision number of the module """
__date__ = "$LastChangedDate$"
""" The last change date of the module """
__copyright__ = "Copyright (c) 2008-2019 Hive Solutions Lda."
""" The copyright for the module """
__license__ = "Apache License, Version 2.0"
""" The license for the module """
import os
import sys
import getopt
import legacy
import admin_scripts.extra as extra
# the trailing backslash on each line continues the string literal, so the
# usage message is a single logical line (no newline characters are embedded
# between the listed options)
USAGE_MESSAGE = "encoding path [-r]\
[-s source_encoding]\
[-t target_encoding]\
[-u]\
[-x replacement_from_1, replacement_to_1, replacement_from_2, replacement_to_2, ...]\
[-e file_extension_1, file_extension_2, ...]\
[-w exclusion_1, exclusion_2, ...]\
[-c configuration_file]"
""" The usage message, to be printed when help is required
or when a command line error exists """
# the three bytes EF BB BF are the utf-8 encoding of the U+FEFF code point
# (byte order mark), compared against the start of the file contents
BOM_SEQUENCE = b"\xef\xbb\xbf"
""" The byte based sequence that defines the start of
an utf-8 bom encoded text file """
def has_encoding(string_buffer, encoding):
    """
    Checks whether the provided buffer can be decoded using the
    provided encoding.

    :type string_buffer: String
    :param string_buffer: The (byte) buffer to be tested.
    :type encoding: String
    :param encoding: The name of the encoding the buffer is tested against.
    :rtype: bool
    :return: True if the buffer was decoded without any error, False
    if any exception was raised while decoding it.
    """

    # a successful decode means the buffer is valid for the encoding, any
    # failure (invalid byte sequence, unknown encoding name, value without
    # a decode method) is reported as a plain negative answer
    try:
        string_buffer.decode(encoding)
    except Exception:
        return False
    else:
        return True
def apply_replacements_list(string_buffer, replacements_list):
    """
    Runs a sequence of (from, to) replacements over the provided
    buffer, in the order they are provided, and returns the result.

    :type string_buffer: String
    :param string_buffer: The buffer on which the replacements are
    going to be performed.
    :type replacements_list: List
    :param replacements_list: The sequence of two element (from, to)
    items describing each replacement, both elements are passed through
    legacy.bytes before being used.
    :rtype: String
    :return: The buffer with every replacement applied (the original
    buffer is returned unchanged when there are no replacements).
    """

    result = string_buffer

    # each item is unpacked directly in the loop header, an item that
    # doesn't have exactly two elements fails with the unpacking error
    for source, target in replacements_list:
        result = result.replace(legacy.bytes(source), legacy.bytes(target))

    return result
def convert_encoding(
    file_path,
    source_encoding,
    target_encoding,
    windows_newline = True,
    replacements_list = None
):
    """
    Converts the encoding of the specified file, rewriting the
    file in place with the converted contents.

    The contents are handled as bytes: newlines are normalized to
    LF, a leading utf-8 bom is removed and, only when the bytes can't
    already be decoded using the target encoding, they are decoded from
    the source encoding and encoded again into the target one. The
    replacements and the (optional) CR LF newline conversion are applied
    to the resulting bytes just before they are written.

    Any error raised while reading or converting (unknown encoding,
    invalid byte sequence, ...) is propagated to the caller, in that
    case the file is never opened for writing and so it's left untouched.

    :type file_path: String
    :param file_path: The path to the file to have its encoding converted.
    :type source_encoding: String
    :param source_encoding: The encoding from which the file is to be converted from.
    :type target_encoding: String
    :param target_encoding: The encoding to which the file is to be converted.
    :type windows_newline: bool
    :param windows_newline: If the windows newline should be used.
    :type replacements_list: List
    :param replacements_list: The list of replacements to perform.
    """

    # normalizes the file path and reads the complete (binary) contents
    # of the file, the handle is released as soon as the read is finished
    file_path_normalized = extra.normalize_path(file_path)
    with open(file_path_normalized, "rb") as file:
        string_value = file.read()

    # normalizes every newline variant into the simple LF newline so
    # that the remaining operations handle a single kind of newline
    string_value = string_value.replace(b"\r\n", b"\n")
    string_value = string_value.replace(b"\r", b"\n")

    # checks if the contents may already be decoded using the target
    # encoding, this test runs before the bom removal (original order)
    has_target_encoding = has_encoding(string_value, target_encoding)

    # in case the retrieved string value starts with the bom
    # (byte order mark) sequence it's removed as it's considered
    # deprecated as a method of detecting utf encoding
    if string_value.startswith(BOM_SEQUENCE):
        string_value = string_value[len(BOM_SEQUENCE):]

    # decodes the string value from the specified source encoding, this
    # operation may fail as the source encoding may only be a guess on
    # the true encoding of the file, then encodes the string value again
    # in the target encoding for the file, explicit conditions are used
    # (instead of "and/or" chains) so that an empty intermediate value is
    # never mistaken for "not converted"
    if not has_target_encoding:
        string_value_decoded = string_value.decode(source_encoding)
        string_value = string_value_decoded.encode(target_encoding)

    # applies the replacements if they're requested to be applied,
    # the result is used even if it's empty (all contents replaced)
    if replacements_list:
        string_value = apply_replacements_list(string_value, replacements_list)

    # applies the windows newline if specified, it does so by replacing
    # the simple newline character with the windows specific newline
    if windows_newline:
        string_value = string_value.replace(b"\n", b"\r\n")

    # opens the file for writing then writes the file string value
    # with the proper string values replaced and re-encoded into the
    # target character encoding (as expected)
    with open(file_path_normalized, "wb") as file:
        file.write(string_value)
def convert_encoding_walker(arguments, directory_name, names):
    """
    Walker method to be used by the path walker for the encoding conversion,
    converts every selected (non directory) entry of the current directory.

    A failure while converting one file is reported through a warning
    and doesn't stop the conversion of the remaining files.

    :type arguments: Tuple
    :param arguments: The arguments tuple sent by the walker method, in order:
    source encoding, target encoding, windows newline flag, replacements list,
    file extensions and file exclusion.
    :type directory_name: String
    :param directory_name: The name of the current directory in the walk.
    :type names: List
    :param names: The list of names in the current directory, the excluded
    names are removed from it in place.
    """

    # unpacks the arguments tuple
    source_encoding, target_encoding, windows_newline,\
        replacements_list, file_extensions, file_exclusion = arguments

    # tries to run the handle ignore operation for the current set of names and
    # in case there's a processing returns the control flow immediately as no
    # more handling is meant to occur for the current operation (ignored)
    if extra.handle_ignore(names): return

    # removes the complete set of names that are meant to be excluded from the
    # current set of names to be visited, the removal is done in place
    # (presumably so that the walker doesn't descend into them, confirm against
    # the walk implementation), an unset (none) exclusion excludes nothing
    for exclusion in file_exclusion or ():
        if not exclusion in names: continue
        names.remove(exclusion)

    # selects the complete names to be converted: directory entries are
    # skipped and, unless the extensions are unset (none), only the names
    # for which the text after the last dot is one of the extensions are kept,
    # the path is built with an explicit slash exactly as the original did
    valid_complete_names = []
    for name in names:
        complete_name = directory_name + "/" + name
        if os.path.isdir(complete_name): continue
        extension = os.path.split(complete_name)[-1].split(".")[-1]
        if file_extensions is not None and not extension in file_extensions: continue
        valid_complete_names.append(os.path.normpath(complete_name))

    # creates the string based value of the windows newline taking into
    # account the boolean value of it
    windows_newline_s = "windows newline" if windows_newline else "unix newline"

    # iterates over all the valid complete names with extension filter
    # to convert the respective file into the target encoding
    for valid_complete_name in valid_complete_names:
        # prints a message about the file that is now going to be converted
        # into the proper target encoding as defined in the specification
        extra.echo(
            "Convert encoding in file: %s (%s to %s) (%s)" %\
            (
                valid_complete_name,
                source_encoding,
                target_encoding,
                windows_newline_s
            )
        )

        try:
            # converts the encoding for the provided (path) name according to
            # a set of defined options, for various reasons this operation may
            # fail if such thing happens the operation is skipped
            convert_encoding(
                valid_complete_name,
                source_encoding,
                target_encoding,
                windows_newline,
                replacements_list
            )
        except Exception:
            # deliberate best effort, the failure is reported and the
            # walk continues with the next file
            extra.warn(
                "Failed converting encoding in file: %s (%s to %s)" %\
                (
                    valid_complete_name,
                    source_encoding,
                    target_encoding
                )
            )
def convert_encoding_recursive(
    directory_path,
    source_encoding,
    target_encoding,
    windows_newline,
    replacements_list = None,
    file_extensions = None,
    file_exclusion = None
):
    """
    Converts the file encoding in recursive mode.
    All the options are arguments to be passed to the
    walker function.

    :type directory_path: String
    :param directory_path: The path to the (entry point) directory.
    :type source_encoding: String
    :param source_encoding: The source encoding from which the file is to be
    converted.
    :type target_encoding: String
    :param target_encoding: The target encoding to which the file is to be
    converted to.
    :type windows_newline: bool
    :param windows_newline: If the windows newline should be used.
    :type replacements_list: List
    :param replacements_list: The list of replacements to perform.
    :type file_extensions: List
    :param file_extensions: The list of file extensions to be used, if unset
    (none) the walker applies no extension based filtering.
    :type file_exclusion: List
    :param file_exclusion: The list of file exclusion to be used, if unset
    (none) no name is excluded.
    """

    # the walker iterates over the exclusions, so the unset (default) value
    # is replaced by an empty sequence to avoid iterating over none, the
    # extensions are passed as they are because none is meaningful there
    if file_exclusion is None: file_exclusion = ()

    # packs the options in the order expected by the walker function
    arguments = (
        source_encoding,
        target_encoding,
        windows_newline,
        replacements_list,
        file_extensions,
        file_exclusion
    )
    legacy.walk(directory_path, convert_encoding_walker, arguments)
def main():
    """
    Main function used for the encoding conversion.

    Parses the command line (first argument is the path, followed by
    the options described in the usage message), builds the configurations
    through extra.configuration and runs the conversion for each of them,
    either for a single file or recursively.

    The process always ends through sys.exit: with 2 for command line
    errors, otherwise with 1 if extra.has_errors reports errors or 0 if not.
    """

    # in case the number of arguments
    # is not sufficient
    if len(sys.argv) < 2:
        # prints a series of messages about the command line
        # error that occurred and then exits with an error
        extra.echo("Invalid number of arguments")
        extra.echo("Usage: " + USAGE_MESSAGE)
        sys.exit(2)

    # sets the default values for the parameters
    path = sys.argv[1]
    recursive = False
    source_encoding = None
    target_encoding = None
    windows_newline = None
    replacements_list = None
    file_extensions = None
    file_exclusion = None
    configuration_file_path = None

    try:
        # the "u" flag (no value) must be part of the options string,
        # otherwise the documented -u option is rejected by getopt
        options, _arguments = getopt.getopt(sys.argv[2:], "rs:t:ux:e:w:c:", [])
    except getopt.GetoptError:
        # prints a series of messages about the command line
        # error that occurred and then exits with an error
        extra.echo("Invalid number of arguments")
        extra.echo("Usage: " + USAGE_MESSAGE)
        sys.exit(2)

    # iterates over all the options, retrieving the option
    # and the value for each
    for option, value in options:
        if option == "-r":
            recursive = True
        elif option == "-s":
            source_encoding = value
        elif option == "-t":
            target_encoding = value
        elif option == "-u":
            # the flag carries no value, it disables the windows newline
            windows_newline = False
        elif option == "-x":
            # the values are a flat "from, to, from, to, ..." sequence that
            # must be grouped in (from, to) pairs, as that's the structure
            # unpacked when the replacements are applied
            items = [item.strip() for item in value.split(",")]
            if len(items) % 2:
                extra.echo("Invalid number of arguments")
                extra.echo("Usage: " + USAGE_MESSAGE)
                sys.exit(2)
            replacements_list = list(zip(items[0::2], items[1::2]))
        elif option == "-e":
            file_extensions = [item.strip() for item in value.split(",")]
        elif option == "-w":
            file_exclusion = [item.strip() for item in value.split(",")]
        elif option == "-c":
            configuration_file_path = value

    # retrieves the configurations from the command line arguments
    configurations = extra.configuration(
        file_path = configuration_file_path,
        recursive = recursive,
        source_encoding = source_encoding,
        target_encoding = target_encoding,
        windows_newline = windows_newline,
        replacements_list = replacements_list,
        file_extensions = file_extensions,
        file_exclusion = file_exclusion
    )

    # iterates over all the configurations, executing them
    for configuration in configurations:
        # retrieves the configuration values
        recursive = configuration["recursive"]
        source_encoding = configuration["source_encoding"] or "utf-8"
        target_encoding = configuration["target_encoding"] or "utf-8"
        replacements_list = configuration["replacements_list"] or ()
        file_exclusion = configuration["file_exclusion"] or ()

        # NOTE(review): an empty extensions sequence makes the recursive
        # walker select no files (its filter is only skipped for none),
        # confirm that this is the intended default
        file_extensions = configuration["file_extensions"] or ()

        # only an unset (none) value defaults to the windows newline, an
        # explicit false value must be respected (an "or" based default
        # would always end up enabling it)
        windows_newline = configuration["windows_newline"]
        if windows_newline is None: windows_newline = True

        # in case the recursive flag is set must converts the files
        # using the recursive mode
        if recursive:
            convert_encoding_recursive(
                path,
                source_encoding,
                target_encoding,
                windows_newline,
                replacements_list,
                file_extensions,
                file_exclusion
            )
        # otherwise it's a "normal" iteration, must converts the
        # encoding (for only one file)
        else:
            convert_encoding(
                path,
                source_encoding,
                target_encoding,
                windows_newline,
                replacements_list
            )

    # verifies if there were messages printed to the standard
    # error output and if that's the case exits in error
    sys.exit(1 if extra.has_errors() else 0)
# runs the main function only when the file is executed as a script
if __name__ == "__main__":
    main()
else:
    # NOTE(review): when the module is imported an empty __path__ is
    # defined instead, presumably so that it's handled as a package with
    # no search locations, confirm the purpose before changing it
    __path__ = []