Skip to content

Commit

Permalink
new bom sequence removal
Browse files Browse the repository at this point in the history
  • Loading branch information
joamag committed Feb 24, 2016
1 parent bc7e4f5 commit 638d510
Showing 1 changed file with 16 additions and 6 deletions.
22 changes: 16 additions & 6 deletions src/admin_scripts/base/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
""" The usage message, to be printed when help is required
or when a command line error exists """

BOM_SEQUENCE = b"\xef\xbb\xbf"
""" The byte based sequence that defines the start of
a UTF-8 BOM (byte order mark) encoded text file """

def has_encoding(string_buffer, encoding):
"""
Determines if the provided buffer is encoded in the provided encoding.
Expand Down Expand Up @@ -139,6 +143,12 @@ def convert_encoding(
string_value = string_value.replace(b"\r\n", b"\n")
has_target_encoding = has_encoding(string_value, target_encoding)

# in case the retrieved string value starts with the BOM
# (byte order mark) sequence, it is removed, as the BOM is
# considered deprecated as a method of detecting UTF encoding
if string_value.startswith(BOM_SEQUENCE):
string_value = string_value[len(BOM_SEQUENCE):]

# decodes the string value from the specified source encoding, this
# operation may fail as the source encoding may only be a guess on
# the true encoding of the file, then encodes the string value again
Expand Down Expand Up @@ -213,9 +223,9 @@ def convert_encoding_walker(arguments, directory_name, names):
extra.echo(
"Convert encoding in file: %s (%s to %s)" %\
(
valid_complete_name,
source_encoding,
target_encoding
valid_complete_name,
source_encoding,
target_encoding
)
)

Expand All @@ -234,9 +244,9 @@ def convert_encoding_walker(arguments, directory_name, names):
extra.warn(
"Failed converting encoding in file: %s (%s to %s)" %\
(
valid_complete_name,
source_encoding,
target_encoding
valid_complete_name,
source_encoding,
target_encoding
)
)

Expand Down

0 comments on commit 638d510

Please sign in to comment.