Skip to content

Commit

Permalink
convert: add round trip check based on 'core.checkRoundtripEncoding'
Browse files Browse the repository at this point in the history
UTF supports lossless conversion round tripping and conversions between
UTF and other encodings are mostly round trip safe as Unicode aims to be
a superset of all other character encodings. However, certain encodings
(e.g. SHIFT-JIS) are known to have round trip issues [1].

Add 'core.checkRoundtripEncoding', which contains a comma separated
list of encodings, to define for what encodings Git should check the
conversion round trip if they are used in the 'working-tree-encoding'
attribute.

Set SHIFT-JIS as default value for 'core.checkRoundtripEncoding'.

[1] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
  • Loading branch information
larsxschneider committed Feb 24, 2018
1 parent 30f04bc commit 2758a2d
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Documentation/config.txt
Expand Up @@ -530,6 +530,12 @@ core.autocrlf::
This variable can be set to 'input',
in which case no output conversion is performed.

core.checkRoundtripEncoding::
A comma separated list of encodings that Git performs UTF-8 round
trip checks on if they are used in an `working-tree-encoding`
attribute (see linkgit:gitattributes[5]). The default value is
`SHIFT-JIS`.

core.symlinks::
If false, symbolic links are checked out as small plain files that
contain the link text. linkgit:git-update-index[1] and
Expand Down
8 changes: 8 additions & 0 deletions Documentation/gitattributes.txt
Expand Up @@ -310,6 +310,14 @@ number of pitfalls:
internal contents as UTF-8 and try to convert it to UTF-16 on checkout.
That operation will fail and cause an error.

- Reencoding content to non-UTF encodings can cause errors as the
conversion might not be UTF-8 round trip safe. If you suspect your
encoding to not be round trip safe, then add it to
`core.checkRoundtripEncoding` to make Git check the round trip
encoding (see linkgit:git-config[1]). SHIFT-JIS (Japanese character
set) is known to have round trip issues with UTF-8 and is checked by
default.

- Reencoding content requires resources that might slow down certain
Git operations (e.g 'git checkout' or 'git add').

Expand Down
5 changes: 5 additions & 0 deletions config.c
Expand Up @@ -1172,6 +1172,11 @@ static int git_default_core_config(const char *var, const char *value)
return 0;
}

if (!strcmp(var, "core.checkroundtripencoding")) {
check_roundtrip_encoding = xstrdup(value);
return 0;
}

if (!strcmp(var, "core.notesref")) {
notes_ref_name = xstrdup(value);
return 0;
Expand Down
74 changes: 74 additions & 0 deletions convert.c
Expand Up @@ -289,6 +289,39 @@ static void trace_encoding(const char *context, const char *path,
strbuf_release(&trace);
}

static int check_roundtrip(const char* enc_name)
{
/*
* check_roundtrip_encoding contains a string of space and/or
* comma separated encodings (eg. "UTF-16, ASCII, CP1125").
* Search for the given encoding in that string.
*/
const char *found = strcasestr(check_roundtrip_encoding, enc_name);
const char *next = found + strlen(enc_name);
int len = strlen(check_roundtrip_encoding);
return (found && (
/*
* check that the found encoding is at the
* beginning of check_roundtrip_encoding or
* that it is prefixed with a space or comma
*/
found == check_roundtrip_encoding || (
found > check_roundtrip_encoding &&
(*(found-1) == ' ' || *(found-1) == ',')
)
) && (
/*
* check that the found encoding is at the
* end of check_roundtrip_encoding or
* that it is suffixed with a space or comma
*/
next == check_roundtrip_encoding + len || (
next < check_roundtrip_encoding + len &&
(*next == ' ' || *next == ',')
)
));
}

static struct encoding {
const char *name;
struct encoding *next;
Expand Down Expand Up @@ -366,6 +399,47 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
}
trace_encoding("destination", path, default_encoding, dst, dst_len);

/*
* UTF supports lossless conversion round tripping [1] and conversions
* between UTF and other encodings are mostly round trip safe as
* Unicode aims to be a superset of all other character encodings.
* However, certain encodings (e.g. SHIFT-JIS) are known to have round
* trip issues [2]. Check the round trip conversion for all encodings
* listed in core.checkRoundtripEncoding.
*
* The round trip check is only performed if content is written to Git.
* This ensures that no information is lost during conversion to/from
* the internal UTF-8 representation.
*
* Please note, the code below is not tested because I was not able to
* generate a faulty round trip without an iconv error. Iconv errors
* are already caught above.
*
* [1] http://unicode.org/faq/utf_bom.html#gen2
* [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
*/
if ((conv_flags & CONV_WRITE_OBJECT) && check_roundtrip(enc->name)) {
char *re_src;
int re_src_len;

re_src = reencode_string_len(dst, dst_len,
enc->name, default_encoding,
&re_src_len);

trace_printf("Checking roundtrip encoding for %s...\n", enc->name);
trace_encoding("reencoded source", path, enc->name,
re_src, re_src_len);

if (!re_src || src_len != re_src_len ||
memcmp(src, re_src, src_len)) {
const char* msg = _("encoding '%s' from %s to %s and "
"back is not the same");
die(msg, path, enc->name, default_encoding);
}

free(re_src);
}

strbuf_attach(buf, dst, dst_len, dst_len + 1);
return 1;
}
Expand Down
1 change: 1 addition & 0 deletions convert.h
Expand Up @@ -56,6 +56,7 @@ struct delayed_checkout {
};

extern enum eol core_eol;
extern char *check_roundtrip_encoding;
extern const char *get_cached_convert_stats_ascii(const struct index_state *istate,
const char *path);
extern const char *get_wt_convert_stats_ascii(const char *path);
Expand Down
1 change: 1 addition & 0 deletions environment.c
Expand Up @@ -50,6 +50,7 @@ int check_replace_refs = 1;
char *git_replace_ref_base;
enum eol core_eol = EOL_UNSET;
int global_conv_flags_eol = CONV_EOL_RNDTRP_WARN;
char *check_roundtrip_encoding = "SHIFT-JIS";
unsigned whitespace_rule_cfg = WS_DEFAULT_RULE;
enum branch_track git_branch_track = BRANCH_TRACK_REMOTE;
enum rebase_setup_type autorebase = AUTOREBASE_NEVER;
Expand Down
41 changes: 41 additions & 0 deletions t/t0028-working-tree-encoding.sh
Expand Up @@ -225,4 +225,45 @@ test_expect_success 'error if encoding garbage is already in Git' '
git reset --hard $BEFORE_STATE
'

test_expect_success 'check roundtrip encoding' '
text="hallo there!\nroundtrip test here!" &&
printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
# SHIFT-JIS encoded files are round-trip checked by default...
GIT_TRACE=1 git add .gitattributes roundtrip.shift 2>&1 >/dev/null |
grep "Checking roundtrip encoding for SHIFT-JIS" &&
git reset &&
# ... unless we overwrite the Git config!
test_config core.checkRoundtripEncoding "garbage" &&
! GIT_TRACE=1 git add .gitattributes roundtrip.shift 2>&1 >/dev/null |
grep "Checking roundtrip encoding for SHIFT-JIS" &&
test_unconfig core.checkRoundtripEncoding &&
git reset &&
# UTF-16 encoded files should not be round-trip checked by default...
! GIT_TRACE=1 git add roundtrip.utf16 2>&1 >/dev/null |
grep "Checking roundtrip encoding for UTF-16" &&
git reset &&
# ... unless we tell Git to check it!
test_config_global core.checkRoundtripEncoding "UTF-16, UTF-32" &&
GIT_TRACE=1 git add roundtrip.utf16 2>&1 >/dev/null |
grep "Checking roundtrip encoding for UTF-16" &&
git reset &&
# ... unless we tell Git to check it!
# (here we also check that the casing of the encoding is irrelevant)
test_config_global core.checkRoundtripEncoding "UTF-32, utf-16" &&
GIT_TRACE=1 git add roundtrip.utf16 2>&1 >/dev/null |
grep "Checking roundtrip encoding for UTF-16" &&
git reset &&
# cleanup
rm roundtrip.shift roundtrip.utf16 &&
git reset --hard HEAD
'

test_done

0 comments on commit 2758a2d

Please sign in to comment.