Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
177 lines (158 sloc) 4.77 KB
/*
* The UTF-8 functions were Copied from
* https://github.com/rustyrussell/ccan/blob/master/ccan/charset/charset.c
*
* Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
* All rights reserved.
*
* LICENCE: BSD-MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
* Validate a single UTF-8 character.
* @s: Beginning of UTF-8 character.
* @e: End of string.
*
* If it's valid, return its length (1 thru 4).
* If it's invalid or clipped, return 0.
*
* This function implements the syntax given in RFC3629, which is
* the same as that given in The Unicode Standard, Version 6.0.
*
* It has the following properties:
*
* * All codepoints U+0000..U+10FFFF may be encoded,
* except for U+D800..U+DFFF, which are reserved
* for UTF-16 surrogate pair encoding.
* * UTF-8 byte sequences longer than 4 bytes are not permitted,
* as they exceed the range of Unicode.
* * The sixty-six Unicode "non-characters" are permitted
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
*/
int utf8_validate_char(const char *s, const char *e)
{
	const unsigned char *p = (const unsigned char *)s;
	const unsigned char *end = (const unsigned char *)e;
	unsigned char lead;
	/* Permitted range of the first continuation byte; narrowed below
	 * for the lead bytes that need overlong/surrogate/range checks. */
	unsigned char lo = 0x80;
	unsigned char hi = 0xBF;
	int len;
	int i;

	/* An empty range holds no character: never read at or past @e. */
	if (p >= end)
		return 0;

	lead = *p++;
	if (lead <= 0x7F)			/* 00..7F */
		return 1;
	if (lead <= 0xC1)			/* 80..C1 */
		/* Stray continuation byte or overlong 2-byte sequence. */
		return 0;

	if (lead <= 0xDF) {			/* C2..DF */
		len = 2;
	} else if (lead <= 0xEF) {		/* E0..EF */
		len = 3;
		if (lead == 0xE0)
			lo = 0xA0;		/* Disallow overlong 3-byte sequence. */
		else if (lead == 0xED)
			hi = 0x9F;		/* Disallow U+D800..U+DFFF. */
	} else if (lead <= 0xF4) {		/* F0..F4 */
		len = 4;
		if (lead == 0xF0)
			lo = 0x90;		/* Disallow overlong 4-byte sequence. */
		else if (lead == 0xF4)
			hi = 0x8F;		/* Disallow codepoints beyond U+10FFFF. */
	} else {				/* F5..FF */
		return 0;
	}

	/* Make sure the character isn't clipped. */
	if (end - p < len - 1)
		return 0;

	/* The first continuation byte must lie in the (possibly narrowed) range. */
	if (*p < lo || *p > hi)
		return 0;
	p++;

	/* Any remaining bytes must be in the range 0x80..0xBF. */
	for (i = 2; i < len; i++) {
		if ((*p++ & 0xC0) != 0x80)
			return 0;
	}

	return len;
}
/*
 * Validate the given UTF-8 string.
 * If it contains '\0' characters, it is still valid.
 *
 * @str: Beginning of the buffer to check.
 * @length: Number of bytes in the buffer.
 *
 * Return true if the whole buffer consists of valid, unclipped
 * UTF-8 characters, and false otherwise.
 */
bool utf8_validate(const char *str, size_t length)
{
	const char *const end = str + length;
	const char *cursor = str;

	/* Walk the buffer one encoded character at a time. */
	while (cursor < end) {
		int char_len = utf8_validate_char(cursor, end);

		/* Zero means the character at the cursor is invalid or clipped. */
		if (!char_len)
			return false;

		cursor += char_len;
	}

	/* Every accepted character fit inside the buffer, so we stop exactly at its end. */
	assert(cursor == end);
	return true;
}
/*
 * Check that every entry name in the directory given as argv[1] is
 * valid UTF-8.
 *
 * Exit status is 0 when all names validate. It is 1 on a usage error,
 * when the directory cannot be opened or read, or when a name is not
 * valid UTF-8 (checking stops at the first such name).
 */
int main(int argc, char **argv)
{
	struct dirent *entry;
	DIR *dp;
	int ret = 1;

	if (argc < 2) {
		fprintf(stderr, "usage %s <directory>\n", argv[0]);
		exit(1);
	}

	dp = opendir(argv[1]);
	if (!dp) {
		fprintf(stderr, "fatal: cannot open directory '%s'\n", argv[1]);
		exit(1);
	}

	for (;;) {
		/*
		 * readdir() returns NULL both at end-of-directory and on
		 * error; only errno tells the two apart, so clear it first.
		 */
		errno = 0;
		entry = readdir(dp);
		if (!entry)
			break;

		if (!utf8_validate(entry->d_name, strlen(entry->d_name))) {
			fprintf(stderr, "fatal: filename '%s' is not valid UTF-8\n",
				entry->d_name);
			goto out;
		}
	}

	/* A read error must not be reported as a clean directory. */
	if (errno != 0) {
		fprintf(stderr, "fatal: cannot read directory '%s': %s\n",
			argv[1], strerror(errno));
		goto out;
	}

	ret = 0;
out:
	closedir(dp);
	return ret;
}
You can’t perform that action at this time.