Skip to content
Permalink
Browse files

Add support for 4-byte UTF-8 characters and stricter character checking

  • Loading branch information...
klondi committed Nov 24, 2014
1 parent 05edfa4 commit 8a7e892aeb519986d26b22ac1f8d95c58d94ae55
Showing with 22 additions and 2 deletions.
  1. +22 −2 src/util/misc.c
@@ -63,7 +63,7 @@ char* strip_white_space(char* string)
return string;
}

static int is_valid_utf8_str(const char* string, size_t length)
static int is_valid_utf8_str(const unsigned char* string, size_t length)
{
int expect = 0;
char div = 0;
@@ -82,12 +82,32 @@ static int is_valid_utf8_str(const char* string, size_t length)
{
if (string[pos] & 0x80)
{
for (div = 0x40; div > 0x10; div /= 2)
for (div = 0x40; div > 0x08; div /= 2)
{
if (string[pos] & div) expect++;
else break;
}
if ((string[pos] & div) || (pos+expect >= length)) return 0;
switch (expect) {
case 0:
return 0;
case 1:
/* Out of range */
if (string[pos] < 0xC2) return 0;
break;
case 2:
/* Out of range */
if ((string[pos] == 0xE0) && (string[pos+1] < 0xA0 )) return 0;
/* Surrogates */
if ((string[pos] == 0xED) && (string[pos+1] > 0x9F )) return 0;
break;
case 3:
/* Out of range */
if ((string[pos] == 0xF0) && (string[pos+1] < 0x90 )) return 0;
if (string[pos] > 0xF4) return 0;
if ((string[pos] == 0xF4) && (string[pos+1] > 0x8F )) return 0;
break;
}
}
}
}

0 comments on commit 8a7e892

Please sign in to comment.
You can’t perform that action at this time.