Skip to content

Commit

Permalink
02_utf8_flag.t, remove Char.h, and replace Char.xs with the following…
Browse files Browse the repository at this point in the history
  • Loading branch information
ap committed Jan 29, 2010
1 parent 50adeb9 commit 0b52a4c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 53 deletions.
3 changes: 0 additions & 3 deletions lib/XML/Char.h

This file was deleted.

79 changes: 29 additions & 50 deletions lib/XML/Char.xs
Expand Up @@ -3,66 +3,45 @@
#include "XSUB.h"

#include "ppport.h"
#include "Char.h"

/*
 * Read one character from a string whose octets are the characters
 * themselves (i.e. the SV does not carry the UTF8 flag).
 *
 * The signature mirrors utf8_to_uvuni() so that either function can be
 * called through the same function pointer.
 *
 *   s       - pointer to the current octet
 *   retlen  - out: number of octets consumed, always 1
 *
 * Returns the octet's value widened to UV.
 */
static UV
octet_to_uvuni(const U8 *s, STRLEN *retlen)
{
    UV octet_value;

    retlen[0] = 1;
    octet_value = (UV) s[0];

    return octet_value;
}

MODULE = XML::Char PACKAGE = XML::Char
MODULE = XML::Char PACKAGE = XML::Char

int
void
_valid_xml_string(string)
SV* string;

PREINIT:
STRLEN len;
U8 * bytes;
int in_range;
int range_index;

STRLEN ret_len;
UV uniuv;
CODE:
bytes = (U8*)SvPV(string, len);
if (!is_utf8_string(bytes, len)) {
// warn("no utf8\n");

RETVAL = 0;
}
else {
// by default return true (ex. empty string)
RETVAL = 1;

// loop through all UTF-8 characters and make sure they are in allowed ranges
while (len > 0) {
// get unicode character value
uniuv = utf8_to_uvuni(bytes, &ret_len);
// warn("code: 0x%X len: %d\n", uniuv, ret_len);
bytes += ret_len;
len -= ret_len;

// loop through allowed ranges and check if the character is in any of them
range_index = 0;
in_range = 0;
while (xml_ranges_from[range_index] != 0) {
// rangers are sorted so if the unicode value is smaller than current range_from then it is not in any range
if (uniuv < xml_ranges_from[range_index]) {
break;
}
// if the unicode value fall in this range it's valid
if ((uniuv >= xml_ranges_from[range_index]) && (uniuv <= xml_ranges_to[range_index])) {
// in the range
in_range = 1;
break;
}
range_index++;
}

// if the current character is not in allowed ranges return false
if (!in_range) {
RETVAL = 0;
break;
}
}
UV (*next_chr)(const U8 *s, STRLEN *retlen);

PPCODE:
    /*
     * Returns true when every character of `string` matches the XML 1.0
     * "Char" production, false otherwise:
     *
     *   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF]
     *          | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     *
     * An empty string falls through the loop and is reported as valid.
     */
    bytes = (U8*)SvPV(string, len);

    /* With the UTF8 flag the buffer is decoded as UTF-8; without it each
     * octet is taken as one character. */
    next_chr = SvUTF8(string) ? &utf8_to_uvuni : &octet_to_uvuni;

    while (len > 0) {
        uniuv = (*next_chr)(bytes, &ret_len);

        /* Never step past the end of the buffer: a malformed or truncated
         * sequence can make the decoder report a length that is zero,
         * (STRLEN)-1, or longer than what is left.  Since len is unsigned,
         * subtracting such a value would wrap around and keep the loop
         * reading beyond the string. */
        if ((ret_len == 0) || (ret_len > len))
            XSRETURN_NO;

        bytes += ret_len;
        len -= ret_len;

        if (
            /* C0 controls other than TAB, LF and CR */
            ((uniuv < 0x20) && (uniuv != 0x9) && (uniuv != 0xA) && (uniuv != 0xD))
            /* UTF-16 surrogate block */
            || ((uniuv > 0xD7FF) && (uniuv < 0xE000))
            /* U+FFFE and U+FFFF */
            || ((uniuv > 0xFFFD) && (uniuv < 0x10000))
            /* beyond the last Unicode code point allowed by XML 1.0 */
            || (uniuv > 0x10FFFF)
        ) XSRETURN_NO;
    }

OUTPUT:
RETVAL

XSRETURN_YES;
14 changes: 14 additions & 0 deletions t/02_utf8_flag.t
@@ -0,0 +1,14 @@
#!/usr/bin/perl

# Verifies that XML::Char->valid gives the same verdict for U+00C3 whether
# or not the string carries Perl's internal UTF8 flag.

use strict;
use warnings;
use utf8 ();

use Test::More tests => 2;
use XML::Char;

# A one-character string written as a byte escape, so it starts out
# without the UTF8 flag.
my $a_tilde = "\xC3";

is(
    XML::Char->valid($a_tilde),
    !!1,
    "accept U+00C3 with UTF8 flag off",
);

# Same character, but now stored internally as UTF-8 with the flag set.
utf8::upgrade($a_tilde);

is(
    XML::Char->valid($a_tilde),
    !!1,
    "accept U+00C3 with UTF8 flag on",
);

0 comments on commit 0b52a4c

Please sign in to comment.