Skip to content

Commit

Permalink
initial
Browse files Browse the repository at this point in the history
  • Loading branch information
jozef committed Jan 26, 2010
0 parents commit cf12a53
Show file tree
Hide file tree
Showing 16 changed files with 5,413 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .gitignore
@@ -0,0 +1,16 @@
/Makefile
/blib
/pm_to_blib
/META.yml
/MYMETA.yml
/Makefile.PL
/Build
/_build
/*.tar.gz
/MANIFEST.bak
/MANIFEST.SKIP.bak
/cover_db
/tmp/*
/XML-Char-*
/lib/XML/Char.c
/lib/XML/Char.o
31 changes: 31 additions & 0 deletions Build.PL
@@ -0,0 +1,31 @@
use strict;
use warnings;
use Module::Build;
use File::Spec;

my $builder = Module::Build->new(
module_name => 'XML::Char',
license => 'perl',
dist_author => q{Jozef Kutej <jkutej@cpan.org>},
dist_version_from => 'lib/XML/Char.pm',
build_requires => {
'Test::More' => 0,
},
add_to_cleanup => [
'XML-Char-*',
File::Spec->catfile('lib', 'XML', 'Char.c'),
File::Spec->catfile('lib', 'XML', 'Char.o')
],
create_makefile_pl => 'traditional',
create_readme => 1,
sign => 1,
meta_merge => {
resources => {
repository => 'git://github.com/jozef/XML-Char.git',
bugtracker => 'http://github.com/jozef/XML-Char/issues',
},
keywords => [ qw/ xml characters utf8 utf-8 valid invalid / ],
},
);

$builder->create_build_script();
5 changes: 5 additions & 0 deletions Changes
@@ -0,0 +1,5 @@
Revision history for UTF8-Char

0.01 26 Jan 2010
First version, released on an unsuspecting world.

16 changes: 16 additions & 0 deletions MANIFEST
@@ -0,0 +1,16 @@
Build.PL
Changes
lib/XML/Char.h
lib/XML/Char.pm
lib/XML/Char.xs
lib/XML/ppport.h
Makefile.PL
MANIFEST This list of files
META.yml
README
t/01_XML-Char.t
t/distribution.t
t/fixme.t
t/pod-coverage.t
t/pod-spell.t
t/pod.t
55 changes: 55 additions & 0 deletions MANIFEST.SKIP
@@ -0,0 +1,55 @@

#!start included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP
# Avoid version control files.
\bRCS\b
\bCVS\b
\bSCCS\b
,v$
\B\.svn\b
\B\.git\b
\B\.gitignore\b
\b_darcs\b

# Avoid Makemaker generated and utility files.
\bMANIFEST\.bak
\bMakefile$
\bblib/
\bMakeMaker-\d
\bpm_to_blib\.ts$
\bpm_to_blib$
\bblibdirs\.ts$ # 6.18 through 6.25 generated this

# Avoid Module::Build generated and utility files.
\bBuild$
\b_build/

# Avoid temp and backup files.
~$
\.old$
\#$
\b\.#
\.bak$

# Avoid Devel::Cover files.
\bcover_db\b
#!end included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP

# Avoid configuration metadata file
^MYMETA\.

# Avoid Module::Build generated and utility files.
\bBuild$
\bBuild.bat$
\b_build
\bBuild.COM$
\bBUILD.COM$
\bbuild.com$
^MANIFEST\.SKIP

# Avoid archives of this distribution
\bXML-Char-[\d\.\_]+

^lib/XML/Char.c
^lib/XML/Char.o

^tmp/*
45 changes: 45 additions & 0 deletions README
@@ -0,0 +1,45 @@
NAME
XML::Char - validate characters for XML

SYNOPSIS
use XML::Char;
if (not XML::Char->valid("bell ".chr(7))) {
die 'no way to store this string directly to XML';
}

use XML::Char;
if (XML::Char->valid("UTF8 je pořádný peklo")) {
print "fuf, we are fine\n";
}

DESCRIPTION
For me it was kind of a surprised to learn that `char(0)' is a valid
UTF-8 character. All of the 0-0x7F are...

Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them

Well of course not, now I know :-)

http://www.w3.org/TR/REC-xml/#charsets defines which characters XML
processors MUST accept:

[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */

This module validates if a given string meets this criteria. In addition
the string has to be a Perl UTF-8 string (`is_utf8_string()' - see
perlapi).

valid($value)
Returns true or false if `$value' consists of valid UTF-8 XML
characters.

AUTHOR
Jozef Kutej

COPYRIGHT
Copyright 2009 Jozef Kutej, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

3 changes: 3 additions & 0 deletions lib/XML/Char.h
@@ -0,0 +1,3 @@
// ranges has to be sorted to work properly
const unsigned long xml_ranges_from[] = {0x9, 0xA, 0xD, 0x20, 0xE000, 0x10000, 0};
const unsigned long xml_ranges_to[] = {0x9, 0xA, 0xD, 0xD7FF, 0xFFFD, 0x1FFFF, 0};
96 changes: 96 additions & 0 deletions lib/XML/Char.pm
@@ -0,0 +1,96 @@
package XML::Char;

use utf8;

=head1 NAME
XML::Char - validate characters for XML
=head1 SYNOPSIS
use XML::Char;
if (not XML::Char->valid("bell ".chr(7))) {
die 'no way to store this string directly to XML';
}
use XML::Char;
if (XML::Char->valid("UTF8 je pořádný peklo")) {
print "fuf, we are fine\n";
}
=head1 DESCRIPTION
For me it was kind of a surprised to learn that C<char(0)> is a valid UTF-8
character. All of the 0-0x7F are...
Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them
Well of course not, now I know :-)
L<http://www.w3.org/TR/REC-xml/#charsets> defines which characters XML processors MUST accept:
[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
This module validates if a given string meets this criteria. In addition
the string has to be a Perl UTF-8 string (C<is_utf8_string()> - see L<perlapi/Unicode-Support>).
=cut

use 5.006;
use strict;
use warnings;

our $VERSION = '0.01';

use parent qw(DynaLoader);

use Exporter 'import';
our @EXPORT_OK = qw(
perlapi_is_utf8_string
);

__PACKAGE__->bootstrap;

=head2 valid($value)
Returns true or false if C<$value> consists of valid UTF-8 XML characters.
=cut

sub valid {
my ($self, $value);
if (@_ < 2) {
$self = __PACKAGE__;
$value = shift @_;
}
else {
$self = shift @_;
$value = shift @_;
}

die 'bad usage'
if not eval { $self->can('valid') };

# undef is valid
return 1
if not defined $value;

return _valid_xml_string($value);
}

1;

__END__
=head1 AUTHOR
Jozef Kutej
=head1 COPYRIGHT
Copyright 2009 Jozef Kutej, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
68 changes: 68 additions & 0 deletions lib/XML/Char.xs
@@ -0,0 +1,68 @@
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

#include "ppport.h"
#include "Char.h"


MODULE = XML::Char PACKAGE = XML::Char

int
_valid_xml_string(string)
SV* string;

PREINIT:
STRLEN len;
U8 * bytes;
int in_range;
int range_index;

STRLEN ret_len;
UV uniuv;
CODE:
bytes = (U8*)SvPV(string, len);
if (!is_utf8_string(bytes, len)) {
// warn("no utf8\n");

RETVAL = 0;
}
else {
// by default return true (ex. empty string)
RETVAL = 1;

// loop through all UTF-8 characters and make sure they are in allowed ranges
while (len > 0) {
// get unicode character value
uniuv = utf8_to_uvuni(bytes, &ret_len);
// warn("code: 0x%X len: %d\n", uniuv, ret_len);
bytes += ret_len;
len -= ret_len;

// loop through allowed ranges and check if the character is in any of them
range_index = 0;
in_range = 0;
while (xml_ranges_from[range_index] != 0) {
// rangers are sorted so if the unicode value is smaller than current range_from then it is not in any range
if (uniuv < xml_ranges_from[range_index]) {
break;
}
// if the unicode value fall in this range it's valid
if ((uniuv >= xml_ranges_from[range_index]) && (uniuv <= xml_ranges_to[range_index])) {
// in the range
in_range = 1;
break;
}
range_index++;
}

// if the current character is not in allowed ranges return false
if (!in_range) {
RETVAL = 0;
break;
}
}
}

OUTPUT:
RETVAL

0 comments on commit cf12a53

Please sign in to comment.