Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit cf12a53
Showing
16 changed files
with
5,413 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,16 @@ | |||
/Makefile | |||
/blib | |||
/pm_to_blib | |||
/META.yml | |||
/MYMETA.yml | |||
/Makefile.PL | |||
/Build | |||
/_build | |||
/*.tar.gz | |||
/MANIFEST.bak | |||
/MANIFEST.SKIP.bak | |||
/cover_db | |||
/tmp/* | |||
/XML-Char-* | |||
/lib/XML/Char.c | |||
/lib/XML/Char.o |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,31 @@ | |||
use strict; | |||
use warnings; | |||
use Module::Build; | |||
use File::Spec; | |||
|
|||
my $builder = Module::Build->new( | |||
module_name => 'XML::Char', | |||
license => 'perl', | |||
dist_author => q{Jozef Kutej <jkutej@cpan.org>}, | |||
dist_version_from => 'lib/XML/Char.pm', | |||
build_requires => { | |||
'Test::More' => 0, | |||
}, | |||
add_to_cleanup => [ | |||
'XML-Char-*', | |||
File::Spec->catfile('lib', 'XML', 'Char.c'), | |||
File::Spec->catfile('lib', 'XML', 'Char.o') | |||
], | |||
create_makefile_pl => 'traditional', | |||
create_readme => 1, | |||
sign => 1, | |||
meta_merge => { | |||
resources => { | |||
repository => 'git://github.com/jozef/XML-Char.git', | |||
bugtracker => 'http://github.com/jozef/XML-Char/issues', | |||
}, | |||
keywords => [ qw/ xml characters utf8 utf-8 valid invalid / ], | |||
}, | |||
); | |||
|
|||
$builder->create_build_script(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,5 @@ | |||
Revision history for UTF8-Char | |||
|
|||
0.01 26 Jan 2010 | |||
First version, released on an unsuspecting world. | |||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,16 @@ | |||
Build.PL | |||
Changes | |||
lib/XML/Char.h | |||
lib/XML/Char.pm | |||
lib/XML/Char.xs | |||
lib/XML/ppport.h | |||
Makefile.PL | |||
MANIFEST This list of files | |||
META.yml | |||
README | |||
t/01_XML-Char.t | |||
t/distribution.t | |||
t/fixme.t | |||
t/pod-coverage.t | |||
t/pod-spell.t | |||
t/pod.t |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,55 @@ | |||
|
|||
#!start included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP | |||
# Avoid version control files. | |||
\bRCS\b | |||
\bCVS\b | |||
\bSCCS\b | |||
,v$ | |||
\B\.svn\b | |||
\B\.git\b | |||
\B\.gitignore\b | |||
\b_darcs\b | |||
|
|||
# Avoid Makemaker generated and utility files. | |||
\bMANIFEST\.bak | |||
\bMakefile$ | |||
\bblib/ | |||
\bMakeMaker-\d | |||
\bpm_to_blib\.ts$ | |||
\bpm_to_blib$ | |||
\bblibdirs\.ts$ # 6.18 through 6.25 generated this | |||
|
|||
# Avoid Module::Build generated and utility files. | |||
\bBuild$ | |||
\b_build/ | |||
|
|||
# Avoid temp and backup files. | |||
~$ | |||
\.old$ | |||
\#$ | |||
\b\.# | |||
\.bak$ | |||
|
|||
# Avoid Devel::Cover files. | |||
\bcover_db\b | |||
#!end included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP | |||
|
|||
# Avoid configuration metadata file | |||
^MYMETA\. | |||
|
|||
# Avoid Module::Build generated and utility files. | |||
\bBuild$ | |||
\bBuild.bat$ | |||
\b_build | |||
\bBuild.COM$ | |||
\bBUILD.COM$ | |||
\bbuild.com$ | |||
^MANIFEST\.SKIP | |||
|
|||
# Avoid archives of this distribution | |||
\bXML-Char-[\d\.\_]+ | |||
|
|||
^lib/XML/Char.c | |||
^lib/XML/Char.o | |||
|
|||
^tmp/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,45 @@ | |||
NAME | |||
XML::Char - validate characters for XML | |||
|
|||
SYNOPSIS | |||
use XML::Char; | |||
if (not XML::Char->valid("bell ".chr(7))) { | |||
die 'no way to store this string directly to XML'; | |||
} | |||
|
|||
use XML::Char; | |||
if (XML::Char->valid("UTF8 je pořádný peklo")) { | |||
print "fuf, we are fine\n"; | |||
} | |||
|
|||
DESCRIPTION | |||
For me it was kind of a surprised to learn that `char(0)' is a valid | |||
UTF-8 character. All of the 0-0x7F are... | |||
|
|||
Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them | |||
|
|||
Well of course not, now I know :-) | |||
|
|||
http://www.w3.org/TR/REC-xml/#charsets defines which characters XML | |||
processors MUST accept: | |||
|
|||
[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] | |||
/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ | |||
|
|||
This module validates if a given string meets this criteria. In addition | |||
the string has to be a Perl UTF-8 string (`is_utf8_string()' - see | |||
perlapi). | |||
|
|||
valid($value) | |||
Returns true or false if `$value' consists of valid UTF-8 XML | |||
characters. | |||
|
|||
AUTHOR | |||
Jozef Kutej | |||
|
|||
COPYRIGHT | |||
Copyright 2009 Jozef Kutej, all rights reserved. | |||
|
|||
This program is free software; you can redistribute it and/or modify it | |||
under the same terms as Perl itself. | |||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,3 @@ | |||
// ranges has to be sorted to work properly | |||
const unsigned long xml_ranges_from[] = {0x9, 0xA, 0xD, 0x20, 0xE000, 0x10000, 0}; | |||
const unsigned long xml_ranges_to[] = {0x9, 0xA, 0xD, 0xD7FF, 0xFFFD, 0x1FFFF, 0}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,96 @@ | |||
package XML::Char; | |||
|
|||
use utf8; | |||
|
|||
=head1 NAME | |||
XML::Char - validate characters for XML | |||
=head1 SYNOPSIS | |||
use XML::Char; | |||
if (not XML::Char->valid("bell ".chr(7))) { | |||
die 'no way to store this string directly to XML'; | |||
} | |||
use XML::Char; | |||
if (XML::Char->valid("UTF8 je pořádný peklo")) { | |||
print "fuf, we are fine\n"; | |||
} | |||
=head1 DESCRIPTION | |||
For me it was kind of a surprised to learn that C<char(0)> is a valid UTF-8 | |||
character. All of the 0-0x7F are... | |||
Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them | |||
Well of course not, now I know :-) | |||
L<http://www.w3.org/TR/REC-xml/#charsets> defines which characters XML processors MUST accept: | |||
[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] | |||
/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ | |||
This module validates if a given string meets this criteria. In addition | |||
the string has to be a Perl UTF-8 string (C<is_utf8_string()> - see L<perlapi/Unicode-Support>). | |||
=cut | |||
|
|||
use 5.006; | |||
use strict; | |||
use warnings; | |||
|
|||
our $VERSION = '0.01'; | |||
|
|||
use parent qw(DynaLoader); | |||
|
|||
use Exporter 'import'; | |||
our @EXPORT_OK = qw( | |||
perlapi_is_utf8_string | |||
); | |||
|
|||
__PACKAGE__->bootstrap; | |||
|
|||
=head2 valid($value) | |||
Returns true or false if C<$value> consists of valid UTF-8 XML characters. | |||
=cut | |||
|
|||
sub valid { | |||
my ($self, $value); | |||
if (@_ < 2) { | |||
$self = __PACKAGE__; | |||
$value = shift @_; | |||
} | |||
else { | |||
$self = shift @_; | |||
$value = shift @_; | |||
} | |||
|
|||
die 'bad usage' | |||
if not eval { $self->can('valid') }; | |||
|
|||
# undef is valid | |||
return 1 | |||
if not defined $value; | |||
|
|||
return _valid_xml_string($value); | |||
} | |||
|
|||
1; | |||
|
|||
__END__ | |||
=head1 AUTHOR | |||
Jozef Kutej | |||
=head1 COPYRIGHT | |||
Copyright 2009 Jozef Kutej, all rights reserved. | |||
This program is free software; you can redistribute it and/or modify it | |||
under the same terms as Perl itself. | |||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,68 @@ | |||
#include "EXTERN.h" | |||
#include "perl.h" | |||
#include "XSUB.h" | |||
|
|||
#include "ppport.h" | |||
#include "Char.h" | |||
|
|||
|
|||
MODULE = XML::Char PACKAGE = XML::Char | |||
|
|||
int | |||
_valid_xml_string(string) | |||
SV* string; | |||
|
|||
PREINIT: | |||
STRLEN len; | |||
U8 * bytes; | |||
int in_range; | |||
int range_index; | |||
|
|||
STRLEN ret_len; | |||
UV uniuv; | |||
CODE: | |||
bytes = (U8*)SvPV(string, len); | |||
if (!is_utf8_string(bytes, len)) { | |||
// warn("no utf8\n"); | |||
|
|||
RETVAL = 0; | |||
} | |||
else { | |||
// by default return true (ex. empty string) | |||
RETVAL = 1; | |||
|
|||
// loop through all UTF-8 characters and make sure they are in allowed ranges | |||
while (len > 0) { | |||
// get unicode character value | |||
uniuv = utf8_to_uvuni(bytes, &ret_len); | |||
// warn("code: 0x%X len: %d\n", uniuv, ret_len); | |||
bytes += ret_len; | |||
len -= ret_len; | |||
|
|||
// loop through allowed ranges and check if the character is in any of them | |||
range_index = 0; | |||
in_range = 0; | |||
while (xml_ranges_from[range_index] != 0) { | |||
// rangers are sorted so if the unicode value is smaller than current range_from then it is not in any range | |||
if (uniuv < xml_ranges_from[range_index]) { | |||
break; | |||
} | |||
// if the unicode value fall in this range it's valid | |||
if ((uniuv >= xml_ranges_from[range_index]) && (uniuv <= xml_ranges_to[range_index])) { | |||
// in the range | |||
in_range = 1; | |||
break; | |||
} | |||
range_index++; | |||
} | |||
|
|||
// if the current character is not in allowed ranges return false | |||
if (!in_range) { | |||
RETVAL = 0; | |||
break; | |||
} | |||
} | |||
} | |||
|
|||
OUTPUT: | |||
RETVAL |
Oops, something went wrong.