Permalink
Browse files

initial

  • Loading branch information...
0 parents commit cf12a53ec06b5434c82d007fe4b8f5d406daa422 @jozef committed Jan 26, 2010
Showing with 5,413 additions and 0 deletions.
  1. +16 −0 .gitignore
  2. +31 −0 Build.PL
  3. +5 −0 Changes
  4. +16 −0 MANIFEST
  5. +55 −0 MANIFEST.SKIP
  6. +45 −0 README
  7. +3 −0 lib/XML/Char.h
  8. +96 −0 lib/XML/Char.pm
  9. +68 −0 lib/XML/Char.xs
  10. +4,954 −0 lib/XML/ppport.h
  11. +50 −0 t/01_XML-Char.t
  12. +4 −0 t/distribution.t
  13. +9 −0 t/fixme.t
  14. +18 −0 t/pod-coverage.t
  15. +31 −0 t/pod-spell.t
  16. +12 −0 t/pod.t
@@ -0,0 +1,16 @@
+/Makefile
+/blib
+/pm_to_blib
+/META.yml
+/MYMETA.yml
+/Makefile.PL
+/Build
+/_build
+/*.tar.gz
+/MANIFEST.bak
+/MANIFEST.SKIP.bak
+/cover_db
+/tmp/*
+/XML-Char-*
+/lib/XML/Char.c
+/lib/XML/Char.o
@@ -0,0 +1,31 @@
+use strict;
+use warnings;
+use Module::Build;
+use File::Spec;
+
+my $builder = Module::Build->new(
+ module_name => 'XML::Char',
+ license => 'perl',
+ dist_author => q{Jozef Kutej <jkutej@cpan.org>},
+ dist_version_from => 'lib/XML/Char.pm',
+ build_requires => {
+ 'Test::More' => 0,
+ },
+ add_to_cleanup => [
+ 'XML-Char-*',
+ File::Spec->catfile('lib', 'XML', 'Char.c'),
+ File::Spec->catfile('lib', 'XML', 'Char.o')
+ ],
+ create_makefile_pl => 'traditional',
+ create_readme => 1,
+ sign => 1,
+ meta_merge => {
+ resources => {
+ repository => 'git://github.com/jozef/XML-Char.git',
+ bugtracker => 'http://github.com/jozef/XML-Char/issues',
+ },
+ keywords => [ qw/ xml characters utf8 utf-8 valid invalid / ],
+ },
+);
+
+$builder->create_build_script();
@@ -0,0 +1,5 @@
+Revision history for UTF8-Char
+
+0.01 26 Jan 2010
+ First version, released on an unsuspecting world.
+
@@ -0,0 +1,16 @@
+Build.PL
+Changes
+lib/XML/Char.h
+lib/XML/Char.pm
+lib/XML/Char.xs
+lib/XML/ppport.h
+Makefile.PL
+MANIFEST This list of files
+META.yml
+README
+t/01_XML-Char.t
+t/distribution.t
+t/fixme.t
+t/pod-coverage.t
+t/pod-spell.t
+t/pod.t
@@ -0,0 +1,55 @@
+
+#!start included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP
+# Avoid version control files.
+\bRCS\b
+\bCVS\b
+\bSCCS\b
+,v$
+\B\.svn\b
+\B\.git\b
+\B\.gitignore\b
+\b_darcs\b
+
+# Avoid Makemaker generated and utility files.
+\bMANIFEST\.bak
+\bMakefile$
+\bblib/
+\bMakeMaker-\d
+\bpm_to_blib\.ts$
+\bpm_to_blib$
+\bblibdirs\.ts$ # 6.18 through 6.25 generated this
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\b_build/
+
+# Avoid temp and backup files.
+~$
+\.old$
+\#$
+\b\.#
+\.bak$
+
+# Avoid Devel::Cover files.
+\bcover_db\b
+#!end included /usr/share/perl/5.10/ExtUtils/MANIFEST.SKIP
+
+# Avoid configuration metadata file
+^MYMETA\.
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\bBuild.bat$
+\b_build
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+^MANIFEST\.SKIP
+
+# Avoid archives of this distribution
+\bXML-Char-[\d\.\_]+
+
+^lib/XML/Char.c
+^lib/XML/Char.o
+
+^tmp/*
45 README
@@ -0,0 +1,45 @@
+NAME
+ XML::Char - validate characters for XML
+
+SYNOPSIS
+ use XML::Char;
+ if (not XML::Char->valid("bell ".chr(7))) {
+ die 'no way to store this string directly to XML';
+ }
+
+ use XML::Char;
+ if (XML::Char->valid("UTF8 je pořádný peklo")) {
+ print "fuf, we are fine\n";
+ }
+
+DESCRIPTION
+ For me it was kind of a surprised to learn that `char(0)' is a valid
+ UTF-8 character. All of the 0-0x7F are...
+
+ Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them
+
+ Well of course not, now I know :-)
+
+ http://www.w3.org/TR/REC-xml/#charsets defines which characters XML
+ processors MUST accept:
+
+ [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+
+ This module validates if a given string meets this criteria. In addition
+ the string has to be a Perl UTF-8 string (`is_utf8_string()' - see
+ perlapi).
+
+ valid($value)
+ Returns true or false if `$value' consists of valid UTF-8 XML
+ characters.
+
+AUTHOR
+ Jozef Kutej
+
+COPYRIGHT
+ Copyright 2009 Jozef Kutej, all rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the same terms as Perl itself.
+
@@ -0,0 +1,3 @@
+// ranges has to be sorted to work properly
+const unsigned long xml_ranges_from[] = {0x9, 0xA, 0xD, 0x20, 0xE000, 0x10000, 0};
+const unsigned long xml_ranges_to[] = {0x9, 0xA, 0xD, 0xD7FF, 0xFFFD, 0x1FFFF, 0};
@@ -0,0 +1,96 @@
+package XML::Char;
+
+use utf8;
+
+=head1 NAME
+
+XML::Char - validate characters for XML
+
+=head1 SYNOPSIS
+
+ use XML::Char;
+ if (not XML::Char->valid("bell ".chr(7))) {
+ die 'no way to store this string directly to XML';
+ }
+
+ use XML::Char;
+ if (XML::Char->valid("UTF8 je pořádný peklo")) {
+ print "fuf, we are fine\n";
+ }
+
+=head1 DESCRIPTION
+
+For me it was kind of a surprised to learn that C<char(0)> is a valid UTF-8
+character. All of the 0-0x7F are...
+
+ Emo: well it's not because that they are valid utf-8 characters that you have to expect XML to accept them
+
+Well of course not, now I know :-)
+
+L<http://www.w3.org/TR/REC-xml/#charsets> defines which characters XML processors MUST accept:
+
+ [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+
+This module validates if a given string meets this criteria. In addition
+the string has to be a Perl UTF-8 string (C<is_utf8_string()> - see L<perlapi/Unicode-Support>).
+
+=cut
+
+use 5.006;
+use strict;
+use warnings;
+
+our $VERSION = '0.01';
+
+use parent qw(DynaLoader);
+
+use Exporter 'import';
+our @EXPORT_OK = qw(
+ perlapi_is_utf8_string
+);
+
+__PACKAGE__->bootstrap;
+
+=head2 valid($value)
+
+Returns true or false if C<$value> consists of valid UTF-8 XML characters.
+
+=cut
+
+sub valid {
+ my ($self, $value);
+ if (@_ < 2) {
+ $self = __PACKAGE__;
+ $value = shift @_;
+ }
+ else {
+ $self = shift @_;
+ $value = shift @_;
+ }
+
+ die 'bad usage'
+ if not eval { $self->can('valid') };
+
+ # undef is valid
+ return 1
+ if not defined $value;
+
+ return _valid_xml_string($value);
+}
+
+1;
+
+__END__
+
+=head1 AUTHOR
+
+Jozef Kutej
+
+=head1 COPYRIGHT
+
+Copyright 2009 Jozef Kutej, all rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
@@ -0,0 +1,68 @@
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#include "ppport.h"
+#include "Char.h"
+
+
+MODULE = XML::Char PACKAGE = XML::Char
+
+int
+_valid_xml_string(string)
+ SV* string;
+
+ PREINIT:
+ STRLEN len;
+ U8 * bytes;
+ int in_range;
+ int range_index;
+
+ STRLEN ret_len;
+ UV uniuv;
+ CODE:
+ bytes = (U8*)SvPV(string, len);
+ if (!is_utf8_string(bytes, len)) {
+ // warn("no utf8\n");
+
+ RETVAL = 0;
+ }
+ else {
+ // by default return true (ex. empty string)
+ RETVAL = 1;
+
+ // loop through all UTF-8 characters and make sure they are in allowed ranges
+ while (len > 0) {
+ // get unicode character value
+ uniuv = utf8_to_uvuni(bytes, &ret_len);
+ // warn("code: 0x%X len: %d\n", uniuv, ret_len);
+ bytes += ret_len;
+ len -= ret_len;
+
+ // loop through allowed ranges and check if the character is in any of them
+ range_index = 0;
+ in_range = 0;
+ while (xml_ranges_from[range_index] != 0) {
+ // rangers are sorted so if the unicode value is smaller than current range_from then it is not in any range
+ if (uniuv < xml_ranges_from[range_index]) {
+ break;
+ }
+ // if the unicode value fall in this range it's valid
+ if ((uniuv >= xml_ranges_from[range_index]) && (uniuv <= xml_ranges_to[range_index])) {
+ // in the range
+ in_range = 1;
+ break;
+ }
+ range_index++;
+ }
+
+ // if the current character is not in allowed ranges return false
+ if (!in_range) {
+ RETVAL = 0;
+ break;
+ }
+ }
+ }
+
+ OUTPUT:
+ RETVAL
Oops, something went wrong.

0 comments on commit cf12a53

Please sign in to comment.