Browse files

Issue #92 - Performance of Verifier.

Initial commit with Byte-per-Character system for bitmask-based lookup tables.
See details at https://github.com/hunterhacker/jdom/wiki/Verifier-Performance
  • Loading branch information...
1 parent 500f9e5 commit 4ad684aa0427ee4f8feec8bbe360d6e32e32771c @rolfl rolfl committed Sep 2, 2012
View
149 contrib/src/java/org/jdom2/contrib/perf/PerfVerifier.java
@@ -0,0 +1,149 @@
+package org.jdom2.contrib.perf;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+
+import org.jdom2.Verifier;
+
+/**
+ * Micro-benchmark for the JDOM {@link Verifier}.
+ * <p>
+ * The single command-line argument names a directory containing the files
+ * <code>checkAttributeName.txt</code>, <code>checkElementName.txt</code> and
+ * <code>checkCharacterData.txt</code>. Each file is read as UTF-8 text in
+ * which every value is terminated by a NUL (U+0000) character. The values
+ * are replayed through the matching <code>Verifier</code> check method for
+ * a number of warm-up passes followed by a number of timed passes, and the
+ * accumulated time of the timed passes is reported together with the change
+ * in used heap.
+ *
+ * @author Rolf Lear
+ *
+ */
+public class PerfVerifier {
+
+    /** Total number of passes over the data (warm-up passes plus timed passes). */
+    private static final int TOTAL_ROUNDS = 18;
+
+    /** Number of final passes whose timings are added to the reported totals. */
+    private static final int TIMED_ROUNDS = 10;
+
+    /** Nanoseconds per millisecond, used when printing timings. */
+    private static final double NANOS_PER_MILLI = 1000000.0;
+
+    /**
+     * Runs the benchmark.
+     *
+     * @param args exactly one entry: the directory holding the captured data files.
+     * @throws IllegalArgumentException if the argument is missing or is not a directory.
+     * @throws IllegalStateException if a data file cannot be read, or the thread is
+     *         interrupted while the heap is being measured.
+     */
+    @SuppressWarnings("javadoc")
+    public static void main(final String[] args) {
+        if (args.length != 1) {
+            throw new IllegalArgumentException("We expect a single directory argument.");
+        }
+        final File dir = new File(args[0]);
+        if (!dir.isDirectory()) {
+            throw new IllegalArgumentException("We expect a single directory argument.");
+        }
+
+        System.out.println("Loading data");
+
+        final String[] attnames = parseFile(new File(dir, "checkAttributeName.txt"));
+        final String[] emtnames = parseFile(new File(dir, "checkElementName.txt"));
+        final String[] chardata = parseFile(new File(dir, "checkCharacterData.txt"));
+
+        System.out.println("Stabilize");
+        final long prebytes = getMemUsed();
+
+        System.out.println("Launch");
+
+        // Totals over the timed passes only.
+        long sattnanos = 0L;
+        long semtnanos = 0L;
+        long schrnanos = 0L;
+
+        // cnt counts down so that the printed pass number matches the original output.
+        for (int cnt = TOTAL_ROUNDS - 1; cnt >= 0; cnt--) {
+            long start = System.nanoTime();
+            for (int i = attnames.length - 1; i >= 0; i--) {
+                Verifier.checkAttributeName(attnames[i]);
+            }
+            final long attnanos = System.nanoTime() - start;
+
+            start = System.nanoTime();
+            for (int i = emtnames.length - 1; i >= 0; i--) {
+                Verifier.checkElementName(emtnames[i]);
+            }
+            final long emtnanos = System.nanoTime() - start;
+
+            start = System.nanoTime();
+            for (int i = chardata.length - 1; i >= 0; i--) {
+                Verifier.checkCharacterData(chardata[i]);
+            }
+            final long chrnanos = System.nanoTime() - start;
+
+            // The first passes only warm the JIT up; they are printed but not summed.
+            final boolean warmup = cnt >= TIMED_ROUNDS;
+            System.out.printf(" %s %2d took: att=%.3fms emt=%.3fms char=%.3fms\n",
+                    warmup ? "Warmup" : "Loop", cnt,
+                    attnanos / NANOS_PER_MILLI, emtnanos / NANOS_PER_MILLI,
+                    chrnanos / NANOS_PER_MILLI);
+
+            if (!warmup) {
+                sattnanos += attnanos;
+                semtnanos += emtnanos;
+                schrnanos += chrnanos;
+            }
+        }
+
+        final long memused = getMemUsed() - prebytes;
+
+        System.out.printf("Validating took: att=%.3fms emt=%.3fms char=%.3fms mem=%.3fKB\n",
+                sattnanos / NANOS_PER_MILLI, semtnanos / NANOS_PER_MILLI,
+                schrnanos / NANOS_PER_MILLI, memused / 1024.0);
+
+        // NOTE(review): the result of this call is discarded; its purpose is not
+        // evident from this file - confirm before removing.
+        Verifier.isAllXMLWhitespace(" ");
+        System.out.println("Checks " + (chardata.length + emtnames.length + attnames.length));
+    }
+
+    /**
+     * Samples the used heap (total minus free) after requesting a garbage
+     * collection, repeating until three consecutive samples fail to set a new
+     * minimum.
+     *
+     * @return the smallest used-heap sample seen, in bytes.
+     * @throws IllegalStateException if the thread is interrupted while sleeping
+     *         between samples; the interrupt status is restored first.
+     */
+    private static long getMemUsed() {
+        long minused = Long.MAX_VALUE;
+        int cnt = 0;
+        final Runtime rt = Runtime.getRuntime();
+        try {
+            while (cnt < 3) {
+                System.gc();
+                Thread.yield();
+                Thread.sleep(100);
+                final long used = rt.totalMemory() - rt.freeMemory();
+                if (used < minused) {
+                    cnt = 0;
+                    minused = used;
+                } else {
+                    cnt++;
+                }
+            }
+        } catch (InterruptedException ie) {
+            // Keep the interrupt visible to whoever handles the exception.
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("Interrupted", ie);
+        }
+        return minused;
+    }
+
+    /**
+     * Loads a file of NUL-terminated values.
+     * <p>
+     * Characters after the last NUL terminator (an unterminated final value)
+     * are not returned.
+     *
+     * @param file the UTF-8 encoded file to read.
+     * @return the values in file order; never <code>null</code>.
+     * @throws IllegalStateException if the file cannot be opened or read.
+     */
+    private static final String[] parseFile(File file) {
+        final ArrayList<String> vals = new ArrayList<String>(10240);
+        try {
+            final FileInputStream fis = new FileInputStream(file);
+            try {
+                final StringBuilder sb = new StringBuilder(1024);
+                final InputStreamReader isr = new InputStreamReader(fis, Charset.forName("UTF-8"));
+                System.out.println("Loading " + file.getPath());
+                int c = 0;
+                while ((c = isr.read()) >= 0) {
+                    if (c == 0) {
+                        vals.add(sb.toString());
+                        sb.setLength(0);
+                    } else {
+                        sb.append((char)c);
+                    }
+                }
+            } finally {
+                // Release the file handle even when reading fails part-way.
+                fis.close();
+            }
+        } catch (IOException ioe) {
+            // Returning null here would only resurface as an unexplained
+            // NullPointerException in main(); fail with the real cause instead.
+            throw new IllegalStateException("Unable to load " + file.getPath(), ioe);
+        }
+        return vals.toArray(new String[vals.size()]);
+    }
+
+}
View
1,476 contrib/src/java/org/jdom2/contrib/perf/SavingVerifier.java
@@ -0,0 +1,1476 @@
+/*--
+
+ Copyright (C) 2000-2012 Jason Hunter & Brett McLaughlin.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the disclaimer that follows
+ these conditions in the documentation and/or other materials
+ provided with the distribution.
+
+ 3. The name "JDOM" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact <request_AT_jdom_DOT_org>.
+
+ 4. Products derived from this software may not be called "JDOM", nor
+ may "JDOM" appear in their name, without prior written permission
+ from the JDOM Project Management <request_AT_jdom_DOT_org>.
+
+ In addition, we request (but do not require) that you include in the
+ end-user documentation provided with the redistribution and/or in the
+ software itself an acknowledgement equivalent to the following:
+ "This product includes software developed by the
+ JDOM Project (http://www.jdom.org/)."
+ Alternatively, the acknowledgment may be graphical using the logos
+ available at http://www.jdom.org/images/logos.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the JDOM Project and was originally
+ created by Jason Hunter <jhunter_AT_jdom_DOT_org> and
+ Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information
+ on the JDOM Project, please see <http://www.jdom.org/>.
+
+ */
+
+package org.jdom2.contrib.perf;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.jdom2.Attribute;
+import org.jdom2.CDATA;
+import org.jdom2.Comment;
+import org.jdom2.DocType;
+import org.jdom2.Element;
+import org.jdom2.EntityRef;
+import org.jdom2.Namespace;
+import org.jdom2.ProcessingInstruction;
+
+/**
+ * NOTE
+ * ====
+ * This is a clone of the pre-bitmask Verifier.java file, modified to 'save' the
+ * data checked by the verifier. It is used with PerfVerifier to test the Verifier's performance.
+ *
+ *
+ * Old Comments:
+ * =============
+ * A utility class to handle well-formedness checks on names, data, and other
+ * verification tasks for JDOM. The class is final and may not be subclassed.
+ *
+ * @author Brett McLaughlin
+ * @author Elliotte Rusty Harold
+ * @author Jason Hunter
+ * @author Bradley S. Huffman
+ * @author Rolf Lear
+ */
+final public class SavingVerifier {
+
+ private static final AtomicBoolean open = new AtomicBoolean(true); // true while values are still being recorded; cleared by getWriter() on failure and by closeWriters()
+
+ private static final Writer getWriter(final String name) {
+ try {
+ final FileOutputStream fos = new FileOutputStream(name);
+ final OutputStreamWriter osw = new OutputStreamWriter(fos, Charset.forName("UTF-8"));
+ return new BufferedWriter(osw);
+ } catch (IOException ioe) {
+ ioe.printStackTrace();
+ open.set(false);
+ return null;
+ }
+ }
+
+ /**
+  * Appends one captured value, followed by a NUL (U+0000) terminator, to a
+  * capture file. Nothing is written once recording has been switched off
+  * through the <code>open</code> flag.
+  * <p>
+  * A <code>null</code> value is skipped: <code>Writer.write(String)</code>
+  * throws a NullPointerException for it, which would prevent the calling
+  * check method from reporting the null through its normal return value.
+  *
+  * @param writer the capture file to append to.
+  * @param value the value being verified; may be <code>null</code>.
+  */
+ private static final void write(Writer writer, String value) {
+     if (value == null || !open.get()) {
+         return;
+     }
+     try {
+         writer.write(value);
+         writer.write((char)0);
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+ }
+
+ private static final String elementfilename = "checkElementName.txt"; // receives every value passed to checkElementName
+ private static final String attributefilename = "checkAttributeName.txt"; // receives every value passed to checkAttributeName
+ private static final String chardatafilename = "checkCharacterData.txt"; // receives every value passed to checkCharacterData
+
+ private static final Writer elementnamewriter = getWriter(elementfilename); // opened at class initialization; null if the file could not be opened
+ private static final Writer attributenamewriter = getWriter(attributefilename); // opened at class initialization; null if the file could not be opened
+ private static final Writer characterdatawriter = getWriter(chardatafilename); // opened at class initialization; null if the file could not be opened
+
+ /**
+  * Stops recording and closes the three capture files. Closing a
+  * <code>BufferedWriter</code> flushes it first, so no separate flush is
+  * needed. Calls made after recording has already stopped do nothing.
+  * <p>
+  * Every writer is closed even if closing an earlier one fails.
+  *
+  * @throws IOException if a capture file cannot be flushed or closed.
+  */
+ public static final void closeWriters() throws IOException {
+     if (!open.getAndSet(false)) {
+         return;
+     }
+     try {
+         elementnamewriter.close();
+     } finally {
+         try {
+             attributenamewriter.close();
+         } finally {
+             characterdatawriter.close();
+         }
+     }
+ }
+
+ /**
+ * Ensure instantiation cannot occur.
+ */
+ private SavingVerifier() { }
+
+ /**
+  * This will check the supplied name to see if it is legal for use as
+  * a JDOM <code>{@link Element}</code> name. A non-null name is also
+  * recorded in the element-name capture file.
+  *
+  * @param name <code>String</code> name to check.
+  * @return <code>String</code> reason name is illegal, or
+  *         <code>null</code> if name is OK.
+  */
+ public static String checkElementName(final String name) {
+
+     // A null cannot be recorded; let the XML name check report it below.
+     if (name != null) {
+         write(elementnamewriter, name);
+     }
+
+     // Check basic XML name rules first, validating the name only once.
+     final String reason = checkXMLName(name);
+     if (reason != null) {
+         return reason;
+     }
+
+     // No colons allowed, since elements handle this internally
+     if (name.indexOf(':') != -1) {
+         return "Element names cannot contain colons";
+     }
+
+     // If we got here, everything is OK
+     return null;
+ }
+
+ /**
+  * This will check the supplied name to see if it is legal for use as
+  * a JDOM <code>{@link Attribute}</code> name. A non-null name is also
+  * recorded in the attribute-name capture file.
+  *
+  * @param name <code>String</code> name to check.
+  * @return <code>String</code> reason name is illegal, or
+  *         <code>null</code> if name is OK.
+  */
+ public static String checkAttributeName(final String name) {
+
+     // A null cannot be recorded; let the XML name check report it below.
+     if (name != null) {
+         write(attributenamewriter, name);
+     }
+
+     // Check basic XML name rules first, validating the name only once.
+     final String reason = checkXMLName(name);
+     if (reason != null) {
+         return reason;
+     }
+
+     // No colons are allowed, since attributes handle this internally
+     if (name.indexOf(':') != -1) {
+         return "Attribute names cannot contain colons";
+     }
+
+     // Attribute names may not be xmlns since we do this internally too
+     if (name.equals("xmlns")) {
+         return "An Attribute name may not be \"xmlns\"; " +
+                 "use the Namespace class to manage namespaces";
+     }
+
+     // If we got here, everything is OK
+     return null;
+ }
+
+ /**
+  * This will check the supplied string to see if it only contains
+  * characters allowed by the XML 1.0 specification. The C0 controls
+  * (e.g. null, vertical tab, form-feed, etc.) are specifically excluded
+  * except for carriage return, line-feed, and the horizontal tab.
+  * Unpaired surrogates are also excluded.
+  * <p>
+  * This method is useful for checking element content and attribute
+  * values. Note that characters
+  * like " and &lt; are allowed in attribute values and element content.
+  * They will simply be escaped as &amp;quot; or &amp;lt;
+  * when the value is serialized.
+  * </p>
+  * <p>
+  * A non-null value is also recorded in the character-data capture file.
+  * </p>
+  *
+  * @param text <code>String</code> value to check.
+  * @return <code>String</code> reason the text is illegal, or
+  *         <code>null</code> if the text is OK.
+  */
+ public static String checkCharacterData(final String text) {
+     // Reject null before recording: a null cannot be written to the
+     // capture file and must be reported through the return value.
+     if (text == null) {
+         return "A null is not a legal XML value";
+     }
+     write(characterdatawriter, text);
+
+     // lowx indicates we expect a low surrogate next.
+     boolean lowx = false;
+     final int len = text.length();
+     for (int i = 0; i < len; i++) {
+         // we are expecting a normal char, but may be a surrogate.
+         if (isXMLCharacter(text.charAt(i))) {
+             if (lowx) {
+                 // we got a normal character, but we wanted a low surrogate
+                 return String.format("Illegal Surrogate Pair 0x%04x%04x",
+                         (int)text.charAt(i - 1), (int)text.charAt(i));
+             }
+         } else {
+             // the character is not a normal character.
+             // we need to sort out what it is. Neither high nor low
+             // surrogate pairs are valid characters, so they will get here.
+
+             if (!lowx && isHighSurrogate(text.charAt(i))) {
+                 // we have the valid high char of a pair.
+                 // we will expect the low char on the next loop through,
+                 // so mark the high char, and move on.
+                 lowx = true;
+             } else if (lowx && isLowSurrogate(text.charAt(i))) {
+                 // we now have the low char of a pair, decode and validate
+                 final int chi = decodeSurrogatePair(
+                         text.charAt(i - 1), text.charAt(i));
+                 if (!isXMLCharacter(chi)) {
+                     // Likely this character can't be easily displayed
+                     // because it's a control so we use its hexadecimal
+                     // representation in the reason.
+                     return String.format("0x%06x is not a legal XML character",
+                             chi);
+                 }
+                 lowx = false;
+             } else {
+                 // Likely this character can't be easily displayed
+                 // because it's a control so we use its hexadecimal
+                 // representation in the reason.
+                 return String.format("0x%04x is not a legal XML character",
+                         (int)text.charAt(i));
+             }
+         }
+     }
+
+     if (lowx) {
+         // the text ended on a high surrogate with nothing to pair it with.
+         return String.format("Truncated Surrogate Pair 0x%04x????",
+                 (int)text.charAt(text.length() - 1));
+     }
+
+     // If we got here, everything is OK
+     return null;
+ }
+
+ /**
+  * Checks whether the supplied data may be used as the content of a JDOM
+  * <code>{@link CDATA}</code> section: it must pass the character-data
+  * check and must not contain the CDATA end delimiter.
+  *
+  * @param data <code>String</code> data to check.
+  * @return <code>String</code> reason data is illegal, or
+  *         <code>null</code> if data is OK.
+  */
+ public static String checkCDATASection(final String data) {
+     final String problem = checkCharacterData(data);
+     if (problem != null) {
+         return problem;
+     }
+
+     final boolean hasTerminator = data.indexOf("]]>") >= 0;
+     return hasTerminator
+             ? "CDATA cannot internally contain a CDATA ending " + "delimiter (]]>)"
+             : null;
+ }
+
+ /**
+ * This will check the supplied name to see if it is legal for use as
+ * a JDOM <code>{@link Namespace}</code> prefix.
+ *
+ * @param prefix <code>String</code> prefix to check.
+ * @return <code>String</code> reason name is illegal, or
+ * <code>null</code> if name is OK.
+ */
+ public static String checkNamespacePrefix(final String prefix) {
+ // Manually do rules, since URIs can be null or empty
+ if ((prefix == null) || (prefix.equals(""))) {
+ return null;
+ }
+
+ // Cannot start with a number
+ final char first = prefix.charAt(0);
+ if (isXMLDigit(first)) {
+ return "Namespace prefixes cannot begin with a number";
+ }
+ // Cannot start with a $
+ if (first == '$') {
+ return "Namespace prefixes cannot begin with a dollar sign ($)";
+ }
+ // Cannot start with a -
+ if (first == '-') {
+ return "Namespace prefixes cannot begin with a hyphen (-)";
+ }
+ // Cannot start with a .
+ if (first == '.') {
+ return "Namespace prefixes cannot begin with a period (.)";
+ }
+ // Cannot start with "xml" in any character case
+ if (prefix.toLowerCase().startsWith("xml")) {
+ return "Namespace prefixes cannot begin with " +
+ "\"xml\" in any combination of case";
+ }
+
+ // Ensure legal content
+ for (int i=0, len = prefix.length(); i<len; i++) {
+ final char c = prefix.charAt(i);
+ if (!isXMLNameCharacter(c)) {
+ return "Namespace prefixes cannot contain the character \"" +
+ c + "\"";
+ }
+ }
+
+ // No colons allowed
+ if (prefix.indexOf(":") != -1) {
+ return "Namespace prefixes cannot contain colons";
+ }
+
+ // If we got here, everything is OK
+ return null;
+ }
+
+ /**
+ * This will check the supplied name to see if it is legal for use as
+ * a JDOM <code>{@link Namespace}</code> URI.
+ * <p>
+ * This is a 'light' test of URI's designed to filter out only the worst
+ * illegal URIs. It tests only to ensure the first character is valid. A
+ * comprehensive URI validation process would be impractical.
+ *
+ * @param uri <code>String</code> URI to check.
+ * @return <code>String</code> reason name is illegal, or
+ * <code>null</code> if name is OK.
+ */
+ public static String checkNamespaceURI(final String uri) {
+ // Manually do rules, since URIs can be null or empty
+ if ((uri == null) || (uri.equals(""))) {
+ return null;
+ }
+
+ // Cannot start with a number
+ final char first = uri.charAt(0);
+ if (Character.isDigit(first)) {
+ return "Namespace URIs cannot begin with a number";
+ }
+ // Cannot start with a $
+ if (first == '$') {
+ return "Namespace URIs cannot begin with a dollar sign ($)";
+ }
+ // Cannot start with a -
+ if (first == '-') {
+ return "Namespace URIs cannot begin with a hyphen (-)";
+ }
+
+ // Cannot start with space...
+ if (isXMLWhitespace(first)) {
+ return "Namespace URIs cannot begin with white-space";
+ }
+
+ // If we got here, everything is OK
+ return null;
+ }
+
+ /**
+  * Checks whether two namespaces collide, i.e. use the same prefix for
+  * different URIs.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param other <code>Namespace</code> to check against.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final Namespace other) {
+     final String prefix = namespace.getPrefix();
+     final String uri = namespace.getURI();
+     final String otherPrefix = other.getPrefix();
+     final String otherUri = other.getURI();
+
+     final boolean collides = prefix.equals(otherPrefix) && !uri.equals(otherUri);
+     if (!collides) {
+         return null;
+     }
+     return "The namespace prefix \"" + prefix + "\" collides";
+ }
+
+ /**
+  * Checks whether the namespace of an <code>{@link Attribute}</code>
+  * collides with the namespaces of an <code>{@link Element}</code>, with
+  * no attribute excluded from the comparison.
+  *
+  * @param attribute <code>Attribute</code> to check.
+  * @param element <code>Element</code> to check against.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Attribute attribute,
+         final Element element) {
+     // -1 never matches an attribute position, so nothing is ignored.
+     final int nothingIgnored = -1;
+     return checkNamespaceCollision(attribute, element, nothingIgnored);
+ }
+
+ /**
+  * Checks whether the namespace of an <code>{@link Attribute}</code>
+  * collides with the namespaces of an <code>{@link Element}</code>. An
+  * attribute whose namespace has the empty prefix never collides.
+  *
+  * @param attribute <code>Attribute</code> to check.
+  * @param element <code>Element</code> to check against.
+  * @param ignoreatt Ignore a specific Attribute (if it exists) when
+  *        calculating any collisions (used when replacing one attribute
+  *        with another).
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Attribute attribute,
+         final Element element, final int ignoreatt) {
+     final Namespace attributeNamespace = attribute.getNamespace();
+     final boolean unprefixed = "".equals(attributeNamespace.getPrefix());
+     return unprefixed
+             ? null
+             : checkNamespaceCollision(attributeNamespace, element, ignoreatt);
+ }
+
+ /**
+  * Checks whether a <code>{@link Namespace}</code> collides with the
+  * namespaces of an <code>{@link Element}</code>, with no attribute
+  * excluded from the comparison.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param element <code>Element</code> to check against.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final Element element) {
+     // -1 never matches an attribute position, so nothing is ignored.
+     final int nothingIgnored = -1;
+     return checkNamespaceCollision(namespace, element, nothingIgnored);
+ }
+
+ /**
+  * Checks whether a <code>{@link Namespace}</code> collides with an
+  * <code>{@link Element}</code>: first against the element's own
+  * namespace, then its additional namespaces, then its attributes.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param element <code>Element</code> to check against.
+  * @param ignoreatt Ignore a specific Attribute (if it exists) when
+  *        calculating any collisions (used when replacing one attribute
+  *        with another).
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final Element element, final int ignoreatt) {
+     final String ownClash = checkNamespaceCollision(namespace,
+             element.getNamespace());
+     if (ownClash != null) {
+         return ownClash + " with the element namespace prefix";
+     }
+
+     if (element.hasAdditionalNamespaces()) {
+         final String declaredClash = checkNamespaceCollision(namespace,
+                 element.getAdditionalNamespaces());
+         if (declaredClash != null) {
+             return declaredClash;
+         }
+     }
+
+     if (element.hasAttributes()) {
+         final String attributeClash = checkNamespaceCollision(namespace,
+                 element.getAttributes(), ignoreatt);
+         if (attributeClash != null) {
+             return attributeClash;
+         }
+     }
+
+     return null;
+ }
+
+ /**
+  * Checks whether a <code>{@link Namespace}</code> collides with the
+  * namespace of an <code>{@link Attribute}</code>. Attributes in
+  * <code>Namespace.NO_NAMESPACE</code> never collide.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param attribute <code>Attribute</code> to check against.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final Attribute attribute) {
+     if (attribute.getNamespace().equals(Namespace.NO_NAMESPACE)) {
+         return null;
+     }
+     final String clash = checkNamespaceCollision(namespace,
+             attribute.getNamespace());
+     if (clash == null) {
+         return null;
+     }
+     return clash + " with an attribute namespace prefix on the element";
+ }
+
+ /**
+  * Checks whether a <code>{@link Namespace}</code> collides with any
+  * namespace found in a list of objects, with no attribute excluded from
+  * the comparison.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param list <code>List</code> to check against.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final List<?> list) {
+     // -1 never matches a list position, so nothing is ignored.
+     final int nothingIgnored = -1;
+     return checkNamespaceCollision(namespace, list, nothingIgnored);
+ }
+
+ /**
+  * Checks whether a <code>{@link Namespace}</code> collides with any
+  * namespace found in a list of objects. Attributes, elements and
+  * namespaces in the list are examined in order; other objects are
+  * skipped. The first collision found is reported.
+  *
+  * @param namespace <code>Namespace</code> to check.
+  * @param list <code>List</code> to check against; <code>null</code> means
+  *        nothing to collide with.
+  * @param ignoreatt Ignore a specific Attribute (if it exists) when
+  *        calculating any collisions (used when replacing one attribute
+  *        with another). It is compared with the item's position in the list.
+  * @return <code>String</code> reason for collision, or
+  *         <code>null</code> if no collision.
+  */
+ public static String checkNamespaceCollision(final Namespace namespace,
+         final List<?> list, final int ignoreatt) {
+     if (list == null) {
+         return null;
+     }
+
+     int position = -1;
+     for (final Object item : list) {
+         position++;
+         String clash = null;
+         if (item instanceof Attribute) {
+             // the attribute being replaced is exempt from the comparison.
+             if (position != ignoreatt) {
+                 clash = checkNamespaceCollision(namespace, (Attribute) item);
+             }
+         } else if (item instanceof Element) {
+             clash = checkNamespaceCollision(namespace, (Element) item);
+         } else if (item instanceof Namespace) {
+             clash = checkNamespaceCollision(namespace, (Namespace) item);
+             if (clash != null) {
+                 clash += " with an additional namespace declared" +
+                         " by the element";
+             }
+         }
+         if (clash != null) {
+             return clash;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Checks whether the supplied string may be used as the target of a JDOM
+  * <code>{@link ProcessingInstruction}</code>: it must be a legal XML name,
+  * contain no colon, and not be "xml" in any character case.
+  *
+  * @param target <code>String</code> target to check.
+  * @return <code>String</code> reason target is illegal, or
+  *         <code>null</code> if target is OK.
+  */
+ public static String checkProcessingInstructionTarget(final String target) {
+     // The basic XML name rules come first.
+     final String nameProblem = checkXMLName(target);
+     if (nameProblem != null) {
+         return nameProblem;
+     }
+
+     // No colons allowed, per Namespace Specification Section 6
+     final boolean hasColon = target.indexOf(":") >= 0;
+     if (hasColon) {
+         return "Processing instruction targets cannot contain colons";
+     }
+
+     // The target "xml" is reserved, whatever its character case.
+     if (!target.equalsIgnoreCase("xml")) {
+         return null;
+     }
+     return "Processing instructions cannot have a target of " +
+             "\"xml\" in any combination of case. (Note that the " +
+             "\"<?xml ... ?>\" declaration at the beginning of a " +
+             "document is not a processing instruction and should not " +
+             "be added as one; it is written automatically during " +
+             "output, e.g. by XMLOutputter.)";
+ }
+
+ /**
+  * Checks whether the supplied string may be used as
+  * <code>{@link ProcessingInstruction}</code> data. Besides checking that
+  * all the characters are allowed in XML, this also checks that the data
+  * does not contain the PI end-string "?&gt;".
+  *
+  * @param data <code>String</code> data to check.
+  * @return <code>String</code> reason data is illegal, or
+  *         <code>null</code> if data is OK.
+  */
+ public static String checkProcessingInstructionData(final String data) {
+     // The character-data rules come first.
+     final String characterProblem = checkCharacterData(data);
+     if (characterProblem != null) {
+         return characterProblem;
+     }
+
+     if (data.indexOf("?>") < 0) {
+         return null;
+     }
+     return "Processing instructions cannot contain " +
+             "the string \"?>\"";
+ }
+
+ /**
+  * Checks whether the supplied string may be used as JDOM
+  * <code>{@link Comment}</code> data: it must pass the character-data
+  * check, contain no double hyphen, and not end with a hyphen.
+  *
+  * @param data <code>String</code> data to check.
+  * @return <code>String</code> reason data is illegal, or
+  *         <code>null</code> if data is OK.
+  */
+ public static String checkCommentData(final String data) {
+     final String characterProblem = checkCharacterData(data);
+     if (characterProblem != null) {
+         return characterProblem;
+     }
+
+     final boolean hasDoubleHyphen = data.indexOf("--") >= 0;
+     if (hasDoubleHyphen) {
+         return "Comments cannot contain double hyphens (--)";
+     }
+
+     return data.endsWith("-") ? "Comment data cannot end with a hyphen." : null;
+ }
+ /**
+  * Combines the two halves of a UTF-16 surrogate pair into the
+  * supplementary (non-BMP) code point they encode. The arguments are not
+  * validated.
+  *
+  * @param high the high surrogate (carries the upper 10 bits)
+  * @param low the low surrogate (carries the lower 10 bits)
+  * @return decoded character
+  */
+ public static int decodeSurrogatePair(final char high, final char low) {
+     final int upperBits = (high - 0xD800) << 10;
+     final int lowerBits = low - 0xDC00;
+     return 0x10000 + upperBits + lowerBits;
+ }
+
+ /**
+  * Determines whether a character may appear in a PublicID (in a
+  * {@link DocType} or {@link EntityRef}).
+  * <p>
+  * Accepted are the letters a-z and A-Z, the digits 0-9, space, tab,
+  * carriage return, line-feed and the punctuation
+  * <code>-'()+,./:=?;!*#@$_%</code>.
+  *
+  * @param c the character to validate
+  * @return <code>true</code> if <i>c</i> is accepted,
+  *         <code>false</code> otherwise.
+  */
+ public static boolean isXMLPublicIDCharacter(final char c) {
+     // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] |
+     // [-'()+,./:=?;!*#@$_%]
+
+     // Three contiguous runs cover the letters, the digits and most of the
+     // punctuation: ' ( ) * + , - . / 0-9 : ;   and   ? @ A-Z   and   a-z
+     final boolean inContiguousRun = (c >= '\'' && c <= ';')
+             || (c >= '?' && c <= 'Z')
+             || (c >= 'a' && c <= 'z');
+
+     // The remaining accepted characters are scattered singletons.
+     return inContiguousRun || " !=#$_%\n\r\t".indexOf(c) >= 0;
+ }
+
+ /**
+  * Checks that every character of a public identifier is legal. A
+  * <code>null</code> identifier means there is no public ID and is accepted.
+  *
+  * @param publicID <code>String</code> public ID to check.
+  * @return <code>String</code> reason public ID is illegal (naming the first
+  *         offending character), or <code>null</code> if public ID is OK.
+  */
+ public static String checkPublicID(final String publicID) {
+     // A null indicates there is no public ID
+     if (publicID == null) {
+         return null;
+     }
+
+     for (int pos = 0; pos < publicID.length(); pos++) {
+         final char candidate = publicID.charAt(pos);
+         if (!isXMLPublicIDCharacter(candidate)) {
+             return candidate + " is not a legal character in public IDs";
+         }
+     }
+
+     return null;
+ }
+
+
+ /**
+ * This will ensure that the data for a system literal
+ * is legal.
+ *
+ * @param systemLiteral <code>String</code> system literal to check.
+ * @return <code>String</code> reason system literal is illegal, or
+ * <code>null</code> if system literal is OK.
+ */
+ public static String checkSystemLiteral(final String systemLiteral) {
+ String reason = null;
+
+ if (systemLiteral == null) return null;
+ // This indicates there is no system ID
+
+ if (systemLiteral.indexOf('\'') != -1
+ && systemLiteral.indexOf('"') != -1) {
+ reason =
+ "System literals cannot simultaneously contain both single and double quotes.";
+ }
+ else {
+ reason = checkCharacterData(systemLiteral);
+ }
+
+ return reason;
+ }
+
+ /**
+  * Shared check applied to every kind of XML name: the name must be
+  * non-null and non-empty, start with a name-start character, and continue
+  * with name characters only.
+  *
+  * @param name <code>String</code> to check for XML name compliance.
+  * @return <code>String</code> reason the name is illegal, or
+  *         <code>null</code> if OK.
+  */
+ public static String checkXMLName(final String name) {
+     if (name == null) {
+         return "XML names cannot be null";
+     }
+     if (name.length() == 0) {
+         return "XML names cannot be empty";
+     }
+
+     // The first character has its own, stricter rule.
+     final char lead = name.charAt(0);
+     if (!isXMLNameStartCharacter(lead)) {
+         return "XML names cannot begin with the character \"" +
+                 lead + "\"";
+     }
+
+     // Every following character must be a legal name character.
+     for (int pos = 1, end = name.length(); pos < end; pos++) {
+         final char current = name.charAt(pos);
+         if (!isXMLNameCharacter(current)) {
+             return "XML names cannot contain the character \"" + current + "\"";
+         }
+     }
+
+     return null;
+ }
+
+ /**
+ * <p>
+ * Checks a string to see if it is a legal RFC 2396 URI.
+ * Both absolute and relative URIs are supported.
+ * </p>
+ *
+ * @param uri <code>String</code> to check.
+ * @return <code>String</code> reason the URI is illegal, or
+ * <code>null</code> if OK.
+ */
+ public static String checkURI(final String uri) {
+ // URIs can be null or empty
+ if ((uri == null) || (uri.equals(""))) {
+ return null;
+ }
+
+ for (int i = 0; i < uri.length(); i++) {
+ final char test = uri.charAt(i);
+ if (!isURICharacter(test)) {
+ String msgNumber = "0x" + Integer.toHexString(test);
+ if (test <= 0x09) msgNumber = "0x0" + Integer.toHexString(test);
+ return "URIs cannot contain " + msgNumber;
+ } // end if
+ if (test == '%') { // must be followed by two hexadecimal digits
+ try {
+ final char firstDigit = uri.charAt(i+1);
+ final char secondDigit = uri.charAt(i+2);
+ if (!isHexDigit(firstDigit) ||
+ !isHexDigit(secondDigit)) {
+ return "Percent signs in URIs must be followed by "
+ + "exactly two hexadecimal digits.";
+ }
+
+ }
+ catch (final StringIndexOutOfBoundsException e) {
+ return "Percent signs in URIs must be followed by "
+ + "exactly two hexadecimal digits.";
+ }
+ }
+ } // end for
+
+ // If we got here, everything is OK
+ return null;
+ }
+
+ /**
+  * <p>
+  * Determines whether a character is a hexadecimal digit as defined in
+  * RFC 2396; that is, one of the ASCII characters 0-9, a-f, or A-F.
+  * </p>
+  *
+  * @param c to check for hex digit.
+  * @return true if it's allowed, false otherwise.
+  */
+ public static boolean isHexDigit(final char c) {
+     final boolean decimal = c >= '0' && c <= '9';
+     final boolean upperHex = c >= 'A' && c <= 'F';
+     final boolean lowerHex = c >= 'a' && c <= 'f';
+     return decimal || upperHex || lowerHex;
+ }
+
+ /**
+  * Determines whether the specified character is the high (first) half of
+  * a UTF-16 surrogate pair, i.e. lies in the range 0xD800 to 0xDBFF.
+  *
+  * @param ch character to check
+  * @return true if the character is a high surrogate, false otherwise
+  */
+ public static boolean isHighSurrogate(final char ch) {
+     // A high surrogate has the bit pattern 110110xx xxxxxxxx, so masking
+     // the six most significant bits must leave exactly 0xD800.
+     return (ch & 0xFC00) == 0xD800;
+ }
+
+ /**
+  * Tests whether a character is the low (trailing) half of a UTF-16
+  * surrogate pair, i.e. lies in the range 0xDC00 to 0xDFFF.
+  *
+  * @param ch character to check
+  * @return true if the character is a low surrogate, false otherwise.
+  */
+ public static boolean isLowSurrogate(final char ch) {
+     // Every low surrogate has the bit pattern 110111xx xxxxxxxx, so
+     // the six most significant bits are 110111, which is 0x37.
+     final int leadingSixBits = ch >>> 10;
+     return leadingSixBits == 0x37;
+ }
+
+ /**
+ * <p>
+ * This is a utility function for determining whether
+ * a specified Unicode character is legal in URI references
+ * as determined by RFC 2396.
+ * </p>
+ *
+ * @param c <code>char</code> to check for URI reference compliance.
+ * @return true if it's allowed, false otherwise.
+ */
+ public static boolean isURICharacter(final char c) {
+ if (c >= 'a' && c <= 'z') return true;
+ if (c >= 'A' && c <= 'Z') return true;
+ if (c >= '0' && c <= '9') return true;
+ if (c == '/') return true;
+ if (c == '-') return true;
+ if (c == '.') return true;
+ if (c == '?') return true;
+ if (c == ':') return true;
+ if (c == '@') return true;
+ if (c == '&') return true;
+ if (c == '=') return true;
+ if (c == '+') return true;
+ if (c == '$') return true;
+ if (c == ',') return true;
+ if (c == '%') return true;
+
+ if (c == '_') return true;
+ if (c == '!') return true;
+ if (c == '~') return true;
+ if (c == '*') return true;
+ if (c == '\'') return true;
+ if (c == '(') return true;
+ if (c == ')') return true;
+ return false;
+ }
+
+ /**
+  * Tests whether a code point is a legal XML character according to
+  * production 2 of the XML 1.0 specification.
+  * <p>
+  * Legal values are tab, line feed, carriage return, 0x20-0xD7FF,
+  * 0xE000-0xFFFD and 0x10000-0x10FFFF.
+  *
+  * @param c the code point (as an <code>int</code>) to check for XML
+  *        compliance
+  * @return <code>boolean</code> true if it's a character,
+  *         false otherwise
+  */
+ public static boolean isXMLCharacter(final int c) {
+     // The three permitted control characters.
+     if (c == '\n' || c == '\r' || c == '\t') {
+         return true;
+     }
+
+     // Everything else must fall in one of the three legal ranges; the
+     // gaps exclude the surrogate block and 0xFFFE/0xFFFF.
+     return (c >= 0x20 && c <= 0xD7FF)
+         || (c >= 0xE000 && c <= 0xFFFD)
+         || (c >= 0x10000 && c <= 0x10FFFF);
+ }
+
+
+ /**
+  * Tests whether a character may appear inside an XML name according to
+  * production 4 of the XML 1.0 specification.
+  * <p>
+  * A name character is a letter, a digit, one of <code>. - _ :</code>,
+  * a combining character or an extender.
+  *
+  * @param c <code>char</code> to check for XML name compliance.
+  * @return <code>boolean</code> true if it's a name character,
+  *         false otherwise.
+  */
+ public static boolean isXMLNameCharacter(final char c) {
+     // Checks run in the same order as the production is written.
+     if (isXMLLetter(c) || isXMLDigit(c)) {
+         return true;
+     }
+     if (c == '.' || c == '-' || c == '_' || c == ':') {
+         return true;
+     }
+     return isXMLCombiningChar(c) || isXMLExtender(c);
+ }
+
+ /**
+  * Tests whether a character may begin an XML name according to
+  * production 5 of the XML 1.0 specification: a letter, an underscore
+  * or a colon.
+  * <p>
+  * Note that this production accepts a leading colon, which the
+  * Namespaces in XML Recommendation disallows.
+  *
+  * @param c <code>char</code> to check for XML name start compliance.
+  * @return <code>boolean</code> true if it's a name start character,
+  *         false otherwise.
+  */
+ public static boolean isXMLNameStartCharacter(final char c) {
+     if (isXMLLetter(c)) {
+         return true;
+     }
+     return c == '_' || c == ':';
+ }
+
+ /**
+  * Tests whether a character is either a letter (production 84) or a
+  * digit (production 88) of the XML 1.0 specification.
+  *
+  * @param c <code>char</code> to check.
+  * @return <code>boolean</code> true if it's letter or digit,
+  *         false otherwise.
+  */
+ public static boolean isXMLLetterOrDigit(final char c) {
+     if (isXMLLetter(c)) {
+         return true;
+     }
+     return isXMLDigit(c);
+ }
+
+ /**
+ * This is a utility function for determining whether a specified character
+ * is a letter according to production 84 of the XML 1.0 specification.
+ * <p>
+ * The body is a hand-unrolled, strictly ascending range table: each line
+ * first rejects values below the start of the next range and then accepts
+ * values up to the end of that range, so the scan stops at the first
+ * range that is not below the character.
+ *
+ * @param c <code>char</code> to check for XML name compliance.
+ * @return <code>boolean</code> true if it's a letter, false otherwise.
+ */
+ public static boolean isXMLLetter(final char c) {
+ // Note that order is very important here. The search proceeds
+ // from lowest to highest values, so that no searching occurs
+ // above the character's value. BTW, the first line is equivalent to:
+ // if (c >= 0x0041 && c <= 0x005A) return true;
+
+ if (c < 0x0041) return false; if (c <= 0x005a) return true; // ASCII A-Z
+ if (c < 0x0061) return false; if (c <= 0x007A) return true; // ASCII a-z
+ if (c < 0x00C0) return false; if (c <= 0x00D6) return true;
+ if (c < 0x00D8) return false; if (c <= 0x00F6) return true;
+ if (c < 0x00F8) return false; if (c <= 0x00FF) return true;
+ if (c < 0x0100) return false; if (c <= 0x0131) return true;
+ if (c < 0x0134) return false; if (c <= 0x013E) return true;
+ if (c < 0x0141) return false; if (c <= 0x0148) return true;
+ if (c < 0x014A) return false; if (c <= 0x017E) return true;
+ if (c < 0x0180) return false; if (c <= 0x01C3) return true;
+ if (c < 0x01CD) return false; if (c <= 0x01F0) return true;
+ if (c < 0x01F4) return false; if (c <= 0x01F5) return true;
+ if (c < 0x01FA) return false; if (c <= 0x0217) return true;
+ if (c < 0x0250) return false; if (c <= 0x02A8) return true;
+ if (c < 0x02BB) return false; if (c <= 0x02C1) return true;
+ if (c == 0x0386) return true;
+ if (c < 0x0388) return false; if (c <= 0x038A) return true;
+ if (c == 0x038C) return true;
+ if (c < 0x038E) return false; if (c <= 0x03A1) return true;
+ if (c < 0x03A3) return false; if (c <= 0x03CE) return true;
+ if (c < 0x03D0) return false; if (c <= 0x03D6) return true;
+ if (c == 0x03DA) return true;
+ if (c == 0x03DC) return true;
+ if (c == 0x03DE) return true;
+ if (c == 0x03E0) return true;
+ if (c < 0x03E2) return false; if (c <= 0x03F3) return true;
+ if (c < 0x0401) return false; if (c <= 0x040C) return true;
+ if (c < 0x040E) return false; if (c <= 0x044F) return true;
+ if (c < 0x0451) return false; if (c <= 0x045C) return true;
+ if (c < 0x045E) return false; if (c <= 0x0481) return true;
+ if (c < 0x0490) return false; if (c <= 0x04C4) return true;
+ if (c < 0x04C7) return false; if (c <= 0x04C8) return true;
+ if (c < 0x04CB) return false; if (c <= 0x04CC) return true;
+ if (c < 0x04D0) return false; if (c <= 0x04EB) return true;
+ if (c < 0x04EE) return false; if (c <= 0x04F5) return true;
+ if (c < 0x04F8) return false; if (c <= 0x04F9) return true;
+ if (c < 0x0531) return false; if (c <= 0x0556) return true;
+ if (c == 0x0559) return true;
+ if (c < 0x0561) return false; if (c <= 0x0586) return true;
+ if (c < 0x05D0) return false; if (c <= 0x05EA) return true;
+ if (c < 0x05F0) return false; if (c <= 0x05F2) return true;
+ if (c < 0x0621) return false; if (c <= 0x063A) return true;
+ if (c < 0x0641) return false; if (c <= 0x064A) return true;
+ if (c < 0x0671) return false; if (c <= 0x06B7) return true;
+ if (c < 0x06BA) return false; if (c <= 0x06BE) return true;
+ if (c < 0x06C0) return false; if (c <= 0x06CE) return true;
+ if (c < 0x06D0) return false; if (c <= 0x06D3) return true;
+ if (c == 0x06D5) return true;
+ if (c < 0x06E5) return false; if (c <= 0x06E6) return true;
+ if (c < 0x0905) return false; if (c <= 0x0939) return true;
+ if (c == 0x093D) return true;
+ if (c < 0x0958) return false; if (c <= 0x0961) return true;
+ if (c < 0x0985) return false; if (c <= 0x098C) return true;
+ if (c < 0x098F) return false; if (c <= 0x0990) return true;
+ if (c < 0x0993) return false; if (c <= 0x09A8) return true;
+ if (c < 0x09AA) return false; if (c <= 0x09B0) return true;
+ if (c == 0x09B2) return true;
+ if (c < 0x09B6) return false; if (c <= 0x09B9) return true;
+ if (c < 0x09DC) return false; if (c <= 0x09DD) return true;
+ if (c < 0x09DF) return false; if (c <= 0x09E1) return true;
+ if (c < 0x09F0) return false; if (c <= 0x09F1) return true;
+ if (c < 0x0A05) return false; if (c <= 0x0A0A) return true;
+ if (c < 0x0A0F) return false; if (c <= 0x0A10) return true;
+ if (c < 0x0A13) return false; if (c <= 0x0A28) return true;
+ if (c < 0x0A2A) return false; if (c <= 0x0A30) return true;
+ if (c < 0x0A32) return false; if (c <= 0x0A33) return true;
+ if (c < 0x0A35) return false; if (c <= 0x0A36) return true;
+ if (c < 0x0A38) return false; if (c <= 0x0A39) return true;
+ if (c < 0x0A59) return false; if (c <= 0x0A5C) return true;
+ if (c == 0x0A5E) return true;
+ if (c < 0x0A72) return false; if (c <= 0x0A74) return true;
+ if (c < 0x0A85) return false; if (c <= 0x0A8B) return true;
+ if (c == 0x0A8D) return true;
+ if (c < 0x0A8F) return false; if (c <= 0x0A91) return true;
+ if (c < 0x0A93) return false; if (c <= 0x0AA8) return true;
+ if (c < 0x0AAA) return false; if (c <= 0x0AB0) return true;
+ if (c < 0x0AB2) return false; if (c <= 0x0AB3) return true;
+ if (c < 0x0AB5) return false; if (c <= 0x0AB9) return true;
+ if (c == 0x0ABD) return true;
+ if (c == 0x0AE0) return true;
+ if (c < 0x0B05) return false; if (c <= 0x0B0C) return true;
+ if (c < 0x0B0F) return false; if (c <= 0x0B10) return true;
+ if (c < 0x0B13) return false; if (c <= 0x0B28) return true;
+ if (c < 0x0B2A) return false; if (c <= 0x0B30) return true;
+ if (c < 0x0B32) return false; if (c <= 0x0B33) return true;
+ if (c < 0x0B36) return false; if (c <= 0x0B39) return true;
+ if (c == 0x0B3D) return true;
+ if (c < 0x0B5C) return false; if (c <= 0x0B5D) return true;
+ if (c < 0x0B5F) return false; if (c <= 0x0B61) return true;
+ if (c < 0x0B85) return false; if (c <= 0x0B8A) return true;
+ if (c < 0x0B8E) return false; if (c <= 0x0B90) return true;
+ if (c < 0x0B92) return false; if (c <= 0x0B95) return true;
+ if (c < 0x0B99) return false; if (c <= 0x0B9A) return true;
+ if (c == 0x0B9C) return true;
+ if (c < 0x0B9E) return false; if (c <= 0x0B9F) return true;
+ if (c < 0x0BA3) return false; if (c <= 0x0BA4) return true;
+ if (c < 0x0BA8) return false; if (c <= 0x0BAA) return true;
+ if (c < 0x0BAE) return false; if (c <= 0x0BB5) return true;
+ if (c < 0x0BB7) return false; if (c <= 0x0BB9) return true;
+ if (c < 0x0C05) return false; if (c <= 0x0C0C) return true;
+ if (c < 0x0C0E) return false; if (c <= 0x0C10) return true;
+ if (c < 0x0C12) return false; if (c <= 0x0C28) return true;
+ if (c < 0x0C2A) return false; if (c <= 0x0C33) return true;
+ if (c < 0x0C35) return false; if (c <= 0x0C39) return true;
+ if (c < 0x0C60) return false; if (c <= 0x0C61) return true;
+ if (c < 0x0C85) return false; if (c <= 0x0C8C) return true;
+ if (c < 0x0C8E) return false; if (c <= 0x0C90) return true;
+ if (c < 0x0C92) return false; if (c <= 0x0CA8) return true;
+ if (c < 0x0CAA) return false; if (c <= 0x0CB3) return true;
+ if (c < 0x0CB5) return false; if (c <= 0x0CB9) return true;
+ if (c == 0x0CDE) return true;
+ if (c < 0x0CE0) return false; if (c <= 0x0CE1) return true;
+ if (c < 0x0D05) return false; if (c <= 0x0D0C) return true;
+ if (c < 0x0D0E) return false; if (c <= 0x0D10) return true;
+ if (c < 0x0D12) return false; if (c <= 0x0D28) return true;
+ if (c < 0x0D2A) return false; if (c <= 0x0D39) return true;
+ if (c < 0x0D60) return false; if (c <= 0x0D61) return true;
+ if (c < 0x0E01) return false; if (c <= 0x0E2E) return true;
+ if (c == 0x0E30) return true;
+ if (c < 0x0E32) return false; if (c <= 0x0E33) return true;
+ if (c < 0x0E40) return false; if (c <= 0x0E45) return true;
+ if (c < 0x0E81) return false; if (c <= 0x0E82) return true;
+ if (c == 0x0E84) return true;
+ if (c < 0x0E87) return false; if (c <= 0x0E88) return true;
+ if (c == 0x0E8A) return true;
+ if (c == 0x0E8D) return true;
+ if (c < 0x0E94) return false; if (c <= 0x0E97) return true;
+ if (c < 0x0E99) return false; if (c <= 0x0E9F) return true;
+ if (c < 0x0EA1) return false; if (c <= 0x0EA3) return true;
+ if (c == 0x0EA5) return true;
+ if (c == 0x0EA7) return true;
+ if (c < 0x0EAA) return false; if (c <= 0x0EAB) return true;
+ if (c < 0x0EAD) return false; if (c <= 0x0EAE) return true;
+ if (c == 0x0EB0) return true;
+ if (c < 0x0EB2) return false; if (c <= 0x0EB3) return true;
+ if (c == 0x0EBD) return true;
+ if (c < 0x0EC0) return false; if (c <= 0x0EC4) return true;
+ if (c < 0x0F40) return false; if (c <= 0x0F47) return true;
+ if (c < 0x0F49) return false; if (c <= 0x0F69) return true;
+ if (c < 0x10A0) return false; if (c <= 0x10C5) return true;
+ if (c < 0x10D0) return false; if (c <= 0x10F6) return true;
+ if (c == 0x1100) return true;
+ if (c < 0x1102) return false; if (c <= 0x1103) return true;
+ if (c < 0x1105) return false; if (c <= 0x1107) return true;
+ if (c == 0x1109) return true;
+ if (c < 0x110B) return false; if (c <= 0x110C) return true;
+ if (c < 0x110E) return false; if (c <= 0x1112) return true;
+ if (c == 0x113C) return true;
+ if (c == 0x113E) return true;
+ if (c == 0x1140) return true;
+ if (c == 0x114C) return true;
+ if (c == 0x114E) return true;
+ if (c == 0x1150) return true;
+ if (c < 0x1154) return false; if (c <= 0x1155) return true;
+ if (c == 0x1159) return true;
+ if (c < 0x115F) return false; if (c <= 0x1161) return true;
+ if (c == 0x1163) return true;
+ if (c == 0x1165) return true;
+ if (c == 0x1167) return true;
+ if (c == 0x1169) return true;
+ if (c < 0x116D) return false; if (c <= 0x116E) return true;
+ if (c < 0x1172) return false; if (c <= 0x1173) return true;
+ if (c == 0x1175) return true;
+ if (c == 0x119E) return true;
+ if (c == 0x11A8) return true;
+ if (c == 0x11AB) return true;
+ if (c < 0x11AE) return false; if (c <= 0x11AF) return true;
+ if (c < 0x11B7) return false; if (c <= 0x11B8) return true;
+ if (c == 0x11BA) return true;
+ if (c < 0x11BC) return false; if (c <= 0x11C2) return true;
+ if (c == 0x11EB) return true;
+ if (c == 0x11F0) return true;
+ if (c == 0x11F9) return true;
+ if (c < 0x1E00) return false; if (c <= 0x1E9B) return true;
+ if (c < 0x1EA0) return false; if (c <= 0x1EF9) return true;
+ if (c < 0x1F00) return false; if (c <= 0x1F15) return true;
+ if (c < 0x1F18) return false; if (c <= 0x1F1D) return true;
+ if (c < 0x1F20) return false; if (c <= 0x1F45) return true;
+ if (c < 0x1F48) return false; if (c <= 0x1F4D) return true;
+ if (c < 0x1F50) return false; if (c <= 0x1F57) return true;
+ if (c == 0x1F59) return true;
+ if (c == 0x1F5B) return true;
+ if (c == 0x1F5D) return true;
+ if (c < 0x1F5F) return false; if (c <= 0x1F7D) return true;
+ if (c < 0x1F80) return false; if (c <= 0x1FB4) return true;
+ if (c < 0x1FB6) return false; if (c <= 0x1FBC) return true;
+ if (c == 0x1FBE) return true;
+ if (c < 0x1FC2) return false; if (c <= 0x1FC4) return true;
+ if (c < 0x1FC6) return false; if (c <= 0x1FCC) return true;
+ if (c < 0x1FD0) return false; if (c <= 0x1FD3) return true;
+ if (c < 0x1FD6) return false; if (c <= 0x1FDB) return true;
+ if (c < 0x1FE0) return false; if (c <= 0x1FEC) return true;
+ if (c < 0x1FF2) return false; if (c <= 0x1FF4) return true;
+ if (c < 0x1FF6) return false; if (c <= 0x1FFC) return true;
+ if (c == 0x2126) return true;
+ if (c < 0x212A) return false; if (c <= 0x212B) return true;
+ if (c == 0x212E) return true;
+ if (c < 0x2180) return false; if (c <= 0x2182) return true;
+ if (c == 0x3007) return true; // ideographic
+ if (c < 0x3021) return false; if (c <= 0x3029) return true; // ideographic
+ if (c < 0x3041) return false; if (c <= 0x3094) return true;
+ if (c < 0x30A1) return false; if (c <= 0x30FA) return true;
+ if (c < 0x3105) return false; if (c <= 0x312C) return true;
+ if (c < 0x4E00) return false; if (c <= 0x9FA5) return true; // ideographic
+ if (c < 0xAC00) return false; if (c <= 0xD7A3) return true;
+
+ return false;
+
+ }
+
+ /**
+ * This is a utility function for determining whether a specified character
+ * is a combining character according to production 87
+ * of the XML 1.0 specification.
+ * <p>
+ * The body is a strictly ascending range table: each range line first
+ * rejects values below the range start and then accepts values up to
+ * the range end, so the statement order must not be changed.
+ *
+ * @param c <code>char</code> to check.
+ * @return <code>boolean</code> true if it's a combining character,
+ * false otherwise.
+ */
+ public static boolean isXMLCombiningChar(final char c) {
+ // CombiningChar ranges, lowest first; order is significant.
+ if (c < 0x0300) return false; if (c <= 0x0345) return true;
+ if (c < 0x0360) return false; if (c <= 0x0361) return true;
+ if (c < 0x0483) return false; if (c <= 0x0486) return true;
+ if (c < 0x0591) return false; if (c <= 0x05A1) return true;
+
+ if (c < 0x05A3) return false; if (c <= 0x05B9) return true;
+ if (c < 0x05BB) return false; if (c <= 0x05BD) return true;
+ if (c == 0x05BF) return true;
+ if (c < 0x05C1) return false; if (c <= 0x05C2) return true;
+
+ if (c == 0x05C4) return true;
+ if (c < 0x064B) return false; if (c <= 0x0652) return true;
+ if (c == 0x0670) return true;
+ if (c < 0x06D6) return false; if (c <= 0x06DC) return true;
+
+ if (c < 0x06DD) return false; if (c <= 0x06DF) return true;
+ if (c < 0x06E0) return false; if (c <= 0x06E4) return true;
+ if (c < 0x06E7) return false; if (c <= 0x06E8) return true;
+
+ if (c < 0x06EA) return false; if (c <= 0x06ED) return true;
+ if (c < 0x0901) return false; if (c <= 0x0903) return true;
+ if (c == 0x093C) return true;
+ if (c < 0x093E) return false; if (c <= 0x094C) return true;
+
+ if (c == 0x094D) return true;
+ if (c < 0x0951) return false; if (c <= 0x0954) return true;
+ if (c < 0x0962) return false; if (c <= 0x0963) return true;
+ if (c < 0x0981) return false; if (c <= 0x0983) return true;
+
+ if (c == 0x09BC) return true;
+ if (c == 0x09BE) return true;
+ if (c == 0x09BF) return true;
+ if (c < 0x09C0) return false; if (c <= 0x09C4) return true;
+ if (c < 0x09C7) return false; if (c <= 0x09C8) return true;
+
+ if (c < 0x09CB) return false; if (c <= 0x09CD) return true;
+ if (c == 0x09D7) return true;
+ if (c < 0x09E2) return false; if (c <= 0x09E3) return true;
+ if (c == 0x0A02) return true;
+ if (c == 0x0A3C) return true;
+
+ if (c == 0x0A3E) return true;
+ if (c == 0x0A3F) return true;
+ if (c < 0x0A40) return false; if (c <= 0x0A42) return true;
+ if (c < 0x0A47) return false; if (c <= 0x0A48) return true;
+
+ if (c < 0x0A4B) return false; if (c <= 0x0A4D) return true;
+ if (c < 0x0A70) return false; if (c <= 0x0A71) return true;
+ if (c < 0x0A81) return false; if (c <= 0x0A83) return true;
+ if (c == 0x0ABC) return true;
+
+ if (c < 0x0ABE) return false; if (c <= 0x0AC5) return true;
+ if (c < 0x0AC7) return false; if (c <= 0x0AC9) return true;
+ if (c < 0x0ACB) return false; if (c <= 0x0ACD) return true;
+
+ if (c < 0x0B01) return false; if (c <= 0x0B03) return true;
+ if (c == 0x0B3C) return true;
+ if (c < 0x0B3E) return false; if (c <= 0x0B43) return true;
+ if (c < 0x0B47) return false; if (c <= 0x0B48) return true;
+
+ if (c < 0x0B4B) return false; if (c <= 0x0B4D) return true;
+ if (c < 0x0B56) return false; if (c <= 0x0B57) return true;
+ if (c < 0x0B82) return false; if (c <= 0x0B83) return true;
+
+ if (c < 0x0BBE) return false; if (c <= 0x0BC2) return true;
+ if (c < 0x0BC6) return false; if (c <= 0x0BC8) return true;
+ if (c < 0x0BCA) return false; if (c <= 0x0BCD) return true;
+ if (c == 0x0BD7) return true;
+
+ if (c < 0x0C01) return false; if (c <= 0x0C03) return true;
+ if (c < 0x0C3E) return false; if (c <= 0x0C44) return true;
+ if (c < 0x0C46) return false; if (c <= 0x0C48) return true;
+
+ if (c < 0x0C4A) return false; if (c <= 0x0C4D) return true;
+ if (c < 0x0C55) return false; if (c <= 0x0C56) return true;
+ if (c < 0x0C82) return false; if (c <= 0x0C83) return true;
+
+ if (c < 0x0CBE) return false; if (c <= 0x0CC4) return true;
+ if (c < 0x0CC6) return false; if (c <= 0x0CC8) return true;
+ if (c < 0x0CCA) return false; if (c <= 0x0CCD) return true;
+
+ if (c < 0x0CD5) return false; if (c <= 0x0CD6) return true;
+ if (c < 0x0D02) return false; if (c <= 0x0D03) return true;
+ if (c < 0x0D3E) return false; if (c <= 0x0D43) return true;
+
+ if (c < 0x0D46) return false; if (c <= 0x0D48) return true;
+ if (c < 0x0D4A) return false; if (c <= 0x0D4D) return true;
+ if (c == 0x0D57) return true;
+ if (c == 0x0E31) return true;
+
+ if (c < 0x0E34) return false; if (c <= 0x0E3A) return true;
+ if (c < 0x0E47) return false; if (c <= 0x0E4E) return true;
+ if (c == 0x0EB1) return true;
+ if (c < 0x0EB4) return false; if (c <= 0x0EB9) return true;
+
+ if (c < 0x0EBB) return false; if (c <= 0x0EBC) return true;
+ if (c < 0x0EC8) return false; if (c <= 0x0ECD) return true;
+ if (c < 0x0F18) return false; if (c <= 0x0F19) return true;
+ if (c == 0x0F35) return true;
+
+ if (c == 0x0F37) return true;
+ if (c == 0x0F39) return true;
+ if (c == 0x0F3E) return true;
+ if (c == 0x0F3F) return true;
+ if (c < 0x0F71) return false; if (c <= 0x0F84) return true;
+
+ if (c < 0x0F86) return false; if (c <= 0x0F8B) return true;
+ if (c < 0x0F90) return false; if (c <= 0x0F95) return true;
+ if (c == 0x0F97) return true;
+ if (c < 0x0F99) return false; if (c <= 0x0FAD) return true;
+
+ if (c < 0x0FB1) return false; if (c <= 0x0FB7) return true;
+ if (c == 0x0FB9) return true;
+ if (c < 0x20D0) return false; if (c <= 0x20DC) return true;
+ if (c == 0x20E1) return true;
+
+ if (c < 0x302A) return false; if (c <= 0x302F) return true;
+ if (c == 0x3099) return true;
+ if (c == 0x309A) return true;
+
+ return false;
+
+ }
+
+ /**
+ * This is a utility function for determining whether a specified
+ * character is an extender according to production 88 of the XML 1.0
+ * specification.
+ *
+ * @param c <code>char</code> to check.
+ * @return <code>String</code> true if it's an extender, false otherwise.
+ */
+ public static boolean isXMLExtender(final char c) {
+
+ if (c < 0x00B6) return false; // quick short circuit
+
+ // Extenders
+ if (c == 0x00B7) return true;
+ if (c == 0x02D0) return true;
+ if (c == 0x02D1) return true;
+ if (c == 0x0387) return true;
+ if (c == 0x0640) return true;
+ if (c == 0x0E46) return true;
+ if (c == 0x0EC6) return true;
+ if (c == 0x3005) return true;
+
+ if (c < 0x3031) return false; if (c <= 0x3035) return true;
+ if (c < 0x309D) return false; if (c <= 0x309E) return true;
+ if (c < 0x30FC) return false; if (c <= 0x30FE) return true;
+
+ return false;
+
+ }
+
+ /**
+  * Tests whether a character is a digit according to production 88 of
+  * the XML 1.0 specification.
+  *
+  * @param c <code>char</code> to check for XML digit compliance
+  * @return <code>boolean</code> true if it's a digit, false otherwise
+  */
+ public static boolean isXMLDigit(final char c) {
+     // Reject anything outside the span covered by the digit blocks.
+     if (c < 0x0030 || c > 0x0F29) {
+         return false;
+     }
+
+     // Inside the span, accept exactly the listed digit blocks. The
+     // lower bound of the first block is already established above.
+     return c <= 0x0039
+         || (c >= 0x0660 && c <= 0x0669)
+         || (c >= 0x06F0 && c <= 0x06F9)
+         || (c >= 0x0966 && c <= 0x096F)
+         || (c >= 0x09E6 && c <= 0x09EF)
+         || (c >= 0x0A66 && c <= 0x0A6F)
+         || (c >= 0x0AE6 && c <= 0x0AEF)
+         || (c >= 0x0B66 && c <= 0x0B6F)
+         || (c >= 0x0BE7 && c <= 0x0BEF)
+         || (c >= 0x0C66 && c <= 0x0C6F)
+         || (c >= 0x0CE6 && c <= 0x0CEF)
+         || (c >= 0x0D66 && c <= 0x0D6F)
+         || (c >= 0x0E50 && c <= 0x0E59)
+         || (c >= 0x0ED0 && c <= 0x0ED9)
+         || (c >= 0x0F20 && c <= 0x0F29);
+ }
+
+ /**
+  * Tests whether a character is XML whitespace according to production 3
+  * of the XML 1.0 specification: space, line feed, tab or carriage
+  * return.
+  *
+  * @param c <code>char</code> to check for XML whitespace compliance
+  * @return <code>boolean</code> true if it's a whitespace, false otherwise
+  */
+ public static boolean isXMLWhitespace(final char c) {
+     // A plain or-chain is kept deliberately: it was found to be
+     // faster than a switch, which implicitly widens the char to int.
+     return c == ' ' || c == '\n' || c == '\t' || c == '\r';
+ }
+
+ /**
+  * Tests whether every character of a String is XML whitespace
+  * according to production 3 of the XML 1.0 specification.
+  * <p>
+  * Each character is checked by delegating to
+  * {@link #isXMLWhitespace(char)}.
+  *
+  * @param value
+  *        The value to inspect
+  * @return true if all characters in the input value are all whitespace
+  *         (or the string is the empty-string).
+  * @throws NullPointerException if <code>value</code> is null.
+  * @since JDOM2
+  */
+ public static final boolean isAllXMLWhitespace(final String value) {
+     // Scan from the last character back to the first and stop at the
+     // first non-whitespace character found.
+     for (int pos = value.length() - 1; pos >= 0; pos--) {
+         if (!isXMLWhitespace(value.charAt(pos))) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+
+}
View
696 contrib/src/java/org/jdom2/contrib/verifier/VerifierBuilder.java
@@ -0,0 +1,696 @@
+/*--
+
+ Copyright (C) 2000-2012 Jason Hunter & Brett McLaughlin.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the disclaimer that follows
+ these conditions in the documentation and/or other materials
+ provided with the distribution.
+
+ 3. The name "JDOM" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact <request_AT_jdom_DOT_org>.
+
+ 4. Products derived from this software may not be called "JDOM", nor
+ may "JDOM" appear in their name, without prior written permission
+ from the JDOM Project Management <request_AT_jdom_DOT_org>.
+
+ In addition, we request (but do not require) that you include in the
+ end-user documentation provided with the redistribution and/or in the
+ software itself an acknowledgement equivalent to the following:
+ "This product includes software developed by the
+ JDOM Project (http://www.jdom.org/)."
+ Alternatively, the acknowledgment may be graphical using the logos
+ available at http://www.jdom.org/images/logos.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the JDOM Project and was originally
+ created by Jason Hunter <jhunter_AT_jdom_DOT_org> and
+ Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information
+ on the JDOM Project, please see <http://www.jdom.org/>.
+
+ */
+
+package org.jdom2.contrib.verifier;
+
+import org.jdom2.Verifier;
+
+/**
+ * A utility class to build the data component of the main
+ * org.jdom2.Verifier class. This class contains all the character
+ * identification/classification routines.
+ * <p>
+ * This class is based on the content of the main Verifier.java class
+ * prior to this optimization.
+ *
+ * @author Brett McLaughlin
+ * @author Elliotte Rusty Harold
+ * @author Jason Hunter
+ * @author Bradley S. Huffman
+ * @author Rolf Lear
+ */
+final public class VerifierBuilder {
+
+ /**
+  * Private constructor: this class only has static members, so
+  * instantiation is prevented.
+  */
+ private VerifierBuilder() {
+     // intentionally empty
+ }
+
+
+ private static final int charcnt = Character.MAX_VALUE + 1; // number of distinct char values (0x0000-0xFFFF)
+ private static final byte maskxmlcharacter = 1 << 0; // bit set by main when isXMLCharacter is true
+ private static final byte maskxmlletter = 1 << 1; // bit set by main when isXMLLetter is true
+ private static final byte maskxmlstart = 1 << 2; // bit set by main when isXMLNameStartCharacter is true
+ private static final byte maskxmlnamecharacter = 1 << 3; // bit set by main when isXMLNameCharacter is true
+ private static final byte maskxmldigit = 1 << 4; // bit set by main when isXMLDigit is true
+ private static final byte maskxmlcombining = 1 << 5; // bit set by main when isXMLCombiningChar is true
+ private static final byte maskuricharacter = 1 << 6; // bit set by main when isURICharacter is true
+
+ /**
+  * Generates the run-length encoded lookup tables for the Verifier.
+  * <p>
+  * Every char value is classified into a bitmask byte, the resulting
+  * array is run-length encoded, the encoding is checked by expanding it
+  * again, and finally the two tables are printed to standard output as
+  * Java array declarations.
+  *
+  * @param args ignored.
+  */
+ @SuppressWarnings("javadoc")
+ public static void main(String[] args) {
+     // Phase 1: compute the classification bitmask of every char value.
+     final byte[] flags = new byte[charcnt];
+     for (int code = 0; code < charcnt; code++) {
+         final char ch = (char)code;
+         byte bits = 0;
+
+         if (isXMLCharacter(code)) {
+             bits |= maskxmlcharacter;
+         }
+         if (isXMLLetter(ch)) {
+             bits |= maskxmlletter;
+         }
+         if (isXMLNameCharacter(ch)) {
+             bits |= maskxmlnamecharacter;
+         }
+         if (isXMLNameStartCharacter(ch)) {
+             bits |= maskxmlstart;
+         }
+         if (isXMLDigit(ch)) {
+             bits |= maskxmldigit;
+         }
+         if (isXMLCombiningChar(ch)) {
+             bits |= maskxmlcombining;
+         }
+         if (isURICharacter(ch)) {
+             bits |= maskuricharacter;
+         }
+         flags[code] = bits;
+     }
+
+     // Phase 2: run-length encode the flags as (value, length) pairs.
+     final byte[] runValues = new byte[flags.length];
+     final int[] runLengths = new int[flags.length];
+     int runCount = 0;
+     byte current = flags[0];
+     int length = 1;
+     for (int pos = 1; pos < flags.length; pos++) {
+         if (flags[pos] != current) {
+             // The value changed: close the current run, start a new one.
+             runValues[runCount] = current;
+             runLengths[runCount] = length;
+             runCount++;
+             current = flags[pos];
+             length = 0;
+         }
+         length++;
+     }
+     // Close the final run.
+     runValues[runCount] = current;
+     runLengths[runCount] = length;
+     runCount++;
+
+     // Phase 3: expand the runs again and compare with the flags.
+     int cursor = 0;
+     for (int run = 0; run < runCount; run++) {
+         final byte expected = runValues[run];
+         for (int k = 0; k < runLengths[run]; k++) {
+             if (flags[cursor] != expected) {
+                 throw new IllegalStateException(String.format(
+                         "Failed to calculate byte 0x%02x at index %d. Calculated 0x%02x instead.",
+                         flags[cursor], cursor, expected));
+             }
+             cursor++;
+         }
+     }
+
+     System.out.println("There are " + runCount + " transitions.");
+
+     // Phase 4: print both tables as Java source, eight entries per row.
+     final StringBuilder valueSource = new StringBuilder();
+     final StringBuilder lengthSource = new StringBuilder();
+
+     valueSource.append("private static final byte[] VALCONST = new byte[] {");
+     lengthSource.append("private static final int [] LENCONST = new int [] {");
+
+     for (int run = 0; run < runCount; run++) {
+         if (run > 0) {
+             valueSource.append(", ");
+             lengthSource.append(", ");
+         }
+         if ((run % 8) == 0) {
+             valueSource.append("\n ");
+             lengthSource.append("\n ");
+         }
+         valueSource.append(String.format("0x%02x", runValues[run]));
+         lengthSource.append(String.format("%5d", runLengths[run]));
+     }
+
+     valueSource.append("};\n");
+     lengthSource.append("};\n");
+
+     System.out.println(valueSource.toString());
+     System.out.println(lengthSource.toString());
+ }
+
+
+ /**
+  * Tests whether a character is an ASCII hexadecimal digit as used by
+  * RFC 2396 escapes; that is, one of the ASCII characters 0-9, a-f,
+  * or A-F.
+  *
+  * @param c to check for hex digit.
+  * @return true if it's allowed, false otherwise.
+  */
+ public static boolean isHexDigit(final char c) {
+     final boolean decimalDigit = c >= '0' && c <= '9';
+     final boolean upperHexLetter = c >= 'A' && c <= 'F';
+     final boolean lowerHexLetter = c >= 'a' && c <= 'f';
+     return decimalDigit || upperHexLetter || lowerHexLetter;
+ }
+
+ /**
+  * Tests whether a character is accepted in a URI reference by this
+  * verifier (the check is modelled on RFC 2396).
+  * <p>
+  * Accepted are the ASCII letters and digits plus the punctuation
+  * characters <code>/ - . ? : @ &amp; = + $ , % _ ! ~ * ' ( )</code>.
+  *
+  * @param c <code>char</code> to check for URI reference compliance.
+  * @return true if it's allowed, false otherwise.
+  */
+ public static boolean isURICharacter(final char c) {
+     final boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+     final boolean asciiDigit = c >= '0' && c <= '9';
+     if (asciiLetter || asciiDigit) {
+         return true;
+     }
+     // The complete list of punctuation characters that are accepted.
+     return "/-.?:@&=+$,%_!~*'()".indexOf(c) >= 0;
+ }
+
+ /**
+  * Tests whether a code point is a character according to production 2
+  * of the XML 1.0 specification.
+  * <p>
+  * Accepted values are tab, line feed and carriage return, plus the
+  * inclusive ranges 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
+  * </p>
+  *
+  * @param c the code point to check for XML compliance
+  * @return <code>boolean</code> true if it's a character,
+  *         false otherwise
+  */
+ public static boolean isXMLCharacter(final int c) {
+     // The three permitted control characters sit below 0x20.
+     if (c == '\t' || c == '\n' || c == '\r') {
+         return true;
+     }
+
+     final boolean belowSurrogates = 0x20 <= c && c <= 0xD7FF;
+     final boolean aboveSurrogates = 0xE000 <= c && c <= 0xFFFD;
+     final boolean supplementary = 0x10000 <= c && c <= 0x10FFFF;
+
+     return belowSurrogates || aboveSurrogates || supplementary;
+ }
+
+
+ /**
+  * Tests whether a character is a name character according to
+  * production 4 of the XML 1.0 specification.
+  * <p>
+  * A character qualifies if it is a letter, a digit, one of
+  * <code>. - _ :</code>, a combining character or an extender. The
+  * checks run in that order and stop at the first match.
+  * </p>
+  *
+  * @param c <code>char</code> to check for XML name compliance.
+  * @return <code>boolean</code> true if it's a name character,
+  *         false otherwise.
+  */
+ public static boolean isXMLNameCharacter(final char c) {
+     if (isXMLLetter(c) || isXMLDigit(c)) {
+         return true;
+     }
+     if (c == '.' || c == '-' || c == '_' || c == ':') {
+         return true;
+     }
+     return isXMLCombiningChar(c) || Verifier.isXMLExtender(c);
+ }
+
+ /**
+  * Tests whether a character may start a name according to production 5
+  * of the XML 1.0 specification: a letter, an underscore or a colon.
+  * <p>
+  * This production allows names to begin with a colon, which the
+  * Namespaces in XML Recommendation disallows.
+  * </p>
+  *
+  * @param c <code>char</code> to check for XML name start compliance.
+  * @return <code>boolean</code> true if it's a name start character,
+  *         false otherwise.
+  */
+ public static boolean isXMLNameStartCharacter(final char c) {
+     if (isXMLLetter(c)) {
+         return true;
+     }
+     return c == '_' || c == ':';
+ }
+
+ /**
+ * This is a utility function for determining whether a specified character
+ * is a letter according to production 84 of the XML 1.0 specification.
+ * <p>
+ * The method is a single ascending cascade of range tests. For each range
+ * the character is first compared against the lower bound (returning false
+ * if it falls in the gap below it) and then against the upper bound
+ * (returning true if it lies inside the range). Isolated code points are
+ * matched with a plain equality test and rely on the next range's
+ * lower-bound check to reject the gap that follows them. Characters above
+ * 0xD7A3, the end of the last range, are rejected.
+ * </p>
+ *
+ * @param c <code>char</code> to check for XML name compliance.
+ * @return <code>boolean</code> true if it's a letter, false otherwise.
+ */
+ public static boolean isXMLLetter(final char c) {
+ // Order matters: the ranges are tested from lowest to highest value, so a
+ // character is accepted or rejected as soon as the cascade reaches its
+ // value and no higher range is examined. The first line accepts the same
+ // characters as: if (c >= 0x0041 && c <= 0x005A) return true;
+
+ if (c < 0x0041) return false; if (c <= 0x005a) return true; // 'A'-'Z'
+ if (c < 0x0061) return false; if (c <= 0x007A) return true; // 'a'-'z'
+ if (c < 0x00C0) return false; if (c <= 0x00D6) return true;
+ if (c < 0x00D8) return false; if (c <= 0x00F6) return true;
+ if (c < 0x00F8) return false; if (c <= 0x00FF) return true;
+ if (c < 0x0100) return false; if (c <= 0x0131) return true;
+ if (c < 0x0134) return false; if (c <= 0x013E) return true;
+ if (c < 0x0141) return false; if (c <= 0x0148) return true;
+ if (c < 0x014A) return false; if (c <= 0x017E) return true;
+ if (c < 0x0180) return false; if (c <= 0x01C3) return true;
+ if (c < 0x01CD) return false; if (c <= 0x01F0) return true;
+ if (c < 0x01F4) return false; if (c <= 0x01F5) return true;
+ if (c < 0x01FA) return false; if (c <= 0x0217) return true;
+ if (c < 0x0250) return false; if (c <= 0x02A8) return true;
+ if (c < 0x02BB) return false; if (c <= 0x02C1) return true;
+ if (c == 0x0386) return true;
+ if (c < 0x0388) return false; if (c <= 0x038A) return true;
+ if (c == 0x038C) return true;
+ if (c < 0x038E) return false; if (c <= 0x03A1) return true;
+ if (c < 0x03A3) return false; if (c <= 0x03CE) return true;
+ if (c < 0x03D0) return false; if (c <= 0x03D6) return true;
+ if (c == 0x03DA) return true;
+ if (c == 0x03DC) return true;
+ if (c == 0x03DE) return true;
+ if (c == 0x03E0) return true;
+ if (c < 0x03E2) return false; if (c <= 0x03F3) return true;
+ if (c < 0x0401) return false; if (c <= 0x040C) return true;
+ if (c < 0x040E) return false; if (c <= 0x044F) return true;
+ if (c < 0x0451) return false; if (c <= 0x045C) return true;
+ if (c < 0x045E) return false; if (c <= 0x0481) return true;
+ if (c < 0x0490) return false; if (c <= 0x04C4) return true;
+ if (c < 0x04C7) return false; if (c <= 0x04C8) return true;
+ if (c < 0x04CB) return false; if (c <= 0x04CC) return true;
+ if (c < 0x04D0) return false; if (c <= 0x04EB) return true;
+ if (c < 0x04EE) return false; if (c <= 0x04F5) return true;
+ if (c < 0x04F8) return false; if (c <= 0x04F9) return true;
+ if (c < 0x0531) return false; if (c <= 0x0556) return true;
+ if (c == 0x0559) return true;
+ if (c < 0x0561) return false; if (c <= 0x0586) return true;
+ if (c < 0x05D0) return false; if (c <= 0x05EA) return true;
+ if (c < 0x05F0) return false; if (c <= 0x05F2) return true;
+ if (c < 0x0621) return false; if (c <= 0x063A) return true;
+ if (c < 0x0641) return false; if (c <= 0x064A) return true;
+ if (c < 0x0671) return false; if (c <= 0x06B7) return true;
+ if (c < 0x06BA) return false; if (c <= 0x06BE) return true;
+ if (c < 0x06C0) return false; if (c <= 0x06CE) return true;
+ if (c < 0x06D0) return false; if (c <= 0x06D3) return true;
+ if (c == 0x06D5) return true;
+ if (c < 0x06E5) return false; if (c <= 0x06E6) return true;
+ if (c < 0x0905) return false; if (c <= 0x0939) return true;
+ if (c == 0x093D) return true;
+ if (c < 0x0958) return false; if (c <= 0x0961) return true;
+ if (c < 0x0985) return false; if (c <= 0x098C) return true;
+ if (c < 0x098F) return false; if (c <= 0x0990) return true;
+ if (c < 0x0993) return false; if (c <= 0x09A8) return true;
+ if (c < 0x09AA) return false; if (c <= 0x09B0) return true;
+ if (c == 0x09B2) return true;
+ if (c < 0x09B6) return false; if (c <= 0x09B9) return true;
+ if (c < 0x09DC) return false; if (c <= 0x09DD) return true;
+ if (c < 0x09DF) return false; if (c <= 0x09E1) return true;
+ if (c < 0x09F0) return false; if (c <= 0x09F1) return true;
+ if (c < 0x0A05) return false; if (c <= 0x0A0A) return true;
+ if (c < 0x0A0F) return false; if (c <= 0x0A10) return true;
+ if (c < 0x0A13) return false; if (c <= 0x0A28) return true;
+ if (c < 0x0A2A) return false; if (c <= 0x0A30) return true;
+ if (c < 0x0A32) return false; if (c <= 0x0A33) return true;
+ if (c < 0x0A35) return false; if (c <= 0x0A36) return true;
+ if (c < 0x0A38) return false; if (c <= 0x0A39) return true;
+ if (c < 0x0A59) return false; if (c <= 0x0A5C) return true;
+ if (c == 0x0A5E) return true;
+ if (c < 0x0A72) return false; if (c <= 0x0A74) return true;
+ if (c < 0x0A85) return false; if (c <= 0x0A8B) return true;
+ if (c == 0x0A8D) return true;
+ if (c < 0x0A8F) return false; if (c <= 0x0A91) return true;
+ if (c < 0x0A93) return false; if (c <= 0x0AA8) return true;
+ if (c < 0x0AAA) return false; if (c <= 0x0AB0) return true;
+ if (c < 0x0AB2) return false; if (c <= 0x0AB3) return true;
+ if (c < 0x0AB5) return false; if (c <= 0x0AB9) return true;
+ if (c == 0x0ABD) return true;
+ if (c == 0x0AE0) return true;
+ if (c < 0x0B05) return false; if (c <= 0x0B0C) return true;
+ if (c < 0x0B0F) return false; if (c <= 0x0B10) return true;
+ if (c < 0x0B13) return false; if (c <= 0x0B28) return true;
+ if (c < 0x0B2A) return false; if (c <= 0x0B30) return true;
+ if (c < 0x0B32) return false; if (c <= 0x0B33) return true;
+ if (c < 0x0B36) return false; if (c <= 0x0B39) return true;
+ if (c == 0x0B3D) return true;
+ if (c < 0x0B5C) return false; if (c <= 0x0B5D) return true;
+ if (c < 0x0B5F) return false; if (c <= 0x0B61) return true;
+ if (c < 0x0B85) return false; if (c <= 0x0B8A) return true;
+ if (c < 0x0B8E) return false; if (c <= 0x0B90) return true;
+ if (c < 0x0B92) return false; if (c <= 0x0B95) return true;
+ if (c < 0x0B99) return false; if (c <= 0x0B9A) return true;
+ if (c == 0x0B9C) return true;
+ if (c < 0x0B9E) return false; if (c <= 0x0B9F) return true;
+ if (c < 0x0BA3) return false; if (c <= 0x0BA4) return true;
+ if (c < 0x0BA8) return false; if (c <= 0x0BAA) return true;
+ if (c < 0x0BAE) return false; if (c <= 0x0BB5) return true;
+ if (c < 0x0BB7) return false; if (c <= 0x0BB9) return true;
+ if (c < 0x0C05) return false; if (c <= 0x0C0C) return true;
+ if (c < 0x0C0E) return false; if (c <= 0x0C10) return true;
+ if (c < 0x0C12) return false; if (c <= 0x0C28) return true;
+ if (c < 0x0C2A) return false; if (c <= 0x0C33) return true;
+ if (c < 0x0C35) return false; if (c <= 0x0C39) return true;
+ if (c < 0x0C60) return false; if (c <= 0x0C61) return true;
+ if (c < 0x0C85) return false; if (c <= 0x0C8C) return true;
+ if (c < 0x0C8E) return false; if (c <= 0x0C90) return true;
+ if (c < 0x0C92) return false; if (c <= 0x0CA8) return true;
+ if (c < 0x0CAA) return false; if (c <= 0x0CB3) return true;
+ if (c < 0x0CB5) return false; if (c <= 0x0CB9) return true;
+ if (c == 0x0CDE) return true;
+ if (c < 0x0CE0) return false; if (c <= 0x0CE1) return true;
+ if (c < 0x0D05) return false; if (c <= 0x0D0C) return true;
+ if (c < 0x0D0E) return false; if (c <= 0x0D10) return true;
+ if (c < 0x0D12) return false; if (c <= 0x0D28) return true;
+ if (c < 0x0D2A) return false; if (c <= 0x0D39) return true;
+ if (c < 0x0D60) return false; if (c <= 0x0D61) return true;
+ if (c < 0x0E01) return false; if (c <= 0x0E2E) return true;
+ if (c == 0x0E30) return true;
+ if (c < 0x0E32) return false; if (c <= 0x0E33) return true;
+ if (c < 0x0E40) return false; if (c <= 0x0E45) return true;
+ if (c < 0x0E81) return false; if (c <= 0x0E82) return true;
+ if (c == 0x0E84) return true;
+ if (c < 0x0E87) return false; if (c <= 0x0E88) return true;
+ if (c == 0x0E8A) return true;
+ if (c == 0x0E8D) return true;
+ if (c < 0x0E94) return false; if (c <= 0x0E97) return true;
+ if (c < 0x0E99) return false; if (c <= 0x0E9F) return true;
+ if (c < 0x0EA1) return false; if (c <= 0x0EA3) return true;
+ if (c == 0x0EA5) return true;
+ if (c == 0x0EA7) return true;
+ if (c < 0x0EAA) return false; if (c <= 0x0EAB) return true;
+ if (c < 0x0EAD) return false; if (c <= 0x0EAE) return true;
+ if (c == 0x0EB0) return true;
+ if (c < 0x0EB2) return false; if (c <= 0x0EB3) return true;
+ if (c == 0x0EBD) return true;
+ if (c < 0x0EC0) return false; if (c <= 0x0EC4) return true;
+ if (c < 0x0F40) return false; if (c <= 0x0F47) return true;
+ if (c < 0x0F49) return false; if (c <= 0x0F69) return true;
+ if (c < 0x10A0) return false; if (c <= 0x10C5) return true;
+ if (c < 0x10D0) return false; if (c <= 0x10F6) return true;
+ if (c == 0x1100) return true;
+ if (c < 0x1102) return false; if (c <= 0x1103) return true;
+ if (c < 0x1105) return false; if (c <= 0x1107) return true;
+ if (c == 0x1109) return true;
+ if (c < 0x110B) return false; if (c <= 0x110C) return true;
+ if (c < 0x110E) return false; if (c <= 0x1112) return true;
+ if (c == 0x113C) return true;
+ if (c == 0x113E) return true;
+ if (c == 0x1140) return true;
+ if (c == 0x114C) return true;
+ if (c == 0x114E) return true;
+ if (c == 0x1150) return true;
+ if (c < 0x1154) return false; if (c <= 0x1155) return true;
+ if (c == 0x1159) return true;
+ if (c < 0x115F) return false; if (c <= 0x1161) return true;
+ if (c == 0x1163) return true;
+ if (c == 0x1165) return true;
+ if (c == 0x1167) return true;
+ if (c == 0x1169) return true;
+ if (c < 0x116D) return false; if (c <= 0x116E) return true;
+ if (c < 0x1172) return false; if (c <= 0x1173) return true;
+ if (c == 0x1175) return true;
+ if (c == 0x119E) return true;
+ if (c == 0x11A8) return true;
+ if (c == 0x11AB) return true;
+ if (c < 0x11AE) return false; if (c <= 0x11AF) return true;
+ if (c < 0x11B7) return false; if (c <= 0x11B8) return true;
+ if (c == 0x11BA) return true;
+ if (c < 0x11BC) return false; if (c <= 0x11C2) return true;
+ if (c == 0x11EB) return true;
+ if (c == 0x11F0) return true;
+ if (c == 0x11F9) return true;
+ if (c < 0x1E00) return false; if (c <= 0x1E9B) return true;
+ if (c < 0x1EA0) return false; if (c <= 0x1EF9) return true;
+ if (c < 0x1F00) return false; if (c <= 0x1F15) return true;
+ if (c < 0x1F18) return false; if (c <= 0x1F1D) return true;
+ if (c < 0x1F20) return false; if (c <= 0x1F45) return true;
+ if (c < 0x1F48) return false; if (c <= 0x1F4D) return true;
+ if (c < 0x1F50) return false; if (c <= 0x1F57) return true;
+ if (c == 0x1F59) return true;
+ if (c == 0x1F5B) return true;
+ if (c == 0x1F5D) return true;
+ if (c < 0x1F5F) return false; if (c <= 0x1F7D) return true;
+ if (c < 0x1F80) return false; if (c <= 0x1FB4) return true;
+ if (c < 0x1FB6) return false; if (c <= 0x1FBC) return true;
+ if (c == 0x1FBE) return true;
+ if (c < 0x1FC2) return false; if (c <= 0x1FC4) return true;
+ if (c < 0x1FC6) return false; if (c <= 0x1FCC) return true;
+ if (c < 0x1FD0) return false; if (c <= 0x1FD3) return true;
+ if (c < 0x1FD6) return false; if (c <= 0x1FDB) return true;
+ if (c < 0x1FE0) return false; if (c <= 0x1FEC) return true;
+ if (c < 0x1FF2) return false; if (c <= 0x1FF4) return true;
+ if (c < 0x1FF6) return false; if (c <= 0x1FFC) return true;
+ if (c == 0x2126) return true;
+ if (c < 0x212A) return false; if (c <= 0x212B) return true;
+ if (c == 0x212E) return true;
+ if (c < 0x2180) return false; if (c <= 0x2182) return true;
+ if (c == 0x3007) return true; // ideographic
+ if (c < 0x3021) return false; if (c <= 0x3029) return true; // ideographic
+ if (c < 0x3041) return false; if (c <= 0x3094) return true;
+ if (c < 0x30A1) return false; if (c <= 0x30FA) return true;
+ if (c < 0x3105) return false; if (c <= 0x312C) return true;
+ if (c < 0x4E00) return false; if (c <= 0x9FA5) return true; // ideographic
+ if (c < 0xAC00) return false; if (c <= 0xD7A3) return true;
+
+ return false; // above 0xD7A3, the end of the last letter range
+
+ }
+
+ /**
+ * This is a utility function for determining whether a specified character
+ * is a combining character according to production 87
+ * of the XML 1.0 specification.
+ *
+ * @param c <code>char</code> to check.
+ * @return <code>boolean</code> true if it's a combining character,
+ * false otherwise.
+ */
+ public static boolean isXMLCombiningChar(final char c) {
+ // CombiningChar
+ if (c < 0x0300) return false; if (c <= 0x0345) return true;
+ if (c < 0x0360) return false; if (c <= 0x0361) return true;
+ if (c < 0x0483) return false; if (c <= 0x0486) return true;
+ if (c < 0x0591) return false; if (c <= 0x05A1) return true;
+
+ if (c < 0x05A3) return false; if (c <= 0x05B9) return true;
+ if (c < 0x05BB) return false; if (c <= 0x05BD) return true;
+ if (c == 0x05BF) return true;
+ if (c < 0x05C1) return false; if (c <= 0x05C2) return true;
+
+ if (c == 0x05C4) return true;
+ if (c < 0x064B) return false; if (c <= 0x0652) return true;
+ if (c == 0x0670) return true;
+ if (c < 0x06D6) return false; if (c <= 0x06DC) return true;
+
+ if (c < 0x06DD) return false; if (c <= 0x06DF) return true;
+ if (c < 0x06E0) return false; if (c <= 0x06E4) return true;
+ if (c < 0x06E7) return false; if (c <= 0x06E8) return true;
+
+ if (c < 0x06EA) return false; if (c <= 0x06ED) return true;
+ if (c < 0x0901) return false; if (c <= 0x0903) return true;
+ if (c == 0x093C) return true;
+ if (c < 0x093E) return false; if (c <= 0x094C) return true;
+
+ if (c == 0x094D) return true;
+ if (c < 0x0951) return false; if (c <= 0x0954) return true;
+ if (c < 0x0962) return false; if (c <= 0x0963) return true;
+ if (c < 0x0981) return false; if (c <= 0x0983) return true;
+
+ if (c == 0x09BC) return true;
+ if (c == 0x09BE) return true;
+ if (c == 0x09BF) return true;
+ if (c < 0x09C0) return false; if (c <= 0x09C4) return true;
+ if (c < 0x09C7) return false; if (c <= 0x09C8) return true;
+
+ if (c < 0x09CB) return false; if (c <= 0x09CD) return true;
+ if (c == 0x09D7) return true;
+ if (c < 0x09E2) return false; if (c <= 0x09E3) return true;
+ if (c == 0x0A02) return true;
+ if (c == 0x0A3C) return true;
+
+ if (c == 0x0A3E) return true;
+ if (c == 0x0A3F) return true;
+ if (c < 0x0A40) return false; if (c <= 0x0A42) return true;
+ if (c < 0x0A47) return false; if (c <= 0x0A48) return true;
+
+ if (c < 0x0A4B) return false; if (c <= 0x0A4D) return true;
+ if (c < 0x0A70) return false; if (c <= 0x0A71) return true;
+ if (c < 0x0A81) return false; if (c <= 0x0A83) return true;
+ if (c == 0x0ABC) return true;
+
+ if (c < 0x0ABE) return false; if (c <= 0x0AC5) return true;
+ if (c < 0x0AC7) return false; if (c <= 0x0AC9) return true;
+ if (c < 0x0ACB) return false; if (c <= 0x0ACD) return true;
+
+ if (c < 0x0B01) return false; if (c <= 0x0B03) return true;
+ if (c == 0x0B3C) return true;
+ if (c < 0x0B3E) return false; if (c <= 0x0B43) return true;
+ if (c < 0x0B47) return false; if (c <= 0x0B48) return true;
+
+ if (c < 0x0B4B) return false; if (c <= 0x0B4D) return true;
+ if (c < 0x0B56) return false; if (c <= 0x0B57) return true;
+ if (c < 0x0B82) return false; if (c <= 0x0B83) return true;
+
+ if (c < 0x0BBE) return false; if (c <= 0x0BC2) return true;
+ if (c < 0x0BC6) return false; if (c <= 0x0BC8) return true;
+ if (c < 0x0BCA) return false; if (c <= 0x0BCD) return true;
+ if (c == 0x0BD7) return true;
+
+ if (c < 0x0C01) return false; if (c <= 0x0C03) return true;
+ if (c < 0x0C3E) return false; if (c <= 0x0C44) return true;
+ if (c < 0x0C46) return false; if (c <= 0x0C48)