Skip to content
This repository
Browse code

Implement new StAX-based reader as a new JDOM input source.

Tatu Saloranta submitted a reader before, but this implementation is
significantly different from that because, in general, the JDOM input
mechanisms do not do any formatting/restructuring as that is done by the
outputters. Thus, this implementation is a very 'thin' one. This
implementation also adds support for sourcing from an XMLEventReader, as
well as supporting the DTD event.
With the code come the tests.
  • Loading branch information...
commit 59ee0d5384843d841c9a1f5fe8dd3b8fda2c8524 1 parent 44bb5f3
Rolf rolfl authored
423 core/src/java/org/jdom2/input/DTDParser.java
... ... @@ -0,0 +1,423 @@
  1 +/*--
  2 +
  3 + Copyright (C) 2000-2011 Jason Hunter & Brett McLaughlin.
  4 + All rights reserved.
  5 +
  6 + Redistribution and use in source and binary forms, with or without
  7 + modification, are permitted provided that the following conditions
  8 + are met:
  9 +
  10 + 1. Redistributions of source code must retain the above copyright
  11 + notice, this list of conditions, and the following disclaimer.
  12 +
  13 + 2. Redistributions in binary form must reproduce the above copyright
  14 + notice, this list of conditions, and the disclaimer that follows
  15 + these conditions in the documentation and/or other materials
  16 + provided with the distribution.
  17 +
  18 + 3. The name "JDOM" must not be used to endorse or promote products
  19 + derived from this software without prior written permission. For
  20 + written permission, please contact <request_AT_jdom_DOT_org>.
  21 +
  22 + 4. Products derived from this software may not be called "JDOM", nor
  23 + may "JDOM" appear in their name, without prior written permission
  24 + from the JDOM Project Management <request_AT_jdom_DOT_org>.
  25 +
  26 + In addition, we request (but do not require) that you include in the
  27 + end-user documentation provided with the redistribution and/or in the
  28 + software itself an acknowledgement equivalent to the following:
  29 + "This product includes software developed by the
  30 + JDOM Project (http://www.jdom.org/)."
  31 + Alternatively, the acknowledgment may be graphical using the logos
  32 + available at http://www.jdom.org/images/logos.
  33 +
  34 + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  35 + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  36 + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  37 + DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
  38 + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  41 + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  42 + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  43 + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  44 + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45 + SUCH DAMAGE.
  46 +
  47 + This software consists of voluntary contributions made by many
  48 + individuals on behalf of the JDOM Project and was originally
  49 + created by Jason Hunter <jhunter_AT_jdom_DOT_org> and
  50 + Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information
  51 + on the JDOM Project, please see <http://www.jdom.org/>.
  52 +
  53 + */
  54 +
  55 +package org.jdom2.input;
  56 +
  57 +import java.util.HashMap;
  58 +import java.util.regex.Matcher;
  59 +import java.util.regex.Pattern;
  60 +
  61 +import org.jdom2.DocType;
  62 +import org.jdom2.JDOMException;
  63 +import org.jdom2.JDOMFactory;
  64 +
  65 +/**
  66 + * In StAX Reader, the DocType is available as a single string.
  67 + * We need to harvest some data from it, as well as reformat it only to build
  68 + * a standardized DocType instance.
  69 + * <p>
  70 + * The assumption is that the DTD is valid.
  71 + * <p>
  72 + * We need to pull out 4 elements of data:
  73 + * <ol>
  74 + * <li>The root element name
  75 + * <li>The SystemID (if available)
  76 + * <li>The PublicID (if available)
  77 + * <li>The internal subset (if available)
  78 + * </ol>
  79 + *
  80 + * The internal-subset should be re-formatted to conform to the JDOM 'standard'
  81 + * where each declaration starts on a new line indented with 2 spaces. This
  82 + * 'standard' is defined by the way the JDOM formats the DTD declarations in the
  83 + * SAX parse process, which fires individual events for the content in the DTD.
  84 + * <p>
  85 + * We can do this all with a well-structured regular expression, which is
  86 + * actually simpler than trying to fish out all the components ourselves....
  87 + * <p>
  88 + *
  89 + * @author Rolf Lear
  90 + *
  91 + */
  92 +public class DTDParser {
  93 +
  94 + /*
  95 + * =======================================================================
  96 + *
  97 + * READ THIS...
  98 + *
  99 + *
  100 + * This code works by using a reg-ex to parse a valid DTD document.
  101 + * The pattern is complicated (not as complicated as an actual parser).
  102 + *
  103 + * Because the pattern is complicated this code creates a pattern 'database'
  104 + * and then 'pulls' patterns from the database to create the final regex. The
  105 + * database patterns are pulled to transform a pattern template in to a
  106 + * final regular expression. This template is called the 'meta-pattern'
  107 + *
  108 + * So, the pattern is not kept in its final form, but rather it is built
  109 + * up at class initialization time based on the meta-pattern, and the
  110 + * pattern database in the map.
  111 + *
  112 + * This is the final pattern: (broken over a few lines)
  113 + *
  114 + * [\s\r\n\t]*<!DOCTYPE[\s\r\n\t]+([^\s\r\n\t\[>]+)([\s\r\n\t]+
  115 + * ((SYSTEM[\s\r\n\t]+(('([^']*)')|("([^"]*)")))|
  116 + * (PUBLIC[\s\r\n\t]+(('([^']*)')|("([^"]*)"))([\s\r\n\t]+
  117 + * (('([^']*)')|("([^"]*)")))?)))?([\s\r\n\t]*\[(.*)\])?
  118 + * [\s\r\n\t]*>[\s\r\n\t]*
  119 + *
  120 + * You will agree that it's simpler to build the pattern than to read it....
  121 + *
  122 + * With the above in mind, you can easily follow the way the pattern is
  123 + * built as it is simply a repeating use of some of the base constructs.
  124 + * =======================================================================
  125 + */
  126 +
  127 + /**
  128 + * This is the meta-pattern.
  129 + * <p>
  130 + * <ul>
  131 + * <li>Where you see ' os ' there is optional space.
  132 + * <li>Where you see ' name ' there is the element name.
  133 + * <li>Where you see ' ms ' there is mandatory space.
  134 + * <li>Where you see ' id ' there is some quoted identifier.
  135 + * <li>Where you see ' internal ' there is the internal subset.
  136 + * </ul>
  137 + * Anything else will become part of the final regex.
  138 + * <p>
  139 + * Space ('&nbsp;') was chosen for the token delimiter because it
  140 + * makes the meta-pattern easy to read. There are a couple of places in
  141 + * this expression where there are two ' ' together, and it is critical
  142 + * that it does not change because there will be missed token matches then.
  143 + */
  144 + private static final String metapattern =
  145 + // The lead-in and the Element name
  146 + " os <!DOCTYPE ms ( name )" +
  147 + // The Public/System references, if any
  148 + "( ms ((SYSTEM ms id )|(PUBLIC ms id ( ms id )?)))?" +
  149 + // The Internal Subset, if any.
  150 + "( os \\[( internal )\\])?" +
  151 + // The lead-out.
  152 + " os > os ";
  153 +
  154 + /**
  155 + * This builds a substitution map containing the raw patterns for
  156 + * certain types of content we expect.
  157 + * @return The populated map.
  158 + */
  159 + private static final HashMap<String,String> populatePatterns() {
  160 + HashMap<String,String> p = new HashMap<String, String>();
  161 + // The name is important to understand. The assumption is that the
  162 + // doctype is valid, hence it is easier to search for what the name is
  163 + // not, and not what it is. The name will be terminated with either
  164 + // white-space, [ or >
  165 + p.put("name", "[^ \\n\\r\\t\\[>]+"); // element name.
  166 +
  167 + // whitespace: S ::= (#x20 | #x9 | #xD | #xA)+
  168 + p.put("ms", "[ \\n\\r\\t]+"); // mandatory whitespace.
  169 + p.put("os", "[ \\n\\r\\t]*"); // optional whitespace.
  170 +
  171 + // A quoted 'id'/"id" is anything except the quote
  172 + // we need to do parenthesis in this to get grouping to work.
  173 + // also need parenthesis to make the | or condition work
  174 + p.put("id", "(('([^']*)')|(\"([^\"]*)\"))"); // quoted id.
  175 +
  176 + // The internal subset is treated differently by the code, and the
  177 + // [ ] bracing around the internal subset is specified in the main regex
  178 + p.put("internal", ".*"); // internal subset.
  179 + return p;
  180 + }
  181 +
  182 + /**
  183 + * This method substitutes the simple tokens in the meta-pattern with
  184 + * the declared values in the map.
  185 + * @param map The map containing substitution tokens/patterns
  186 + * @param input The meta-pattern to do the substitutions on.
  187 + * @return The substituted pattern
  188 + */
  189 + private static final Pattern buildPattern(
  190 + HashMap<String,String> map, String input) {
  191 + // we are going to search for tokens. Each token is marked by a space.
  192 + // space was chosen because it makes the meta-pattern easy to read.
  193 + final Pattern search = Pattern.compile(" (\\w+) ");
  194 + final Matcher mat = search.matcher(input);
  195 + StringBuilder sb = new StringBuilder();
  196 + int pos = 0;
  197 + while (mat.find()) {
  198 + String rep = map.get(mat.group(1));
  199 +// we wrote this, it can't happen ;-). Live with a 'null' append.
  200 +// if (rep == null) {
  201 +// throw new IllegalArgumentException(
  202 +// "No definition of token '" + mat.group() + "'.");
  203 +// }
  204 + // can't use appendReplacement as we have to escape '\' chars.
  205 + // and Pattern.quote() does not help
  206 + // mat.appendReplacement(sb, rep);
  207 + sb.append(input.substring(pos, mat.start()));
  208 + sb.append(rep);
  209 + pos = mat.end();
  210 + }
  211 + sb.append(input.substring(pos));
  212 + return Pattern.compile(sb.toString(), Pattern.DOTALL);
  213 + }
  214 +
  215 + /**
  216 + * The following Pattern is the final result after
  217 + * parsing/tokenizing/substituting the meta-pattern.
  218 + */
  219 + private static final Pattern pattern =
  220 + buildPattern(populatePatterns(), metapattern);
  221 +
  222 + /*
  223 + * This pattern relies on pattern grouping to easily pull the values from
  224 + * the Matcher. Look at the following to get an idea of the groups that
  225 + * come from the reg-ex
  226 + *
  227 + * 0 -> <!DOCTYPE root SYSTEM "system" [internal] >
  228 + * 1 -> root
  229 + * 2 -> SYSTEM "system"
  230 + * 3 -> SYSTEM "system"
  231 + * 4 -> SYSTEM "system"
  232 + * 5 -> "system"
  233 + * 6 -> null
  234 + * 7 -> null
  235 + * 8 -> "system"
  236 + * 9 -> system
  237 + * 10 -> null
  238 + * 11 -> null
  239 + * 12 -> null
  240 + * 13 -> null
  241 + * 14 -> null
  242 + * 15 -> null
  243 + * 16 -> null
  244 + * 17 -> null
  245 + * 18 -> null
  246 + * 19 -> null
  247 + * 20 -> null
  248 + * 21 -> null
  249 + * 22 -> [internal]
  250 + * 23 -> internal
  251 + *
  252 + *
  253 + * 0 -> <!DOCTYPE root PUBLIC 'public' 'system' [internal] >
  254 + * 1 -> root
  255 + * 2 -> PUBLIC 'public' 'system'
  256 + * 3 -> PUBLIC 'public' 'system'
  257 + * 4 -> null
  258 + * 5 -> null
  259 + * 6 -> null
  260 + * 7 -> null
  261 + * 8 -> null
  262 + * 9 -> null
  263 + * 10 -> PUBLIC 'public' 'system'
  264 + * 11 -> 'public'
  265 + * 12 -> 'public'
  266 + * 13 -> public
  267 + * 14 -> null
  268 + * 15 -> null
  269 + * 16 -> 'system'
  270 + * 17 -> 'system'
  271 + * 18 -> 'system'
  272 + * 19 -> system
  273 + * 20 -> null
  274 + * 21 -> null
  275 + * 22 -> [internal]
  276 + * 23 -> internal
  277 + *
  278 + *
  279 + */
  280 +
  281 + /**
  282 + * Looks in any number of matched groups for a value. Returns the first set
  283 + * value. The assumption is that, depending on the pattern matches, the
  284 + * value could be in a few different locations.
  285 + * @param mat The match that has succeeded
  286 + * @param groups The groups to check for a value.
  287 + * @return The first found value.
  288 + */
  289 + private static final String getGroup(final Matcher mat, final int...groups) {
  290 + for (final int g : groups) {
  291 + final String s = mat.group(g);
  292 + if (s != null) {
  293 + return s;
  294 + }
  295 + }
  296 + return null;
  297 + }
  298 +
  299 + /**
  300 + * Returns true if the input character is one of the whitespace characters
  301 + * recognized in the DTD spec.
  302 + * @param ch The char to check
  303 + * @return true if it is a space, tab, newline, or carriage-return.
  304 + */
  305 + private static final boolean isWhite(char ch) {
  306 + return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
  307 + }
  308 +
  309 + /**
  310 + * Reformat an internal subset.... Each declaration starts on an indented
  311 + * newline.
  312 + * @param internal the input DocType declaration as found in a StAX Reader.
  313 + * @return the formatted input.
  314 + */
  315 + private static String formatInternal(String internal) {
  316 + StringBuilder sb = new StringBuilder(internal.length());
  317 + char quote = ' ';
  318 + boolean white = true;
  319 + for (char ch : internal.toCharArray()) {
  320 + if (quote == ' ') {
  321 + // we are not in a quoted value...
  322 + if (isWhite(ch)) {
  323 + if (!white) {
  324 + // this will be the first whitespace.
  325 + // replace it with a single ' '
  326 + sb.append(' ');
  327 + white = true;
  328 + }
  329 + // subsequent (unquoted) whitespace is ignored
  330 + } else {
  331 + if (ch == '\'' || ch == '"') {
  332 + // we are entering a quoted value.
  333 + quote = ch;
  334 + } else if (ch == '<') {
  335 + // we are starting some form of declaration.
  336 + sb.append(" ");
  337 + }
  338 +
  339 + if (ch == '>') {
  340 + // we are ending a declaration.
  341 + if (white) {
  342 + // the declaration ended with whitespace, which we
  343 + // remove.
  344 + sb.setCharAt(sb.length() - 1, ch);
  345 + } else {
  346 + // the declaration had no whitespace at the end. OK
  347 + sb.append(ch);
  348 + }
  349 + // all declarations end with a new-line.
  350 + sb.append('\n');
  351 + // and subsequent lines start as trimmed whitespace.
  352 + white = true;
  353 + } else {
  354 + sb.append(ch);
  355 + white = false;
  356 + }
  357 + }
  358 + } else {
  359 + // we are in a quoted value...
  360 + if (ch == quote) {
  361 + //we are leaving the quoted value.
  362 + quote = ' ';
  363 + }
  364 + sb.append(ch);
  365 + }
  366 + }
  367 + return sb.toString();
  368 + }
  369 +
  370 + /**
  371 + * Parse out a DOCTYPE declaration as supplied by the standard StAX
  372 + * readers.
  373 + * <p>
  374 + * Using 'XML' terminology, this method assumes that the input is
  375 + * both 'well-formed' and 'valid'. The assumptions that this class makes
  376 + * ensure that the 'right thing' is done for valid content, but invalid
  377 + * content may or may not fail with a JDOMException. The behaviour of this
  378 + * method with invalid input is 'undefined'.
  379 + *
  380 + * @param input the input DOCTYPE string to parse. Must be valid.
  381 + * @param factory The JDOM factory to use to build the JDOM DocType.
  382 + * @return The input string as a DocType.
  383 + * @throws JDOMException if the DocType is not generated.
  384 + */
  385 + public static DocType parse(final String input, final JDOMFactory factory)
  386 + throws JDOMException {
  387 +
  388 + // Match the input to the DOCTYPE pattern matcher.
  389 + final Matcher mat = pattern.matcher(input);
  390 + if (!mat.matches()) {
  391 + throw new JDOMException("Doctype input does not appear to be valid: " + input);
  392 + }
  393 +
  394 + // Get the four data components.
  395 + final String docemt = mat.group(1);
  396 + final String sysid = getGroup(mat, 7, 9, 19, 21);
  397 + final String pubid = getGroup(mat, 13, 15);
  398 + final String internal = getGroup(mat, 23);
  399 +
  400 + // Use the appropriate constructor for the DocType.
  401 + DocType dt = null;
  402 + if (pubid != null) {
  403 + dt = factory.docType(docemt, pubid, sysid);
  404 + } else if (sysid != null) {
  405 + dt = factory.docType(docemt, sysid);
  406 + } else {
  407 + dt = factory.docType(docemt);
  408 + }
  409 + // Set the internal subset, if any.
  410 + if (internal != null) {
  411 + dt.setInternalSubset(formatInternal(internal));
  412 + }
  413 + return dt;
  414 + }
  415 +
  416 + /**
  417 + * Make instances 'impossible'. Everything is static.
  418 + */
  419 + private DTDParser() {
  420 + // nothing, you are not allowed instances of this class.
  421 + }
  422 +
  423 +}
439 core/src/java/org/jdom2/input/StAXBuilder.java
... ... @@ -0,0 +1,439 @@
  1 +/*--
  2 +
  3 + Copyright (C) 2000-2011 Jason Hunter & Brett McLaughlin.
  4 + All rights reserved.
  5 +
  6 + Redistribution and use in source and binary forms, with or without
  7 + modification, are permitted provided that the following conditions
  8 + are met:
  9 +
  10 + 1. Redistributions of source code must retain the above copyright
  11 + notice, this list of conditions, and the following disclaimer.
  12 +
  13 + 2. Redistributions in binary form must reproduce the above copyright
  14 + notice, this list of conditions, and the disclaimer that follows
  15 + these conditions in the documentation and/or other materials
  16 + provided with the distribution.
  17 +
  18 + 3. The name "JDOM" must not be used to endorse or promote products
  19 + derived from this software without prior written permission. For
  20 + written permission, please contact <request_AT_jdom_DOT_org>.
  21 +
  22 + 4. Products derived from this software may not be called "JDOM", nor
  23 + may "JDOM" appear in their name, without prior written permission
  24 + from the JDOM Project Management <request_AT_jdom_DOT_org>.
  25 +
  26 + In addition, we request (but do not require) that you include in the
  27 + end-user documentation provided with the redistribution and/or in the
  28 + software itself an acknowledgement equivalent to the following:
  29 + "This product includes software developed by the
  30 + JDOM Project (http://www.jdom.org/)."
  31 + Alternatively, the acknowledgment may be graphical using the logos
  32 + available at http://www.jdom.org/images/logos.
  33 +
  34 + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  35 + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  36 + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  37 + DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
  38 + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  41 + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  42 + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  43 + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  44 + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45 + SUCH DAMAGE.
  46 +
  47 + This software consists of voluntary contributions made by many
  48 + individuals on behalf of the JDOM Project and was originally
  49 + created by Jason Hunter <jhunter_AT_jdom_DOT_org> and
  50 + Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information
  51 + on the JDOM Project, please see <http://www.jdom.org/>.
  52 +
  53 + */
  54 +
  55 +package org.jdom2.input;
  56 +
  57 +import java.util.Iterator;
  58 +
  59 +import javax.xml.namespace.QName;
  60 +import javax.xml.stream.XMLEventReader;
  61 +import javax.xml.stream.XMLStreamConstants;
  62 +import javax.xml.stream.XMLStreamException;
  63 +import javax.xml.stream.XMLStreamReader;
  64 +import javax.xml.stream.events.Characters;
  65 +import javax.xml.stream.events.StartElement;
  66 +import javax.xml.stream.events.XMLEvent;
  67 +
  68 +import org.jdom2.AttributeType;
  69 +import org.jdom2.Comment;
  70 +import org.jdom2.DefaultJDOMFactory;
  71 +import org.jdom2.DocType;
  72 +import org.jdom2.Document;
  73 +import org.jdom2.Element;
  74 +import org.jdom2.JDOMException;
  75 +import org.jdom2.JDOMFactory;
  76 +import org.jdom2.Namespace;
  77 +import org.jdom2.ProcessingInstruction;
  78 +
  79 +/**
  80 + * Builds a JDOM document from a StAX-based XMLStreamReader.
  81 + * <p>
  82 + * XMLStreamReaders are pre-configured and as a result JDOM is not able to
  83 + * alter whether the input is validated, or whether the Stream has escaped
  84 + * entities or not. These (and other) characteristics are configurable by
  85 + * setting the correct features and properties on the XMLInputFactory when it
  86 + * is used to create the XMLStreamReader.
  87 + * <p>
  88 + * Useful configuration to set, or know about is:
  89 + * <ul>
  90 + * <li>StAX streams seldom differentiate between Text and CDATA content. You
  91 + * will likely want to configure your StAX factory (XMLInputFactory) with
  92 + * <code>http://java.sun.com/xml/stream/properties/report-cdata-event</code>
  93 + * for the default Java StAX implementation, or the equivalent property for your
  94 + * StAX engine.
  95 + * <li>The remaining XMLInputFactory settings are likely to work fine at their
  96 + * default values.
  97 + * <li>StAX is not likely to be your best option if you want a validating
  98 + * parser, at least not with the default (built-in Java implementation in Java6
  99 + * which does not support it). Consider a SAX parser.
  100 + * </ul>
  101 + * <p>
  102 + * From a JDOM perspective XMLStreamReaders are more efficient than
  103 + * XMLEventReaders. Where possible use an XMLStreamReader.
  104 + * <p>
  105 + * If you happen to be looking at the source code, pay careful attention to the
  106 + * imports so you know what type of instance is being processed, whether it is
  107 + * a StAX class, or a JDOM class, because there are name conflicts.
  108 + *
  109 + * @author Rolf Lear
  110 + *
  111 + */
  112 +public class StAXBuilder implements XMLStreamConstants {
  113 +
  114 + /**
  115 + * Create a Document from an XMLStreamReader
  116 + * @param factory The {@link JDOMFactory} to use
  117 + * @param stream The XMLStreamReader to read from
  118 + * @return the parsed Document
  119 + * @throws JDOMException if there is any issue
  120 + * (XMLStreamExceptions are wrapped).
  121 + */
  122 + private static final Document process(final JDOMFactory factory,
  123 + final XMLStreamReader stream) throws JDOMException {
  124 + try {
  125 +
  126 + final Document document = factory.document(null);
  127 +
  128 + Element current = null;
  129 +
  130 + int state = stream.getEventType();
  131 +
  132 + if (XMLStreamConstants.START_DOCUMENT != state) {
  133 + throw new JDOMException("JDOM requires that XMLStreamReaders " +
  134 + "are at their beginning when being processed.");
  135 + }
  136 +
  137 +
  138 + while (state != XMLStreamConstants.END_DOCUMENT) {
  139 + switch (state) {
  140 +
  141 + case START_DOCUMENT:
  142 + // for the <?xml version="..." standalone=".."?>
  143 + document.setBaseURI(stream.getLocation().getSystemId());
  144 + document.setProperty("ENCODING_SCHEME",
  145 + stream.getCharacterEncodingScheme());
  146 + document.setProperty("STANDALONE",
  147 + String.valueOf(stream.isStandalone()));
  148 + document.setProperty("ENCODING",
  149 + stream.getEncoding());
  150 + break;
  151 +
  152 + case DTD:
  153 + final DocType dtype = DTDParser.parse(
  154 + stream.getText(), factory);
  155 + document.setDocType(dtype);
  156 + break;
  157 +
  158 + case START_ELEMENT:
  159 + final Element emt = processElement(factory, stream);
  160 + if (current == null) {
  161 + document.setRootElement(emt);
  162 + final DocType dt = document.getDocType();
  163 + if (dt != null) {
  164 + dt.setElementName(emt.getName());
  165 + }
  166 + } else {
  167 + current.addContent(emt);
  168 + }
  169 + current = emt;
  170 + break;
  171 +
  172 + case END_ELEMENT:
  173 + current = current.getParentElement();
  174 + break;
  175 +
  176 + case CDATA:
  177 + if (current != null) {
  178 + current.addContent(factory.cdata(stream.getText()));
  179 + }
  180 + break;
  181 +
  182 + case SPACE:
  183 + case CHARACTERS:
  184 + if (current != null) {
  185 + current.addContent(factory.text(stream.getText()));
  186 + }
  187 + break;
  188 +
  189 + case COMMENT:
  190 + if (current == null) {
  191 + document.addContent(
  192 + factory.comment(stream.getText()));
  193 + } else {
  194 + current.addContent(
  195 + factory.comment(stream.getText()));
  196 + }
  197 + break;
  198 +
  199 + case ENTITY_REFERENCE:
  200 + if (current != null) {
  201 + current.addContent(
  202 + factory.entityRef(stream.getLocalName()));
  203 + }
  204 + break;
  205 +
  206 + case PROCESSING_INSTRUCTION:
  207 + if (current == null) {
  208 + document.addContent(factory.processingInstruction(
  209 + stream.getPITarget(), stream.getPIData()));
  210 + } else {
  211 + current.addContent(factory.processingInstruction(
  212 + stream.getPITarget(), stream.getPIData()));
  213 + }
  214 + break;
  215 +
  216 + default:
  217 + throw new JDOMException("Unexpected XMLStream event " + state);
  218 +
  219 + }
  220 + if (stream.hasNext()) {
  221 + state = stream.next();
  222 + } else {
  223 + throw new JDOMException("Unexpected end-of-XMLStreamReader");
  224 + }
  225 + }
  226 + return document;
  227 + } catch (final XMLStreamException xse) {
  228 + throw new JDOMException("Unable to process XMLStream. See Cause.", xse);
  229 + }
  230 + }
  231 +
  232 + private static final Element processElement(final JDOMFactory factory,
  233 + final XMLStreamReader reader) {
  234 +
  235 + final Element element = factory.element(reader.getLocalName(),
  236 + Namespace.getNamespace(reader.getPrefix(),
  237 + reader.getNamespaceURI()));
  238 +
  239 + // Handle attributes
  240 + for (int i=0, len=reader.getAttributeCount(); i<len; i++) {
  241 + factory.setAttribute(element, factory.attribute(
  242 + reader.getAttributeLocalName(i),
  243 + reader.getAttributeValue(i),
  244 + AttributeType.getAttributeType(reader.getAttributeType(i)),
  245 + Namespace.getNamespace(reader.getAttributePrefix(i),
  246 + reader.getAttributeNamespace(i))));
  247 + }
  248 +
  249 + // Handle Namespaces
  250 + for (int i = 0, len = reader.getNamespaceCount(); i < len; i++) {
  251 + element.addNamespaceDeclaration(Namespace.getNamespace(
  252 + reader.getNamespacePrefix(i), reader.getNamespaceURI(i)));
  253 + }
  254 +
  255 + return element;
  256 + }
  257 +
  258 +
  259 + /**
  260 + * Create a Document from an XMLEventReader
  261 + * @param factory the {@link JDOMFactory} to use
  262 + * @param events the XMLEventReader to read from
  263 + * @return the parsed Document
  264 + * @throws JDOMException if there is any issue
  265 + * (XMLStreamExceptions are wrapped).
  266 + */
  267 + private static final Document process(final JDOMFactory factory,
  268 + final XMLEventReader events) throws JDOMException {
  269 + try {
  270 +
  271 + final Document document = factory.document(null);
  272 + Element current = null;
  273 +
  274 + XMLEvent event = events.peek();
  275 +
  276 + if (XMLStreamConstants.START_DOCUMENT != event.getEventType()) {
  277 + throw new JDOMException("JDOM requires that XMLStreamReaders " +
  278 + "are at their beginning when being processed.");
  279 + }
  280 +
  281 +
  282 +
  283 + while (event.getEventType() != XMLStreamConstants.END_DOCUMENT) {
  284 + if (event.isStartDocument()) {
  285 + document.setBaseURI(event.getLocation().getSystemId());
  286 + document.setProperty("ENCODING_SCHEME",
  287 + ((javax.xml.stream.events.StartDocument)event).getCharacterEncodingScheme());
  288 + document.setProperty("STANDALONE", String.valueOf(
  289 + ((javax.xml.stream.events.StartDocument)event).isStandalone()));
  290 + // document.setProperty("ENCODING",
  291 + // ((StartDocument)event).getEncoding());
  292 + } else if (event instanceof javax.xml.stream.events.DTD) {
  293 + //List<?> list = (List<?>)reader.getProperty("javax.xml.stream.entities");
  294 + //System.out.println(list);
  295 + final DocType dtype = DTDParser.parse(((javax.xml.stream.events.DTD)event).getDocumentTypeDeclaration(), factory);
  296 + document.setDocType(dtype);
  297 + } else if (event.isStartElement()) {
  298 + final Element emt = processElement(factory, event.asStartElement());
  299 + if (current == null) {
  300 + document.setRootElement(emt);
  301 + final DocType dt = document.getDocType();
  302 + if (dt != null) {
  303 + dt.setElementName(emt.getName());
  304 + }
  305 + } else {
  306 + current.addContent(emt);
  307 + }
  308 + current = emt;
  309 + } else if (event.isCharacters()) {
  310 + final Characters chars = event.asCharacters();
  311 + if (chars.isCData()) {
  312 + current.addContent(factory.cdata(
  313 + ((Characters)event).getData()));
  314 + } else {
  315 + current.addContent(factory.text(
  316 + ((Characters)event).getData()));
  317 + }
  318 + } else if (event instanceof javax.xml.stream.events.Comment) {
  319 + final Comment comment = factory.comment(
  320 + ((javax.xml.stream.events.Comment)event).getText());
  321 + if (current == null) {
  322 + document.addContent(comment);
  323 + } else {
  324 + current.addContent(comment);
  325 + }
  326 + } else if (event.isEntityReference()) {
  327 + current.addContent(factory.entityRef(
  328 + ((javax.xml.stream.events.EntityReference)event).getName()));
  329 + } else if (event.isProcessingInstruction()) {
  330 + final ProcessingInstruction pi = factory.processingInstruction(
  331 + ((javax.xml.stream.events.ProcessingInstruction)event).getTarget(),
  332 + ((javax.xml.stream.events.ProcessingInstruction)event).getData());
  333 + if (current == null) {
  334 + document.addContent(pi);
  335 + } else {
  336 + current.addContent(pi);
  337 + }
  338 + } else if (event.isEndElement()) {
  339 + current = current.getParentElement();
  340 + }
  341 + if (events.hasNext()) {
  342 + event = events.nextEvent();
  343 + } else {
  344 + break;
  345 + }
  346 + }
  347 + return document;
  348 + } catch (final XMLStreamException xse) {
  349 + throw new JDOMException("Unable to process XMLStream. See Cause.", xse);
  350 + }
  351 + }
  352 +
  353 + private static final Element processElement(final JDOMFactory factory,
  354 + final StartElement event) {
  355 + final QName qname = event.getName();
  356 +
  357 + final Element element = factory.element(qname.getLocalPart(),
  358 + Namespace.getNamespace(qname.getPrefix(), qname.getNamespaceURI()));
  359 +
  360 + // Handle attributes
  361 + for (final Iterator<?> it = event.getAttributes();
  362 + it.hasNext(); ) {
  363 +
  364 + final javax.xml.stream.events.Attribute att =
  365 + (javax.xml.stream.events.Attribute)it.next();
  366 +
  367 + final QName aqname = att.getName();
  368 +
  369 + final Namespace attNs = Namespace.getNamespace(aqname.getPrefix(),
  370 + aqname.getNamespaceURI());
  371 +
  372 + factory.setAttribute(element, factory.attribute(
  373 + aqname.getLocalPart(), att.getValue(),
  374 + AttributeType.getAttributeType(att.getDTDType()), attNs));
  375 + }
  376 +
  377 + for (final Iterator<?> it = event.getNamespaces(); it.hasNext();) {
  378 + final javax.xml.stream.events.Namespace ns =
  379 + (javax.xml.stream.events.Namespace)it.next();
  380 +
  381 + element.addNamespaceDeclaration(Namespace.getNamespace(
  382 + ns.getPrefix(), ns.getNamespaceURI()));
  383 + }
  384 +
  385 + return element;
  386 + }
  387 +
  388 +
  389 +
  390 + /** The factory to use for parsing */
  391 + private JDOMFactory factory = new DefaultJDOMFactory();
  392 +
  393 + /**
  394 + * Returns the current {@link org.jdom2.JDOMFactory} in use.
  395 + * @return the factory in use
  396 + */
  397 + public JDOMFactory getFactory() {
  398 + return factory;
  399 + }
  400 +
  401 + /**
  402 + * This sets a custom JDOMFactory for the builder. Use this to build
  403 + * the tree with your own subclasses of the JDOM classes.
  404 + *
  405 + * @param factory <code>JDOMFactory</code> to use
  406 + */
  407 + public void setFactory(JDOMFactory factory) {
  408 + this.factory = factory;
  409 + }
  410 +
  411 + /**
  412 + * This builds a document from the supplied
  413 + * XMLStreamReader.
  414 + * <p>
  415 + * The JDOMContent will be built by the current JDOMFactory.
  416 + *
  417 + * @param reader <code>XMLStreamReader</code> to read from
  418 + * @return <code>Document</code> resultant Document object
  419 + * @throws JDOMException when errors occur in parsing
  420 + */
  421 + public Document build(XMLStreamReader reader) throws JDOMException {
  422 + return process(factory, reader);
  423 + }
  424 +
  425 + /**
  426 + * This builds a document from the supplied
  427 + * XMLEventReader.
  428 + * <p>
  429 + * The JDOMContent will be built by the current JDOMFactory.
  430 + *
  431 + * @param events <code>XMLEventReader</code> to read from
  432 + * @return <code>Document</code> resultant Document object
  433 + * @throws JDOMException when errors occur in parsing
  434 + */
  435 + public Document build(XMLEventReader events) throws JDOMException {
  436 + return process(factory, events);
  437 + }
  438 +
  439 +}
10 test/resources/DOMBuilder/complex.xml
... ... @@ -1,6 +1,8 @@
1 1 <?xml version="1.0" encoding="UTF-8"?>
2 2 <!-- root comment -->
  3 +
3 4 <?jdomtest root level ?>
  5 +
4 6 <root att1="val1" att2="val2" >
5 7 text
6 8 <child att="child1" xml:space="preserve"> hello Frodo Baggins! </child>
@@ -10,4 +12,10 @@
10 12 <!-- comment -->
11 13 <child att="child4" unresolved="&amp;"/>
12 14 <child att="child5" > <![CDATA[some cdata text ]]> </child>
13   -</root>
  15 + <child att="child6" >
  16 + <leaf att="Leaf6" />
  17 + </child>
  18 +</root>
  19 +
  20 +
  21 +
336 test/src/java/org/jdom2/test/cases/input/TestDTDParser.java
... ... @@ -0,0 +1,336 @@
  1 +package org.jdom2.test.cases.input;
  2 +
  3 +import static org.junit.Assert.*;
  4 +
  5 +import org.jdom2.DocType;
  6 +import org.jdom2.JDOMException;
  7 +import org.jdom2.JDOMFactory;
  8 +import org.jdom2.DefaultJDOMFactory;
  9 +import org.jdom2.input.DTDParser;
  10 +import org.jdom2.test.util.UnitTestUtil;
  11 +import org.junit.Test;
  12 +
  13 +@SuppressWarnings("javadoc")
  14 +public class TestDTDParser {
  15 +
  16 + private static final JDOMFactory factory = new DefaultJDOMFactory();
  17 +
  18 + @Test
  19 + public void testParseSimple() throws JDOMException {
  20 + DocType dt = DTDParser.parse(
  21 + "<!DOCTYPE root >",
  22 + factory);
  23 +
  24 + assertEquals("root", dt.getElementName());
  25 + assertEquals(null, dt.getPublicID());
  26 + assertEquals(null, dt.getSystemID());
  27 + assertEquals(null, dt.getInternalSubset());
  28 + }
  29 +
  30 + @Test
  31 + public void testParseSimpleCompact() throws JDOMException {
  32 + DocType dt = DTDParser.parse(
  33 + "<!DOCTYPE root>",
  34 + factory);
  35 +
  36 + assertEquals("root", dt.getElementName());
  37 + assertEquals(null, dt.getPublicID());
  38 + assertEquals(null, dt.getSystemID());
  39 + assertEquals(null, dt.getInternalSubset());
  40 + }
  41 +
  42 + @Test
  43 + public void testParseSimpleCompactInternal() throws JDOMException {
  44 + DocType dt = DTDParser.parse(
  45 + "<!DOCTYPE root[internal]>",
  46 + factory);
  47 +
  48 + assertEquals("root", dt.getElementName());
  49 + assertEquals(null, dt.getPublicID());
  50 + assertEquals(null, dt.getSystemID());
  51 + assertEquals("internal", dt.getInternalSubset());
  52 + }
  53 +
  54 + @Test
  55 + public void testParseSYSTEMquotNONE() throws JDOMException {
  56 + DocType dt = DTDParser.parse(
  57 + "<!DOCTYPE root SYSTEM \"system\" >",
  58 + factory);
  59 +
  60 + assertEquals("root", dt.getElementName());
  61 + assertEquals(null, dt.getPublicID());
  62 + assertEquals("system", dt.getSystemID());
  63 + assertEquals(null, dt.getInternalSubset());
  64 + }
  65 +
  66 + @Test
  67 + public void testParseSYSTEMaposNONE() throws JDOMException {
  68 + DocType dt = DTDParser.parse(
  69 + "<!DOCTYPE root SYSTEM 'system' >",
  70 + factory);
  71 +
  72 + assertEquals("root", dt.getElementName());
  73 + assertEquals(null, dt.getPublicID());
  74 + assertEquals("system", dt.getSystemID());
  75 + assertEquals(null, dt.getInternalSubset());
  76 + }
  77 +
  78 + @Test
  79 + public void testParseSYSTEMquotSimple() throws JDOMException {
  80 + DocType dt = DTDParser.parse(
  81 + "<!DOCTYPE root SYSTEM \"system\" [internal] >",
  82 + factory);
  83 +
  84 + assertEquals("root", dt.getElementName());
  85 + assertEquals(null, dt.getPublicID());
  86 + assertEquals("system", dt.getSystemID());
  87 + assertEquals("internal", dt.getInternalSubset());
  88 + }
  89 +
  90 + @Test
  91 + public void testParseSYSTEMaposSimple() throws JDOMException {
  92 + DocType dt = DTDParser.parse(
  93 + "<!DOCTYPE root SYSTEM 'system' [internal] >",
  94 + factory);
  95 +
  96 + assertEquals("root", dt.getElementName());
  97 + assertEquals(null, dt.getPublicID());
  98 + assertEquals("system", dt.getSystemID());
  99 + assertEquals("internal", dt.getInternalSubset());
  100 + }
  101 +
  102 + @Test
  103 + public void testParsePUBLICquotenullNONE() throws JDOMException {
  104 + DocType dt = DTDParser.parse(
  105 + "<!DOCTYPE root PUBLIC \"public\" >",
  106 + factory);
  107 +
  108 + assertEquals("root", dt.getElementName());
  109 + assertEquals("public", dt.getPublicID());
  110 + assertEquals(null, dt.getSystemID());
  111 + assertEquals(null, dt.getInternalSubset());
  112 + }
  113 +
  114 + @Test
  115 + public void testParsePUBLICaposnullNONE() throws JDOMException {
  116 + DocType dt = DTDParser.parse(
  117 + "<!DOCTYPE root PUBLIC 'public' >",
  118 + factory);
  119 +
  120 + assertEquals("root", dt.getElementName());
  121 + assertEquals("public", dt.getPublicID());
  122 + assertEquals(null, dt.getSystemID());
  123 + assertEquals(null, dt.getInternalSubset());
  124 + }
  125 +
  126 + @Test
  127 + public void testParsePUBLICquotquotNONE() throws JDOMException {
  128 + DocType dt = DTDParser.parse(
  129 + "<!DOCTYPE root PUBLIC \"public\" \"system\" >",
  130 + factory);
  131 +
  132 + assertEquals("root", dt.getElementName());
  133 + assertEquals("public", dt.getPublicID());
  134 + assertEquals("system", dt.getSystemID());
  135 + assertEquals(null, dt.getInternalSubset());
  136 + }
  137 +
  138 + @Test
  139 + public void testParsePUBLICquotaposNONE() throws JDOMException {
  140 + DocType dt = DTDParser.parse(
  141 + "<!DOCTYPE root PUBLIC \"public\" 'system' >",
  142 + factory);
  143 +
  144 + assertEquals("root", dt.getElementName());
  145 + assertEquals("public", dt.getPublicID());
  146 + assertEquals("system", dt.getSystemID());
  147 + assertEquals(null, dt.getInternalSubset());
  148 + }
  149 +
  150 + @Test
  151 + public void testParsePUBLICaposquotNONE() throws JDOMException {
  152 + DocType dt = DTDParser.parse(
  153 + "<!DOCTYPE root PUBLIC 'public' \"system\" >",
  154 + factory);
  155 +
  156 + assertEquals("root", dt.getElementName());
  157 + assertEquals("public", dt.getPublicID());
  158 + assertEquals("system", dt.getSystemID());
  159 + assertEquals(null, dt.getInternalSubset());
  160 + }
  161 +
  162 + @Test
  163 + public void testParsePUBLICaposaposNONE() throws JDOMException {
  164 + DocType dt = DTDParser.parse(
  165 + "<!DOCTYPE root PUBLIC 'public' 'system' >",
  166 + factory);
  167 +
  168 + assertEquals("root", dt.getElementName());
  169 + assertEquals("public", dt.getPublicID());
  170 + assertEquals("system", dt.getSystemID());
  171 + assertEquals(null, dt.getInternalSubset());
  172 + }
  173 +
  174 + @Test
  175 + public void testParsePUBLICaposaposSimple() throws JDOMException {
  176 + DocType dt = DTDParser.parse(
  177 + "<!DOCTYPE root PUBLIC 'public' 'system' [internal] >",
  178 + factory);
  179 +
  180 + assertEquals("root", dt.getElementName());
  181 + assertEquals("public", dt.getPublicID());
  182 + assertEquals("system", dt.getSystemID());
  183 + assertEquals("internal", dt.getInternalSubset());
  184 + }
  185 +
  186 + @Test
  187 + public void testParsePUBLICaposaposSimpleCompact() throws JDOMException {
  188 + DocType dt = DTDParser.parse(
  189 + "<!DOCTYPE root PUBLIC 'public' 'system'[internal]>",
  190 + factory);
  191 +
  192 + assertEquals("root", dt.getElementName());
  193 + assertEquals("public", dt.getPublicID());
  194 + assertEquals("system", dt.getSystemID());
  195 + assertEquals("internal", dt.getInternalSubset());
  196 + }
  197 +
  198 + @Test
  199 + public void testParsePUBLICaposaposSimpleSpacy() throws JDOMException {
  200 + DocType dt = DTDParser.parse(
  201 + " <!DOCTYPE root PUBLIC ' public ' ' system ' [ <!ENTITY " +
  202 + " ent\n EntityDef > ] > ",
  203 + factory);
  204 +
  205 + assertEquals("root", dt.getElementName());
  206 + assertEquals(" public ", dt.getPublicID());
  207 + assertEquals(" system ", dt.getSystemID());
  208 + assertEquals(" <!ENTITY ent EntityDef>\n", dt.getInternalSubset());
  209 + }
  210 +
  211 + @Test
  212 + public void testParseInternalA() throws JDOMException {
  213 + DocType dt = DTDParser.parse(
  214 + "<!DOCTYPE root [<!ELEMENT root (#PCDATA)><!ENTITY xpd 'Expand Me!' >]>",
  215 + factory);
  216 +
  217 + assertEquals("root", dt.getElementName());
  218 + assertEquals(null, dt.getPublicID());
  219 + assertEquals(null, dt.getSystemID());
  220 + assertEquals(" <!ELEMENT root (#PCDATA)>\n <!ENTITY xpd 'Expand Me!'>\n", dt.getInternalSubset());
  221 + }
  222 +
  223 + @Test