From 85fe7e3e3019fa0619bec293ee3ae7ea00c9458c Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 15 Nov 2023 11:29:51 +1100 Subject: [PATCH] Added NodeIterator and Stream support Improvement: added the NodeIterator class, to efficiently traverse a node tree using the Iterator interface. And added Stream Element#stream() and Node#nodeStream() methods, to enable fluent composable stream pipelines of node traversals. NodeIterator only hits nodes once (vs head and tail for NodeTraversor), is restartable, supports modifications of the node it just emitted (e.g. replace, remove), supports type filtering, and emits in document order. Refactored most head-only uses of NodeTraversor to use the NodeIterator or a Stream backed by it. --- CHANGES | 4 + pom.xml | 20 ++ src/main/java/org/jsoup/nodes/Element.java | 43 ++- src/main/java/org/jsoup/nodes/Node.java | 27 +- .../java/org/jsoup/nodes/NodeIterator.java | 123 ++++++++ src/main/java/org/jsoup/nodes/NodeUtils.java | 19 ++ src/main/java/org/jsoup/select/Collector.java | 49 +--- src/main/java/org/jsoup/select/Evaluator.java | 22 ++ .../java/org/jsoup/select/NodeTraversor.java | 4 - .../org/jsoup/select/StructuralEvaluator.java | 20 +- .../java/org/jsoup/nodes/ElementTest.java | 12 + .../org/jsoup/nodes/NodeIteratorTest.java | 266 ++++++++++++++++++ .../java/org/jsoup/nodes/NodeStreamTest.java | 70 +++++ .../java/org/jsoup/nodes/PositionTest.java | 9 +- .../java/org/jsoup/select/TraversorTest.java | 14 +- 15 files changed, 626 insertions(+), 76 deletions(-) create mode 100644 src/main/java/org/jsoup/nodes/NodeIterator.java create mode 100644 src/test/java/org/jsoup/nodes/NodeIteratorTest.java create mode 100644 src/test/java/org/jsoup/nodes/NodeStreamTest.java diff --git a/CHANGES b/CHANGES index 1ac7cb333b..6e73e340ce 100644 --- a/CHANGES +++ b/CHANGES @@ -10,6 +10,10 @@ Release 1.17.1 [PENDING] `#replaceAll(operator)`. These methods update the original DOM, as well as the Elements list. 
+ * Improvement: added the NodeIterator class, to efficiently traverse a node tree using the Iterator interface. And + added Stream Element#stream() and Node#nodeStream() methods, to enable fluent composable stream pipelines of node + traversals. + * Improvement: when changing the OutputSettings syntax to XML, the xhtml EscapeMode is automatically set by default. * Improvement: added the `:is(selector list)` pseudo-selector, which finds elements that match any of the selectors in diff --git a/pom.xml b/pom.xml index f6581e5ce0..1c9b53b202 100644 --- a/pom.xml +++ b/pom.xml @@ -94,6 +94,14 @@ java.io.UncheckedIOException java.util.function.Predicate java.util.function.UnaryOperator + java.util.stream.Stream + java.util.stream.StreamSupport + java.util.Spliterator + java.util.Spliterators + java.util.Optional + java.util.stream.Collector + java.util.stream.Collectors + java.net.HttpURLConnection + + CLASS_GENERIC_TEMPLATE_CHANGED + true + true + + diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 1b49168693..f009e8a588 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -31,6 +31,7 @@ import java.util.function.Consumer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import java.util.stream.Stream; import static org.jsoup.internal.Normalizer.normalize; import static org.jsoup.nodes.TextNode.lastCharIsWhitespace; @@ -378,6 +379,15 @@ void nodelistChanged() { shadowChildrenRef = null; } + /** + Returns a Stream of this Element and all of its descendant Elements. The stream has document order. + @return a stream of this element and its descendants. + @see #nodeStream() + */ + public Stream stream() { + return NodeUtils.stream(this, Element.class); + } + /** * Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated. *

@@ -454,7 +464,6 @@ public Elements select(Evaluator evaluator) { return Selector.select(evaluator, this); } - /** * Find the first Element that matches the {@link Selector} CSS query, with this element as the starting context. *

This is effectively the same as calling {@code element.select(query).first()}, but is more efficient as query @@ -697,7 +706,7 @@ public Element insertChildren(int index, Node... children) { } /** - * Create a new element by tag name, and add it as the last child. + * Create a new element by tag name, and add it as this Element's last child. * * @param tagName the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: @@ -707,6 +716,13 @@ public Element appendElement(String tagName) { return appendElement(tagName, tag.namespace()); } + /** + * Create a new element by tag name and namespace, add it as this Element's last child. + * + * @param tagName the name of the tag (e.g. {@code div}). + * @param namespace the namespace of the tag (e.g. {@link Parser#NamespaceHtml}) + * @return the new element, in the specified namespace + */ public Element appendElement(String tagName, String namespace) { Element child = new Element(Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()), baseUri()); appendChild(child); @@ -714,7 +730,7 @@ public Element appendElement(String tagName, String namespace) { } /** - * Create a new element by tag name, and add it as the first child. + * Create a new element by tag name, and add it as this Element's first child. * * @param tagName the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: @@ -724,6 +740,13 @@ public Element prependElement(String tagName) { return prependElement(tagName, tag.namespace()); } + /** + * Create a new element by tag name and namespace, and add it as this Element's first child. + * + * @param tagName the name of the tag (e.g. {@code div}). + * @param namespace the namespace of the tag (e.g. 
{@link Parser#NamespaceHtml}) + * @return the new element, in the specified namespace + */ public Element prependElement(String tagName, String namespace) { Element child = new Element(Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()), baseUri()); prependChild(child); @@ -1389,7 +1412,7 @@ public void tail(Node node, int depth) { */ public String wholeText() { final StringBuilder accum = StringUtil.borrowBuilder(); - NodeTraversor.traverse((node, depth) -> appendWholeText(node, accum), this); + nodeStream().forEach(node -> appendWholeText(node, accum)); return StringUtil.releaseBuilder(accum); } @@ -1402,7 +1425,7 @@ private static void appendWholeText(Node node, StringBuilder accum) { } /** - Get the non-normalized, decoded text of this element, not including any child elements, including only any + Get the non-normalized, decoded text of this element, not including any child elements, including any newlines and spaces present in the original source. @return decoded, non-normalized text that is a direct child of this Element @see #text() @@ -1849,17 +1872,15 @@ public Element forEachNode(Consumer action) { @param action the function to perform on the element @return this Element, for chaining @see Node#forEachNode(Consumer) + @deprecated use {@link #stream()}.{@link Stream#forEach(Consumer) forEach(Consumer)} instead. (Removing this method + so Element can implement Iterable, which this signature conflicts with due to the non-void return.) 
*/ + @Deprecated public Element forEach(Consumer action) { - Validate.notNull(action); - NodeTraversor.traverse((node, depth) -> { - if (node instanceof Element) - action.accept((Element) node); - }, this); + stream().forEach(action); return this; } - @Override public Element filter(NodeFilter nodeFilter) { return (Element) super.filter(nodeFilter); diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java index b0f08409b9..c31817da74 100644 --- a/src/main/java/org/jsoup/nodes/Node.java +++ b/src/main/java/org/jsoup/nodes/Node.java @@ -16,6 +16,7 @@ import java.util.LinkedList; import java.util.List; import java.util.function.Consumer; +import java.util.stream.Stream; /** The base, abstract Node model. Elements, Documents, Comments etc are all Node instances. @@ -225,7 +226,8 @@ public String absUrl(String attributeKey) { /** Get a child node by its 0-based index. @param index index of child node - @return the child node at this index. Throws a {@code IndexOutOfBoundsException} if the index is out of bounds. + @return the child node at this index. + @throws IndexOutOfBoundsException if the index is out of bounds. */ public Node childNode(int index) { return ensureChildNodes().get(index); @@ -682,12 +684,12 @@ public Node traverse(NodeVisitor nodeVisitor) { */ public Node forEachNode(Consumer action) { Validate.notNull(action); - NodeTraversor.traverse((node, depth) -> action.accept(node), this); + nodeStream().forEach(action); return this; } /** - * Perform a depth-first filtering through this node and its descendants. + * Perform a depth-first filtered traversal through this node and its descendants. * @param nodeFilter the filter callbacks to perform on each node * @return this node, for chaining */ @@ -697,6 +699,25 @@ public Node filter(NodeFilter nodeFilter) { return this; } + /** + Returns a Stream of this Node and all of its descendant Nodes. The stream has document order. + @return a stream of all nodes. 
+ @see Element#stream() + */ + public Stream nodeStream() { + return NodeUtils.stream(this, Node.class); + } + + /** + Returns a Stream of this and descendant nodes, containing only nodes of the specified type. The stream has document + order. + @return a stream of nodes filtered by type. + @see Element#stream() + */ + public Stream nodeStream(Class type) { + return NodeUtils.stream(this, type); + } + /** Get the outer HTML of this node. For example, on a {@code p} element, may return {@code

Para

}. @return outer HTML diff --git a/src/main/java/org/jsoup/nodes/NodeIterator.java b/src/main/java/org/jsoup/nodes/NodeIterator.java new file mode 100644 index 0000000000..ffe6754be3 --- /dev/null +++ b/src/main/java/org/jsoup/nodes/NodeIterator.java @@ -0,0 +1,123 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.Validate; +import org.jspecify.annotations.Nullable; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + Iterates through a Node and its tree of descendants, in document order, and returns nodes of the specified type. This + iterator supports structural changes to the tree during the traversal, such as {@link Node#remove()}, + {@link Node#replaceWith(Node)}, {@link Node#wrap(String)}, etc. +

See also the {@link org.jsoup.select.NodeTraversor NodeTraversor} if {@code head} and {@code tail} callbacks are + desired for each node.

+ */ +public class NodeIterator implements Iterator { + private Node root; // root / starting node + private @Nullable T next; // the next node to return + private Node current; // the current (last emitted) node + private Node previous; // the previously emitted node; used to recover from structural changes + private @Nullable Node currentParent; // the current node's parent; used to detect structural changes + private final Class type; // the desired node class type + + /** + Create a NodeIterator that will iterate the supplied node, and all of its descendants. The returned {@link #next} + type will be filtered to the input type. + * @param start initial node + * @param type node type to filter for + */ + public NodeIterator(Node start, Class type) { + Validate.notNull(start); + Validate.notNull(type); + this.type = type; + + restart(start); + } + + /** + Create a NodeIterator that will iterate the supplied node, and all of its descendants. All node types will be + returned. + * @param start initial node + */ + public static NodeIterator from(Node start) { + return new NodeIterator<>(start, Node.class); + } + + /** + Restart this Iterator from the specified start node. Will act as if it were newly constructed. Useful e.g. to + save some GC if the iterator is used in a tight loop. + * @param start the new start node. + */ + public void restart(Node start) { + if (type.isInstance(start)) + //noinspection unchecked + next = (T) start; // first next() will be the start node + + root = previous = current = start; + currentParent = current.parent(); + } + + @Override public boolean hasNext() { + maybeFindNext(); + return next != null; + } + + @Override public T next() { + maybeFindNext(); + if (next == null) throw new NoSuchElementException(); + + T result = next; + previous = current; + current = next; + currentParent = current.parent(); + next = null; + return result; + } + + /** + If next is null, looks for and sets next. 
If next is null after this, we have reached the end. + */ + private void maybeFindNext() { + if (next != null) return; + + // change detected (removed or replaced), redo from previous + if (currentParent != null && !current.hasParent()) + current = previous; + + next = findNextNode(); + } + + private @Nullable T findNextNode() { + Node node = current; + while (true) { + if (node.childNodeSize() > 0) + node = node.childNode(0); // descend children + else if (root.equals(node)) + node = null; // complete when all children of root are fully visited + else if (node.nextSibling() != null) + node = node.nextSibling(); // in a descendant with no more children; traverse + else { + while (true) { + node = node.parent(); // pop out of descendants + if (node == null || root.equals(node)) + return null; // got back to root; complete + if (node.nextSibling() != null) { + node = node.nextSibling(); // traverse + break; + } + } + } + if (node == null) + return null; // reached the end + + if (type.isInstance(node)) + //noinspection unchecked + return (T) node; + } + } + + @Override public void remove() { + current.remove(); + } +} diff --git a/src/main/java/org/jsoup/nodes/NodeUtils.java b/src/main/java/org/jsoup/nodes/NodeUtils.java index 4bd7e019ac..6d3bb814ab 100644 --- a/src/main/java/org/jsoup/nodes/NodeUtils.java +++ b/src/main/java/org/jsoup/nodes/NodeUtils.java @@ -6,7 +6,12 @@ import org.jsoup.parser.Parser; import org.w3c.dom.NodeList; +import java.util.Iterator; import java.util.List; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; /** * Internal helpers for Nodes, to keep the actual node APIs relatively clean. 
A jsoup internal class, so don't use it as @@ -47,4 +52,18 @@ static List selectXpath(String xpath, Element el, Class n NodeList nodeList = w3c.selectXpath(xpath, contextNode); return w3c.sourceNodes(nodeList, nodeType); } + + /** Creates a Stream, starting with the supplied node. */ + static Stream stream(Node start, Class type) { + NodeIterator iterator = new NodeIterator<>(start, type); + Spliterator spliterator = spliterator(iterator); + + return StreamSupport.stream(spliterator, false); + } + + static Spliterator spliterator(Iterator iterator) { + return Spliterators.spliteratorUnknownSize( + iterator, + Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED); + } } diff --git a/src/main/java/org/jsoup/select/Collector.java b/src/main/java/org/jsoup/select/Collector.java index 948e3371b3..ce095553c8 100644 --- a/src/main/java/org/jsoup/select/Collector.java +++ b/src/main/java/org/jsoup/select/Collector.java @@ -4,6 +4,10 @@ import org.jsoup.nodes.Node; import org.jspecify.annotations.Nullable; +import java.util.Optional; +import java.util.function.Predicate; +import java.util.stream.Collectors; + import static org.jsoup.select.NodeFilter.FilterResult.CONTINUE; import static org.jsoup.select.NodeFilter.FilterResult.STOP; @@ -24,15 +28,10 @@ private Collector() {} */ public static Elements collect (Evaluator eval, Element root) { eval.reset(); - Elements elements = new Elements(); - NodeTraversor.traverse((node, depth) -> { - if (node instanceof Element) { - Element el = (Element) node; - if (eval.matches(root, el)) - elements.add(el); - } - }, root); - return elements; + + return root.stream() + .filter(eval.asPredicate(root)) + .collect(Collectors.toCollection(Elements::new)); } /** @@ -44,36 +43,8 @@ public static Elements collect (Evaluator eval, Element root) { */ public static @Nullable Element findFirst(Evaluator eval, Element root) { eval.reset(); - FirstFinder finder = new FirstFinder(eval); - return finder.find(root, root); - } - - static 
class FirstFinder implements NodeFilter { - private @Nullable Element evalRoot = null; - private @Nullable Element match = null; - private final Evaluator eval; - - FirstFinder(Evaluator eval) { - this.eval = eval; - } - - @Nullable Element find(Element root, Element start) { - evalRoot = root; - match = null; - NodeTraversor.filter(this, start); - return match; - } - @Override - public FilterResult head(Node node, int depth) { - if (node instanceof Element) { - Element el = (Element) node; - if (eval.matches(evalRoot, el)) { - match = el; - return STOP; - } - } - return CONTINUE; - } + Optional first = root.stream().filter(eval.asPredicate(root)).findFirst(); + return first.orElse(null); } } diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java index 65040bcf64..6f95b7e4fd 100644 --- a/src/main/java/org/jsoup/select/Evaluator.java +++ b/src/main/java/org/jsoup/select/Evaluator.java @@ -12,6 +12,7 @@ import org.jsoup.parser.ParseSettings; import java.util.List; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -27,6 +28,27 @@ public abstract class Evaluator { protected Evaluator() { } + /** + Provides a Predicate for this Evaluator, matching the test Element + * @param root the root Element, for match evaluation + * @return a predicate that accepts an Element to test for matches with this Evaluator + */ + public Predicate asPredicate(Element root) { + return new MatchPredicate(root); + } + + class MatchPredicate implements Predicate { + final Element root; + + public MatchPredicate(Element root) { + this.root = root; + } + + @Override public boolean test(Element element) { + return matches(root, element); + } + } + /** * Test if the element meets the evaluator's requirements. 
* diff --git a/src/main/java/org/jsoup/select/NodeTraversor.java b/src/main/java/org/jsoup/select/NodeTraversor.java index 4e549e87de..de8be092e9 100644 --- a/src/main/java/org/jsoup/select/NodeTraversor.java +++ b/src/main/java/org/jsoup/select/NodeTraversor.java @@ -4,10 +4,6 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeFilter.FilterResult; -import org.jspecify.annotations.Nullable; - -import java.util.Iterator; -import java.util.NoSuchElementException; /** A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document diff --git a/src/main/java/org/jsoup/select/StructuralEvaluator.java b/src/main/java/org/jsoup/select/StructuralEvaluator.java index c0087c4e05..f19cd781b1 100644 --- a/src/main/java/org/jsoup/select/StructuralEvaluator.java +++ b/src/main/java/org/jsoup/select/StructuralEvaluator.java @@ -2,7 +2,7 @@ import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; +import org.jsoup.nodes.NodeIterator; import java.util.ArrayList; import java.util.IdentityHashMap; @@ -59,23 +59,21 @@ public boolean matches(Element root, Element element) { } static class Has extends StructuralEvaluator { - final Collector.FirstFinder finder; + final NodeIterator it = new NodeIterator<>(new Element("html"), Element.class); + // the element here is just a placeholder so this can be final - gets set in restart() public Has(Evaluator evaluator) { super(evaluator); - finder = new Collector.FirstFinder(evaluator); } - @Override public boolean matches(Element root, Element element) { // for :has, we only want to match children (or below), not the input element. 
And we want to minimize GCs - for (int i = 0; i < element.childNodeSize(); i++) { - Node node = element.childNode(i); - if (node instanceof Element) { - Element match = finder.find(element, (Element) node); - if (match != null) - return true; - } + it.restart(element); + while (it.hasNext()) { + Element el = it.next(); + if (el == element) continue; // don't match self, only descendants + if (evaluator.matches(element, el)) + return true; } return false; } diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index 923d13dd2d..12a51a653f 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -208,6 +208,18 @@ public void testWholeText() { assertEquals("Hello \n there", doc.wholeText()); } + @Test void wholeTextRuns() { + Document doc = Jsoup.parse("

.

"); + + Element p1 = doc.expectFirst("#1"); + Element p2 = doc.expectFirst("#2"); + Element p3 = doc.expectFirst("#3"); + + assertEquals("", p1.wholeText()); + assertEquals(" ", p2.wholeText()); + assertEquals(". ", p3.wholeText()); + } + @Test public void testGetSiblings() { Document doc = Jsoup.parse("

Hello

there

this

is

an

element

"); diff --git a/src/test/java/org/jsoup/nodes/NodeIteratorTest.java b/src/test/java/org/jsoup/nodes/NodeIteratorTest.java new file mode 100644 index 0000000000..ab7e9345e5 --- /dev/null +++ b/src/test/java/org/jsoup/nodes/NodeIteratorTest.java @@ -0,0 +1,266 @@ +package org.jsoup.nodes; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import java.util.NoSuchElementException; + +import static org.junit.jupiter.api.Assertions.*; + +class NodeIteratorTest { + String html = "

One

Two

Three

Four

"; + + @Test void canIterateNodes() { + Document doc = Jsoup.parse(html); + NodeIterator it = NodeIterator.from(doc); + assertIterates(it, "#root;html;head;body;div#1;p;One;p;Two;div#2;p;Three;p;Four;"); + // todo - need to review that the Document object #root holds the html element as child. Why not have document root == html element? + assertFalse(it.hasNext()); + + boolean threw = false; + try { + it.next(); + } catch (NoSuchElementException e) { + threw = true; + } + assertTrue(threw); + } + + @Test void hasNextIsPure() { + Document doc = Jsoup.parse(html); + NodeIterator it = NodeIterator.from(doc); + assertTrue(it.hasNext()); + assertTrue(it.hasNext()); + assertIterates(it, "#root;html;head;body;div#1;p;One;p;Two;div#2;p;Three;p;Four;"); + assertFalse(it.hasNext()); + } + + @Test void iterateSubTree() { + Document doc = Jsoup.parse(html); + + Element div1 = doc.expectFirst("div#1"); + NodeIterator it = NodeIterator.from(div1); + assertIterates(it, "div#1;p;One;p;Two;"); + assertFalse(it.hasNext()); + + Element div2 = doc.expectFirst("div#2"); + NodeIterator it2 = NodeIterator.from(div2); + assertIterates(it2, "div#2;p;Three;p;Four;"); + assertFalse(it2.hasNext()); + } + + @Test void canRestart() { + Document doc = Jsoup.parse(html); + + NodeIterator it = NodeIterator.from(doc); + assertIterates(it, "#root;html;head;body;div#1;p;One;p;Two;div#2;p;Three;p;Four;"); + + it.restart(doc.expectFirst("div#2")); + assertIterates(it, "div#2;p;Three;p;Four;"); + } + + @Test void canIterateJustOneSibling() { + Document doc = Jsoup.parse(html); + Element p2 = doc.expectFirst("p:contains(Two)"); + assertEquals("Two", p2.text()); + + NodeIterator it = NodeIterator.from(p2); + assertIterates(it, "p;Two;"); + + NodeIterator elIt = new NodeIterator<>(p2, Element.class); + Element found = elIt.next(); + assertSame(p2, found); + assertFalse(elIt.hasNext()); + } + + @Test void canIterateFirstEmptySibling() { + Document doc = Jsoup.parse("

.

..

"); + Element p1 = doc.expectFirst("p#1"); + assertEquals("", p1.ownText()); + + NodeIterator it = NodeIterator.from(p1); + assertTrue(it.hasNext()); + Node node = it.next(); + assertSame(p1, node); + assertFalse(it.hasNext()); + } + + @Test void canRemoveViaIterator() { + String html = "

One

Two

Three

Four

Out2"; + Document doc = Jsoup.parse(html); + + NodeIterator it = NodeIterator.from(doc); + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + if (node.attr("id").equals("1")) + it.remove(); + trackSeen(node, seen); + } + assertEquals("#root;html;head;body;div#out1;div#1;div#2;p;Three;p;Four;div#out2;Out2;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#out1;div#2;p;Three;p;Four;div#out2;Out2;"); + + it = NodeIterator.from(doc); + seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + if (node.attr("id").equals("2")) + it.remove(); + trackSeen(node, seen); + } + assertEquals("#root;html;head;body;div#out1;div#2;div#out2;Out2;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#out1;div#out2;Out2;"); + } + + @Test void canRemoveViaNode() { + String html = "

One

Two

Three

Four

Out2"; + Document doc = Jsoup.parse(html); + + NodeIterator it = NodeIterator.from(doc); + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + if (node.attr("id").equals("1")) + node.remove(); + trackSeen(node, seen); + } + assertEquals("#root;html;head;body;div#out1;div#1;div#2;p;Three;p;Four;div#out2;Out2;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#out1;div#2;p;Three;p;Four;div#out2;Out2;"); + + it = NodeIterator.from(doc); + seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + if (node.attr("id").equals("2")) + node.remove(); + trackSeen(node, seen); + } + assertEquals("#root;html;head;body;div#out1;div#2;div#out2;Out2;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#out1;div#out2;Out2;"); + } + + @Test void canReplace() { + String html = "

One

Two

Three

Four

Out2"; + Document doc = Jsoup.parse(html); + + NodeIterator it = NodeIterator.from(doc); + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + trackSeen(node, seen); + if (node.attr("id").equals("1")) { + node.replaceWith(new Element("span").text("Foo")); + } + } + assertEquals("#root;html;head;body;div#out1;div#1;span;Foo;div#2;p;Three;p;Four;div#out2;Out2;", seen.toString()); + // ^^ we don't see

One, do see the replaced in , and the subsequent nodes + assertContents(doc, "#root;html;head;body;div#out1;span;Foo;div#2;p;Three;p;Four;div#out2;Out2;"); + + it = NodeIterator.from(doc); + seen = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + trackSeen(node, seen); + if (node.attr("id").equals("2")) { + node.replaceWith(new Element("span").text("Bar")); + } + } + assertEquals("#root;html;head;body;div#out1;span;Foo;div#2;span;Bar;div#out2;Out2;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#out1;span;Foo;span;Bar;div#out2;Out2;"); + } + + @Test void canWrap() { + Document doc = Jsoup.parse(html); + NodeIterator it = NodeIterator.from(doc); + boolean sawInner = false; + while (it.hasNext()) { + Node node = it.next(); + if (node.attr("id").equals("1")) { + node.wrap("

"); + } + if (node instanceof TextNode && ((TextNode) node).text().equals("One")) + sawInner = true; + } + assertContents(doc, "#root;html;head;body;div#outer;div#1;p;One;p;Two;div#2;p;Three;p;Four;"); + assertTrue(sawInner); + } + + @Test void canFilterForElements() { + Document doc = Jsoup.parse(html); + NodeIterator it = new NodeIterator<>(doc, Element.class); + + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + Element el = it.next(); + assertNotNull(el); + trackSeen(el, seen); + } + + assertEquals("#root;html;head;body;div#1;p;p;div#2;p;p;", seen.toString()); + } + + @Test void canFilterForTextNodes() { + Document doc = Jsoup.parse(html); + NodeIterator it = new NodeIterator<>(doc, TextNode.class); + + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + TextNode text = it.next(); + assertNotNull(text); + trackSeen(text, seen); + } + + assertEquals("One;Two;Three;Four;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#1;p;One;p;Two;div#2;p;Three;p;Four;"); + } + + @Test void canModifyFilteredElements() { + Document doc = Jsoup.parse(html); + NodeIterator it = new NodeIterator<>(doc, Element.class); + + StringBuilder seen = new StringBuilder(); + while (it.hasNext()) { + Element el = it.next(); + if (!el.ownText().isEmpty()) + el.text(el.ownText() + "++"); + trackSeen(el, seen); + } + + assertEquals("#root;html;head;body;div#1;p;p;div#2;p;p;", seen.toString()); + assertContents(doc, "#root;html;head;body;div#1;p;One++;p;Two++;div#2;p;Three++;p;Four++;"); + } + + static void assertIterates(NodeIterator it, String expected) { + Node previous = null; + StringBuilder actual = new StringBuilder(); + while (it.hasNext()) { + Node node = it.next(); + assertNotNull(node); + assertNotSame(previous, node); + + trackSeen(node, actual); + previous = node; + } + assertEquals(expected, actual.toString()); + } + + static void assertContents(Element el, String expected) { + NodeIterator it = NodeIterator.from(el); + 
assertIterates(it, expected); + } + + static void trackSeen(Node node, StringBuilder actual) { + if (node instanceof Element) { + Element el = (Element) node; + actual.append(el.tagName()); + if (el.hasAttr("id")) + actual.append("#").append(el.id()); + } + else if (node instanceof TextNode) + actual.append(((TextNode) node).text()); + else + actual.append(node.nodeName()); + actual.append(";"); + } + +} \ No newline at end of file diff --git a/src/test/java/org/jsoup/nodes/NodeStreamTest.java b/src/test/java/org/jsoup/nodes/NodeStreamTest.java new file mode 100644 index 0000000000..b8aadaf83f --- /dev/null +++ b/src/test/java/org/jsoup/nodes/NodeStreamTest.java @@ -0,0 +1,70 @@ +package org.jsoup.nodes; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import java.util.Optional; +import java.util.stream.Stream; + +import static org.jsoup.nodes.NodeIteratorTest.trackSeen; +import static org.jsoup.nodes.NodeIteratorTest.assertContents; +import static org.junit.jupiter.api.Assertions.*; + +public class NodeStreamTest { + + String html = "

One

Two

Three

Four

"; + + + @Test void canStream() { + Document doc = Jsoup.parse(html); + StringBuilder seen = new StringBuilder(); + Stream stream = doc.nodeStream(); + stream.forEachOrdered(node -> trackSeen(node, seen)); + assertEquals("#root;html;head;body;div#1;p;One;p;Two;div#2;p;Three;p;Four;", seen.toString()); + } + + @Test void canStreamParallel() { + Document doc = Jsoup.parse(html); + long count = doc.nodeStream().parallel().count(); + assertEquals(14, count); + } + + @Test void canFindFirst() { + Document doc = Jsoup.parse(html); + Optional first = doc.nodeStream().findFirst(); + assertTrue(first.isPresent()); + assertSame(doc, first.get()); + } + + @Test void canFilter() { + Document doc = Jsoup.parse(html); + StringBuilder seen = new StringBuilder(); + + doc.nodeStream() + .filter(node -> node instanceof TextNode) + .forEach(node -> trackSeen(node, seen)); + + assertEquals("One;Two;Three;Four;", seen.toString()); + } + + @Test void canRemove() { + String html = "

One

Two

Three

Four

Five"; + Document doc = Jsoup.parse(html); + + doc.nodeStream() + .filter(node -> node instanceof Element) + .filter(node -> node.attr("id").equals("1") || node.attr("id").equals("2")) + .forEach(Node::remove); + + assertContents(doc, "#root;html;head;body;div#3;p;Five;"); + } + + @Test void elementStream() { + Document doc = Jsoup.parse(html); + StringBuilder seen = new StringBuilder(); + Stream stream = doc.stream(); + stream.forEachOrdered(node -> trackSeen(node, seen)); + assertEquals("#root;html;head;body;div#1;p;p;div#2;p;p;", seen.toString()); + } + +} diff --git a/src/test/java/org/jsoup/nodes/PositionTest.java b/src/test/java/org/jsoup/nodes/PositionTest.java index 1db73c9e97..eff86e0455 100644 --- a/src/test/java/org/jsoup/nodes/PositionTest.java +++ b/src/test/java/org/jsoup/nodes/PositionTest.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.*; @@ -177,12 +178,8 @@ class PositionTest { String html = "foobarquxcoo
baz
"; Document doc = Jsoup.parse(html, TrackingParser); - List textNodes = new ArrayList<>(); - NodeTraversor.traverse((Node node, int depth) -> { - if (node instanceof TextNode) { - textNodes.add((TextNode) node); - } - }, doc); + List textNodes = doc.nodeStream(TextNode.class) + .collect(Collectors.toList()); assertEquals(5, textNodes.size()); assertEquals("1,8:7-1,11:10", textNodes.get(0).sourceRange().toString()); diff --git a/src/test/java/org/jsoup/select/TraversorTest.java b/src/test/java/org/jsoup/select/TraversorTest.java index 07f93b0565..2b1da28137 100644 --- a/src/test/java/org/jsoup/select/TraversorTest.java +++ b/src/test/java/org/jsoup/select/TraversorTest.java @@ -8,10 +8,10 @@ import org.jsoup.nodes.TextNode; import org.junit.jupiter.api.Test; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.*; public class TraversorTest { // Note: NodeTraversor.traverse(new NodeVisitor) is tested in @@ -210,4 +210,14 @@ else if (node instanceof TextNode && ((TextNode) node).text().equals("Three")) assertEquals(8, seenCount.get()); // body and contents assertEquals(3, deepest.get()); } + + @Test void seesDocRoot() { + Document doc = Jsoup.parse("

One"); + AtomicBoolean seen = new AtomicBoolean(false); + doc.traverse((node, depth) -> { + if (node.equals(doc)) + seen.set(true); + }); + assertTrue(seen.get()); + } }