Skip to content

Commit

Permalink
Added NodeIterator and Stream support
Browse files Browse the repository at this point in the history
Improvement: added the NodeIterator class, to efficiently traverse a node tree using the Iterator interface.

And added Stream Element#stream() and Node#nodeStream() methods, to enable fluent composable stream pipelines of node traversals.

NodeIterator only hits nodes once (vs head and tail for NodeTraversor), is restartable, supports modifications of the node it just emitted (e.g. replace, remove), supports type filtering, and emits in document order.

Refactored most head-only uses of NodeTraversor to use the NodeIterator or a Stream backed by it.
  • Loading branch information
jhy committed Nov 15, 2023
1 parent c46870c commit 85fe7e3
Show file tree
Hide file tree
Showing 15 changed files with 626 additions and 76 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Release 1.17.1 [PENDING]
`#replaceAll(operator)`. These methods update the original DOM, as well as the Elements list.
<https://github.com/jhy/jsoup/pull/2017>

* Improvement: added the NodeIterator class, to efficiently traverse a node tree using the Iterator interface. And
added Stream Element#stream() and Node#nodeStream() methods, to enable fluent composable stream pipelines of node
traversals.

* Improvement: when changing the OutputSettings syntax to XML, the xhtml EscapeMode is automatically set by default.

* Improvement: added the `:is(selector list)` pseudo-selector, which finds elements that match any of the selectors in
Expand Down
20 changes: 20 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@
<ignore>java.io.UncheckedIOException</ignore>
<ignore>java.util.function.Predicate</ignore>
<ignore>java.util.function.UnaryOperator</ignore>
<ignore>java.util.stream.Stream</ignore>
<ignore>java.util.stream.StreamSupport</ignore>
<ignore>java.util.Spliterator</ignore>
<ignore>java.util.Spliterators</ignore>
<ignore>java.util.Optional</ignore>
<ignore>java.util.stream.Collector</ignore>
<ignore>java.util.stream.Collectors</ignore>

<ignore>java.net.HttpURLConnection</ignore><!-- .setAuthenticator(java.net.Authenticator) in Java 9; only used in multirelease 9+ version -->
</ignores>
<!-- ^ Provided by https://developer.android.com/studio/write/java8-support#library-desugaring
Expand Down Expand Up @@ -246,6 +254,18 @@
<binaryCompatible>true</binaryCompatible>
<sourceCompatible>true</sourceCompatible>
</overrideCompatibilityChangeParameter>

<!--
One-off: the compatibility check reports a spurious change after adding [<T extends Node> Stream<T> nodeStream(Class<T> class)] to Node.
Manually verified binary & source compatibility
todo: remove after 1.17.1 release
-->
<overrideCompatibilityChangeParameter>
<compatibilityChange>CLASS_GENERIC_TEMPLATE_CHANGED</compatibilityChange>
<binaryCompatible>true</binaryCompatible>
<sourceCompatible>true</sourceCompatible>
</overrideCompatibilityChangeParameter>

</overrideCompatibilityChangeParameters>
</parameter>
</configuration>
Expand Down
43 changes: 32 additions & 11 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Stream;

import static org.jsoup.internal.Normalizer.normalize;
import static org.jsoup.nodes.TextNode.lastCharIsWhitespace;
Expand Down Expand Up @@ -378,6 +379,15 @@ void nodelistChanged() {
shadowChildrenRef = null;
}

/**
Returns a Stream of this Element and all of its descendant Elements. The stream has document order.
<p>The stream is built by {@code NodeUtils.stream(this, Element.class)}, so only nodes that are instances of
{@code Element} are emitted.</p>
@return a stream of this element and its descendants.
@see #nodeStream()
*/
public Stream<Element> stream() {
return NodeUtils.stream(this, Element.class);
}

/**
* Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated.
* <p>
Expand Down Expand Up @@ -454,7 +464,6 @@ public Elements select(Evaluator evaluator) {
return Selector.select(evaluator, this);
}


/**
* Find the first Element that matches the {@link Selector} CSS query, with this element as the starting context.
* <p>This is effectively the same as calling {@code element.select(query).first()}, but is more efficient as query
Expand Down Expand Up @@ -697,7 +706,7 @@ public Element insertChildren(int index, Node... children) {
}

/**
* Create a new element by tag name, and add it as the last child.
* Create a new element by tag name, and add it as this Element's last child.
*
* @param tagName the name of the tag (e.g. {@code div}).
* @return the new element, to allow you to add content to it, e.g.:
Expand All @@ -707,14 +716,21 @@ public Element appendElement(String tagName) {
return appendElement(tagName, tag.namespace());
}

/**
 * Creates a new element from the given tag name and namespace, and appends it as this Element's last child. The
 * tag is resolved using the settings of this element's parser, and the new element takes this element's base URI.
 *
 * @param tagName the name of the tag (e.g. {@code div}).
 * @param namespace the namespace of the tag (e.g. {@link Parser#NamespaceHtml})
 * @return the newly created element, in the specified namespace
 */
public Element appendElement(String tagName, String namespace) {
    final Tag childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings());
    final Element appended = new Element(childTag, baseUri());
    appendChild(appended);
    return appended;
}

/**
* Create a new element by tag name, and add it as the first child.
* Create a new element by tag name, and add it as this Element's first child.
*
* @param tagName the name of the tag (e.g. {@code div}).
* @return the new element, to allow you to add content to it, e.g.:
Expand All @@ -724,6 +740,13 @@ public Element prependElement(String tagName) {
return prependElement(tagName, tag.namespace());
}

/**
* Create a new element by tag name and namespace, and add it as this Element's first child.
*
* @param tagName the name of the tag (e.g. {@code div}).
* @param namespace the namespace of the tag (e.g. {@link Parser#NamespaceHtml})
* @return the new element, in the specified namespace
*/
public Element prependElement(String tagName, String namespace) {
Element child = new Element(Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()), baseUri());
prependChild(child);
Expand Down Expand Up @@ -1389,7 +1412,7 @@ public void tail(Node node, int depth) {
*/
public String wholeText() {
// Text is accumulated into a builder borrowed from StringUtil and handed back via releaseBuilder at the end.
final StringBuilder accum = StringUtil.borrowBuilder();
// NOTE(review): this diff view carries no +/- markers, so both sides of the change appear below. The
// NodeTraversor.traverse(...) call looks like the removed line and nodeStream().forEach(...) its replacement --
// confirm against the commit; as literally shown, every node would be appended twice.
NodeTraversor.traverse((node, depth) -> appendWholeText(node, accum), this);
nodeStream().forEach(node -> appendWholeText(node, accum));
return StringUtil.releaseBuilder(accum);
}

Expand All @@ -1402,7 +1425,7 @@ private static void appendWholeText(Node node, StringBuilder accum) {
}

/**
Get the non-normalized, decoded text of this element, <b>not including</b> any child elements, including only any
Get the non-normalized, decoded text of this element, <b>not including</b> any child elements, including any
newlines and spaces present in the original source.
@return decoded, non-normalized text that is a direct child of this Element
@see #text()
Expand Down Expand Up @@ -1849,17 +1872,15 @@ public Element forEachNode(Consumer<? super Node> action) {
@param action the function to perform on the element
@return this Element, for chaining
@see Node#forEachNode(Consumer)
@deprecated use {@link #stream()}.{@link Stream#forEach(Consumer) forEach(Consumer)} instead. (Removing this method
so Element can implement Iterable, which this signature conflicts with due to the non-void return.)
*/
@Deprecated
public Element forEach(Consumer<? super Element> action) {
// Reject a null action up front, before any traversal starts.
Validate.notNull(action);
// NOTE(review): this diff view carries no +/- markers, so both sides of the change appear below. The
// NodeTraversor.traverse(...) statement looks like the removed code and stream().forEach(action) its
// replacement -- confirm against the commit; as literally shown, the action would run twice per element.
NodeTraversor.traverse((node, depth) -> {
if (node instanceof Element)
action.accept((Element) node);
}, this);
stream().forEach(action);
return this;
}


@Override
public Element filter(NodeFilter nodeFilter) {
return (Element) super.filter(nodeFilter);
Expand Down
27 changes: 24 additions & 3 deletions src/main/java/org/jsoup/nodes/Node.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.function.Consumer;
import java.util.stream.Stream;

/**
The base, abstract Node model. Elements, Documents, Comments etc are all Node instances.
Expand Down Expand Up @@ -225,7 +226,8 @@ public String absUrl(String attributeKey) {
/**
Get a child node by its 0-based index.
@param index index of child node
@return the child node at this index. Throws a {@code IndexOutOfBoundsException} if the index is out of bounds.
@return the child node at this index.
@throws IndexOutOfBoundsException if the index is out of bounds.
*/
public Node childNode(int index) {
return ensureChildNodes().get(index);
Expand Down Expand Up @@ -682,12 +684,12 @@ public Node traverse(NodeVisitor nodeVisitor) {
*/
public Node forEachNode(Consumer<? super Node> action) {
// Reject a null action up front, before any traversal starts.
Validate.notNull(action);
// NOTE(review): this diff view carries no +/- markers, so both sides of the change appear below. The
// NodeTraversor.traverse(...) call looks like the removed line and nodeStream().forEach(action) its
// replacement -- confirm against the commit; as literally shown, the action would run twice per node.
NodeTraversor.traverse((node, depth) -> action.accept(node), this);
nodeStream().forEach(action);
return this;
}

/**
* Perform a depth-first filtering through this node and its descendants.
* Perform a depth-first filtered traversal through this node and its descendants.
* @param nodeFilter the filter callbacks to perform on each node
* @return this node, for chaining
*/
Expand All @@ -697,6 +699,25 @@ public Node filter(NodeFilter nodeFilter) {
return this;
}

/**
Returns a Stream of this Node and all of its descendant Nodes. The stream has document order.
<p>The stream is built by {@code NodeUtils.stream(this, Node.class)}, i.e. with {@code Node} as the type filter, so
nodes of every type are emitted.</p>
@return a stream of all nodes.
@see Element#stream()
*/
public Stream<Node> nodeStream() {
return NodeUtils.stream(this, Node.class);
}

/**
Returns a Stream of this and descendant nodes, containing only nodes of the specified type. The stream has document
order.
@param type the node class to filter for; only nodes that are instances of this class are emitted.
@param <T> the node type emitted by the returned stream
@return a stream of nodes filtered by type.
@see #nodeStream()
@see Element#stream()
*/
public <T extends Node> Stream<T> nodeStream(Class<T> type) {
return NodeUtils.stream(this, type);
}

/**
Get the outer HTML of this node. For example, on a {@code p} element, may return {@code <p>Para</p>}.
@return outer HTML
Expand Down
123 changes: 123 additions & 0 deletions src/main/java/org/jsoup/nodes/NodeIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package org.jsoup.nodes;

import org.jsoup.helper.Validate;
import org.jspecify.annotations.Nullable;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 Iterates through a Node and its tree of descendants, in document order, and returns nodes of the specified type. This
 iterator supports structural changes to the tree during the traversal, such as {@link Node#remove()},
 {@link Node#replaceWith(Node)}, {@link Node#wrap(String)}, etc.
 <p>See also the {@link org.jsoup.select.NodeTraversor NodeTraversor} if {@code head} and {@code tail} callbacks are
 desired for each node.</p>
 @param <T> the type of node emitted by this iterator
 */
public class NodeIterator<T extends Node> implements Iterator<T> {
    private Node root;                      // root / starting node
    private @Nullable T next;               // the next node to return
    private Node current;                   // the current (last emitted) node
    private Node previous;                  // the previously emitted node; used to recover from structural changes
    private @Nullable Node currentParent;   // the current node's parent; used to detect structural changes
    private final Class<T> type;            // the desired node class type

    /**
     Create a NodeIterator that will iterate the supplied node, and all of its descendants. The returned {@link #next}
     type will be filtered to the input type.
     @param start initial node
     @param type node type to filter for
     */
    public NodeIterator(Node start, Class<T> type) {
        Validate.notNull(start);
        Validate.notNull(type);
        this.type = type;

        restart(start);
    }

    /**
     Create a NodeIterator that will iterate the supplied node, and all of its descendants. All node types will be
     returned.
     @param start initial node
     @return a new iterator over {@code start} and its descendants, unfiltered
     */
    public static NodeIterator<Node> from(Node start) {
        return new NodeIterator<>(start, Node.class);
    }

    /**
     Restart this Iterator from the specified start node. Will act as if it were newly constructed. Useful, for
     example, to save some GC when the iterator is reused in a tight loop.
     @param start the new start node.
     */
    public void restart(Node start) {
        // validate before touching any state, so a bad argument cannot leave the iterator half-reset
        Validate.notNull(start);

        if (type.isInstance(start))
            //noinspection unchecked
            next = (T) start; // first next() will be the start node
        else
            next = null; // discard any look-ahead left over from the previous traversal

        root = previous = current = start;
        currentParent = current.parent();
    }

    /**
     Checks if there is another node of the desired type to emit, looking ahead in the tree if required.
     @return true if a call to {@link #next()} will return a node
     */
    @Override public boolean hasNext() {
        maybeFindNext();
        return next != null;
    }

    /**
     Returns the next node of the desired type, in document order.
     @return the next node
     @throws NoSuchElementException if the traversal is complete
     */
    @Override public T next() {
        maybeFindNext();
        if (next == null) throw new NoSuchElementException();

        T result = next;
        previous = current;
        current = next;
        currentParent = current.parent(); // remembered so a later remove / replace of current can be detected
        next = null;
        return result;
    }

    /**
     If next is null, looks for and sets next. If next is still null after this, we have reached the end.
     */
    private void maybeFindNext() {
        if (next != null) return;

        // change detected (removed or replaced), redo from previous
        if (currentParent != null && !current.hasParent())
            current = previous;

        next = findNextNode();
    }

    /**
     Walks forward from the current node to the next node of the desired type: children first, then following
     siblings, then the following siblings of ancestors, never leaving the root's subtree.
     @return the next matching node, or null if the traversal is complete
     */
    private @Nullable T findNextNode() {
        Node node = current;
        while (true) {
            if (node.childNodeSize() > 0)
                node = node.childNode(0);                   // descend children
            else if (root.equals(node))
                node = null;                                // complete when all children of root are fully visited
            else if (node.nextSibling() != null)
                node = node.nextSibling();                  // in a descendant with no more children; traverse
            else {
                while (true) {
                    node = node.parent();                   // pop out of descendants
                    if (node == null || root.equals(node))
                        return null;                        // got back to root; complete
                    if (node.nextSibling() != null) {
                        node = node.nextSibling();          // traverse
                        break;
                    }
                }
            }
            if (node == null)
                return null;                                // reached the end

            if (type.isInstance(node))
                //noinspection unchecked
                return (T) node;
        }
    }

    /**
     Removes the current node (the one most recently returned by {@link #next()}) from the tree, by calling
     {@link Node#remove()} on it. If {@code next()} has not been called since construction or the last
     {@link #restart(Node)}, the current node is the start node.
     */
    @Override public void remove() {
        current.remove();
    }
}
19 changes: 19 additions & 0 deletions src/main/java/org/jsoup/nodes/NodeUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
import org.jsoup.parser.Parser;
import org.w3c.dom.NodeList;

import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
* Internal helpers for Nodes, to keep the actual node APIs relatively clean. A jsoup internal class, so don't use it as
Expand Down Expand Up @@ -47,4 +52,18 @@ static <T extends Node> List<T> selectXpath(String xpath, Element el, Class<T> n
NodeList nodeList = w3c.selectXpath(xpath, contextNode);
return w3c.sourceNodes(nodeList, nodeType);
}

/**
 Creates a sequential Stream that starts with the supplied node and is backed by a {@link NodeIterator} filtered to
 the given node type.
 */
static <T extends Node> Stream<T> stream(Node start, Class<T> type) {
    final Spliterator<T> nodes = spliterator(new NodeIterator<>(start, type));
    return StreamSupport.stream(nodes, false);
}

/** Wraps the iterator in a Spliterator of unknown size that reports ordered, distinct, non-null elements. */
static <T extends Node> Spliterator<T> spliterator(Iterator<T> iterator) {
    final int characteristics = Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;
    return Spliterators.spliteratorUnknownSize(iterator, characteristics);
}
}
Loading

0 comments on commit 85fe7e3

Please sign in to comment.