From d09a1069790832e53171bb7d1c8a18f8ddcbfeab Mon Sep 17 00:00:00 2001
From: Benoit Tellier
Date: Fri, 1 Jul 2022 11:16:36 +0700
Subject: [PATCH] ISSUE-1773 Tokeniser: recycle string builders too

---
Reviewer notes (free text in this section is ignored by `git am`):

StringBuilderRecycler keeps released StringBuilders in a list. get(minSize)
takes the most recently released builder; if its capacity is at least
minSize it is reset to length 0 and returned, otherwise it is discarded and
a new StringBuilder(minSize) is allocated. Tokeniser reaches the recycler
through a ThreadLocal holding a SoftReference, and now obtains charsBuilder
and dataBuffer from it instead of allocating new 1024-capacity builders.
TreeBuilder.parse() calls tokeniser.release() before dropping the tokeniser
so both builders are handed back to the recycler.

NOTE(review): the From: header carries no e-mail address; `git am` may
require one -- confirm against the original commit.
NOTE(review): releaseByteBuffer() takes a StringBuilder; the name looks
carried over from a byte-buffer recycler -- consider renaming in a follow-up.
NOTE(review): nothing in this patch bounds how many builders the recycler
retains, nor their capacity -- confirm that is intended.
NOTE(review): leading whitespace inside the hunks assumes the target files
use four-space indentation -- verify with `git apply --check`.

 .../jsoup/helper/StringBuilderRecycler.java   | 23 ++++++++++++++++
 src/main/java/org/jsoup/parser/Tokeniser.java | 27 +++++++++++++++++--
 .../java/org/jsoup/parser/TreeBuilder.java    |  1 +
 3 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 src/main/java/org/jsoup/helper/StringBuilderRecycler.java

diff --git a/src/main/java/org/jsoup/helper/StringBuilderRecycler.java b/src/main/java/org/jsoup/helper/StringBuilderRecycler.java
new file mode 100644
index 0000000000..35f6adbb0c
--- /dev/null
+++ b/src/main/java/org/jsoup/helper/StringBuilderRecycler.java
@@ -0,0 +1,23 @@
+package org.jsoup.helper;
+
+import java.util.ArrayList;
+
+public class StringBuilderRecycler {
+    protected final ArrayList<StringBuilder> stringBuilders = new ArrayList<>();
+
+    public StringBuilder get(int minSize) {
+        if (!stringBuilders.isEmpty()) {
+            StringBuilder stringBuilder = stringBuilders.remove(stringBuilders.size() - 1);
+            // Too small string builders are thrown away
+            if (stringBuilder.capacity() >= minSize) {
+                stringBuilder.setLength(0);
+                return stringBuilder;
+            }
+        }
+        return new StringBuilder(minSize);
+    }
+
+    public void releaseByteBuffer(StringBuilder stringBuilder) {
+        stringBuilders.add(stringBuilder);
+    }
+}
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
index be00b7f63c..3d0771b73d 100644
--- a/src/main/java/org/jsoup/parser/Tokeniser.java
+++ b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -1,16 +1,33 @@
 package org.jsoup.parser;
 
+import org.jsoup.helper.StringBuilderRecycler;
 import org.jsoup.helper.Validate;
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Entities;
 
 import javax.annotation.Nullable;
+
+import java.lang.ref.SoftReference;
 import java.util.Arrays;
 
 /**
  * Readers the input stream into tokens.
  */
 final class Tokeniser {
+    protected static final ThreadLocal<SoftReference<StringBuilderRecycler>> stringCacheRef = new ThreadLocal<>();
+
+    public static StringBuilderRecycler getStringBuilderCache() {
+        SoftReference<StringBuilderRecycler> ref = stringCacheRef.get();
+        StringBuilderRecycler br = (ref == null) ? null : ref.get();
+
+        if (br == null) {
+            br = new StringBuilderRecycler();
+            ref = new SoftReference<>(br);
+            stringCacheRef.set(ref);
+        }
+        return br;
+    }
+
     static final char replacementChar = '\uFFFD'; // replaces null character
     private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
 
@@ -37,8 +54,8 @@ final class Tokeniser {
     @Nullable private Token emitPending = null; // the token we are about to emit on next read
     private boolean isEmitPending = false;
     @Nullable private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
-    private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
-    StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
+    private StringBuilder charsBuilder = getStringBuilderCache().get(1024); // buffers characters to output as one token, if more than one emit per read
+    StringBuilder dataBuffer = getStringBuilderCache().get(1024); // buffers data looking for </script>
 
     Token.StartTag startPending = new Token.StartTag();
     Token.EndTag endPending = new Token.EndTag();
@@ -360,4 +377,10 @@ String unescapeEntities(boolean inAttribute) {
         }
         return StringUtil.releaseBuilder(builder);
     }
+
+    void release() {
+        StringBuilderRecycler stringBuilderCache = getStringBuilderCache();
+        stringBuilderCache.releaseByteBuffer(charsBuilder);
+        stringBuilderCache.releaseByteBuffer(dataBuffer);
+    }
 }
diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java
index 902cbdf0ae..877cce82d8 100644
--- a/src/main/java/org/jsoup/parser/TreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/TreeBuilder.java
@@ -63,6 +63,7 @@ Document parse(Reader input, String baseUri, Parser parser) {
         // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments
         reader.close();
         reader = null;
+        tokeniser.release();
         tokeniser = null;
         stack = null;
         seenTags = null;