Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: jhy/jsoup
base: 87394199b2
...
head fork: jhy/jsoup
compare: 82f8683129
  • 2 commits
  • 4 files changed
  • 0 commit comments
  • 1 contributor
View
3  CHANGES
@@ -20,6 +20,9 @@ jsoup changelog
* Fixed an issue that prevented frameset documents from being cleaned by the Cleaner.
<https://github.com/jhy/jsoup/issues/154>
+ * Fixed an issue when normalising whitespace for strings containing high-surrogate characters.
+ <https://github.com/jhy/jsoup/issues/214>
+
*** Release 1.6.3 [2012-May-28]
* Fixed parsing of group-or commas in CSS selectors, to correctly handle sub-queries containing commas.
<https://github.com/jhy/jsoup/issues/179>
View
2  pom.xml
@@ -75,7 +75,9 @@
</executions>
</plugin>
<plugin>
+ <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
+ <version>2.2</version>
<configuration>
<archive>
<manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
View
5 src/main/java/org/jsoup/helper/StringUtil.java
@@ -110,8 +110,9 @@ public static String normaliseWhitespace(String string) {
boolean modified = false;
int l = string.length();
- for (int i = 0; i < l; i++) {
- int c = string.codePointAt(i);
+ int c;
+ for (int i = 0; i < l; i+= Character.charCount(c)) {
+ c = string.codePointAt(i);
if (isWhitespace(c)) {
if (lastWasWhite) {
modified = true;
View
10 src/test/java/org/jsoup/helper/StringUtilTest.java
@@ -1,5 +1,6 @@
package org.jsoup.helper;
+import org.jsoup.Jsoup;
import org.junit.Test;
import java.util.Arrays;
@@ -73,4 +74,13 @@
assertTrue(check2 != StringUtil.normaliseWhitespace(check2));
assertTrue(check3 != StringUtil.normaliseWhitespace(check3));
}
+
+ @Test public void normaliseWhiteSpaceHandlesHighSurrogates() {
+ String test71540chars = "\ud869\udeb2\u304b\u309a 1";
+ String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1";
+
+ assertEquals(test71540charsExpectedSingleWhitespace, StringUtil.normaliseWhitespace(test71540chars));
+ String extractedText = Jsoup.parse(test71540chars).text();
+ assertEquals(test71540charsExpectedSingleWhitespace, extractedText);
+ }
}

No commit comments for this range

Something went wrong with that request. Please try again.