Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

We’re showing branches in this repository, but you can also compare across forks.

base fork: jhy/jsoup
base: 87394199b2
...
head fork: jhy/jsoup
compare: 82f8683129
  • 2 commits
  • 4 files changed
  • 0 commit comments
  • 1 contributor
3  CHANGES
View
@@ -20,6 +20,9 @@ jsoup changelog
* Fixed an issue that prevented frameset documents from being cleaned by the Cleaner.
<https://github.com/jhy/jsoup/issues/154>
+ * Fixed an issue when normalising whitespace for strings containing high-surrogate characters.
+ <https://github.com/jhy/jsoup/issues/214>
+
*** Release 1.6.3 [2012-May-28]
* Fixed parsing of group-or commas in CSS selectors, to correctly handle sub-queries containing commas.
<https://github.com/jhy/jsoup/issues/179>
2  pom.xml
View
@@ -75,7 +75,9 @@
</executions>
</plugin>
<plugin>
+ <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
+ <version>2.2</version>
<configuration>
<archive>
<manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
5 src/main/java/org/jsoup/helper/StringUtil.java
View
@@ -110,8 +110,9 @@ public static String normaliseWhitespace(String string) {
boolean modified = false;
int l = string.length();
- for (int i = 0; i < l; i++) {
- int c = string.codePointAt(i);
+ int c;
+ for (int i = 0; i < l; i+= Character.charCount(c)) {
+ c = string.codePointAt(i);
if (isWhitespace(c)) {
if (lastWasWhite) {
modified = true;
10 src/test/java/org/jsoup/helper/StringUtilTest.java
View
@@ -1,5 +1,6 @@
package org.jsoup.helper;
+import org.jsoup.Jsoup;
import org.junit.Test;
import java.util.Arrays;
@@ -73,4 +74,13 @@
assertTrue(check2 != StringUtil.normaliseWhitespace(check2));
assertTrue(check3 != StringUtil.normaliseWhitespace(check3));
}
+
+ @Test public void normaliseWhiteSpaceHandlesHighSurrogates() {
+ String test71540chars = "\ud869\udeb2\u304b\u309a  1";
+ String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1";
+
+ assertEquals(test71540charsExpectedSingleWhitespace, StringUtil.normaliseWhitespace(test71540chars));
+ String extractedText = Jsoup.parse(test71540chars).text();
+ assertEquals(test71540charsExpectedSingleWhitespace, extractedText);
+ }
}

No commit comments for this range

Something went wrong with that request. Please try again.