Skip to content
Permalink
Browse files
Parse invalid unicode escapes as Windows-1252 instead [Fixes #1034]
  • Loading branch information
yawkat committed Apr 15, 2018
1 parent 0f7e0cc commit 3c699bad0467bec4adc55666f0d02e312ab22a60
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
@@ -13,6 +13,17 @@ final class Tokeniser {
// U+FFFD REPLACEMENT CHARACTER, substituted where the input cannot be used as-is.
static final char replacementChar = '\uFFFD'; // replaces null character
// Whitespace, '<' and '&'. The array is sorted once in the static initializer of this class.
// NOTE(review): presumably the characters that rule out a character reference when they follow '&' -
// confirm against consumeCharacterReference.
private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};

// Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
// win1252ExtensionsStart is the first numeric reference value covered by the table below; the table holds
// 32 entries, so it covers 0x80 through 0x9F. The replacement for a value v in that range is
// win1252Extensions[v - win1252ExtensionsStart].
static final int win1252ExtensionsStart = 0x80;
static final int[] win1252Extensions = new int[] {
// This table could be generated at runtime by decoding each byte with the Windows-1252 charset, but that
// charset is not one of the standard charsets every Java platform must provide, so doing that could break on
// some platforms. The values are hard-coded instead and verified against the charset by a test.
// Entries equal to their own code point (0x81, 0x8D, 0x8F, 0x90, 0x9D) are left unchanged by the substitution.
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
};

// Sort the lookup array once at class-load time so it is in ascending order, as its name promises.
// NOTE(review): presumably required by a binary search elsewhere in this class - the lookup is not visible here.
static {
Arrays.sort(notCharRefCharsSorted);
}
@@ -148,6 +159,12 @@ int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean in
codeRef[0] = replacementChar;
return codeRef;
} else {
// fix illegal unicode characters to match browser behavior
if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
characterReferenceError("character is not a valid unicode code point");
charval = win1252Extensions[charval - win1252ExtensionsStart];
}

// todo: implement number replacement table
// todo: check for extra illegal unicode points as parse errors
codeRef[0] = charval;
@@ -1,5 +1,6 @@
package org.jsoup.parser;

import java.io.UnsupportedEncodingException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Comment;
@@ -154,4 +155,29 @@ public void bufferUpInAttributeVal() {
assertEquals(title, child.getWholeText());
assertEquals(title, doc.title());
}

// Numeric character references in the 0x80-0x9F range must come out as their Windows-1252 characters,
// whether the reference is written in decimal or in hex.
@Test public void cp1252Entities() {
    final String euroSign = "\u20ac";
    final String lowQuote = "\u201a";

    String decimalEuro = Jsoup.parse("&#0128;").text();
    assertEquals(euroSign, decimalEuro);

    String decimalLowQuote = Jsoup.parse("&#0130;").text();
    assertEquals(lowQuote, decimalLowQuote);

    String hexEuro = Jsoup.parse("&#x80;").text();
    assertEquals(euroSign, hexEuro);
}

// A substituted reference is still a parse error: with error tracking switched on via setTrackErrors(10),
// parsing a single such reference must yield the substituted text and exactly one recorded error.
@Test public void cp1252EntitiesProduceError() {
    final String html = "<html><body>&#0128;</body></html>";

    Parser trackingParser = new Parser(new HtmlTreeBuilder());
    trackingParser.setTrackErrors(10);

    String parsedText = trackingParser.parseInput(html, "").text();
    assertEquals("\u20ac", parsedText);

    int recordedErrors = trackingParser.getErrors().size();
    assertEquals(1, recordedErrors);
}

/**
 * Verifies the hard-coded {@code Tokeniser.win1252Extensions} table against the platform's
 * Windows-1252 decoder, one byte at a time.
 *
 * <p>Every byte the decoder can map must match the table entry at the same position. Bytes the
 * decoder reports as U+FFFD (no Windows-1252 assignment) were previously skipped without any
 * check, so a wrong table entry at those positions went unnoticed; they are now required to map
 * to their own code point, i.e. to be left unchanged by the substitution.
 *
 * @throws UnsupportedEncodingException if this platform does not provide the Windows-1252 charset
 */
@Test public void cp1252SubstitutionTable() throws UnsupportedEncodingException {
    for (int i = 0; i < Tokeniser.win1252Extensions.length; i++) {
        int codePoint = i + Tokeniser.win1252ExtensionsStart;
        String s = new String(new byte[]{ (byte) codePoint }, "Windows-1252");
        assertEquals(1, s.length());

        if (s.charAt(0) == '\ufffd') {
            // some of these characters are illegal: the table must pass them through untouched
            assertEquals("At: " + i, codePoint, Tokeniser.win1252Extensions[i]);
            continue;
        }

        assertEquals("At: " + i, s.charAt(0), Tokeniser.win1252Extensions[i]);
    }
}
}

0 comments on commit 3c699ba

Please sign in to comment.