Skip to content

Commit

Permalink
Report the correct error position in some malformed constructs
Browse files Browse the repository at this point in the history
Merges #1253

Fixes #1251

Authored by @csaboka, but I rewrote the test file to fix the diff up.
  • Loading branch information
jhy committed Jan 27, 2020
1 parent de97030 commit c2b1fe7
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public final class CharacterReader {
static final char EOF = (char) -1;
private static final int maxStringCacheLen = 12;
static final int maxBufferLen = 1024 * 32; // visible for testing
private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing
private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this.

private final char[] charBuf;
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ void read(Tokeniser t, CharacterReader r) {
t.transition(SelfClosingStartTag);
break;
case '<': // NOTE: out of spec, but clear author intent
t.error(this);
r.unconsume();
t.error(this);
// intended fall through to next >
case '>':
t.emitTagPending();
Expand Down Expand Up @@ -572,17 +572,17 @@ void read(Tokeniser t, CharacterReader r) {
t.transition(SelfClosingStartTag);
break;
case '<': // NOTE: out of spec, but clear (spec has this as a part of the attribute name)
t.error(this);
r.unconsume();
t.error(this);
// intended fall through as if >
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
r.unconsume();
t.error(this);
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
break;
case eof:
Expand Down Expand Up @@ -880,8 +880,8 @@ void read(Tokeniser t, CharacterReader r) {
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.error(this);
t.transition(BeforeAttributeName);
}

Expand All @@ -901,8 +901,8 @@ void read(Tokeniser t, CharacterReader r) {
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.error(this);
t.transition(BeforeAttributeName);
}
}
Expand Down
23 changes: 23 additions & 0 deletions src/test/java/org/jsoup/parser/ParserIT.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package org.jsoup.parser;

import org.junit.Test;

/**
* Longer running Parser tests.
*/

public class ParserIT {
    /**
     * Regression check for https://github.com/jhy/jsoup/issues/1251.
     *
     * Parses the malformed snippet {@code <a href=""ca} with error tracking enabled,
     * once for every amount of leading-space padding from 0 up to 99,999 characters.
     * Any exception thrown by the parser is rethrown as an AssertionError that names
     * the input length at which the failure happened.
     */
    @Test
    public void testIssue1251() {
        final int maxLeadingSpaces = 100000;
        StringBuilder html = new StringBuilder("<a href=\"\"ca");
        int leadingSpaces = 0;
        while (leadingSpaces < maxLeadingSpaces) {
            try {
                Parser.htmlParser().setTrackErrors(1).parseInput(html.toString(), "");
            } catch (Exception e) {
                throw new AssertionError("failed at length " + html.length(), e);
            }
            // Grow the padding by one space before the next attempt.
            html.insert(leadingSpaces, ' ');
            leadingSpaces++;
        }
    }
}
43 changes: 43 additions & 0 deletions src/test/java/org/jsoup/parser/TokeniserStateTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,47 @@ public void testPublicAndSystemIdentifiersWithWhitespace() {
Document doc = Jsoup.parse(html);
assertEquals("<p></p><p></p><div id=\"one\"><span>Two</span></div>", TextUtil.stripNewlines(doc.body().html()));
}

/**
 * Pads the malformed snippet with spaces so that the 'f' of "foo" sits at index
 * {@code CharacterReader.readAheadLimit - 1} of the input, then checks that the
 * first tracked parse error is reported at exactly that position.
 */
@Test
public void testUnconsumeAtBufferBoundary() {
    String snippet = "<a href=\"\"foo";
    // The "foo" part must be just at the limit.
    int padLength = CharacterReader.readAheadLimit - snippet.length() + 2;
    char[] spaces = new char[padLength];
    Arrays.fill(spaces, ' ');
    String input = new String(spaces) + snippet;

    ParseErrorList errors = ParseErrorList.tracking(1);
    Parser.parseFragment(input, null, "", errors);

    int expectedPosition = CharacterReader.readAheadLimit - 1;
    assertEquals(expectedPosition, errors.get(0).getPosition());
}

/**
 * A '<' appearing where an attribute name is expected: the first tracked error
 * must be reported at position 6, the index of the second '<' in the input.
 */
@Test
public void testOpeningAngleBracketInsteadOfAttribute() {
    ParseErrorList errors = ParseErrorList.tracking(1);

    Parser.parseFragment("<html <", null, "", errors);

    int firstErrorPosition = errors.get(0).getPosition();
    assertEquals(6, firstErrorPosition);
}

/**
 * A '/' inside a tag that is not followed by '>': the first tracked error must be
 * reported at position 7, the index of the 'o' that follows the slash.
 */
@Test
public void testMalformedSelfClosingTag() {
    ParseErrorList errors = ParseErrorList.tracking(1);

    Parser.parseFragment("<html /ouch", null, "", errors);

    int firstErrorPosition = errors.get(0).getPosition();
    assertEquals(7, firstErrorPosition);
}

/**
 * A '<' directly after a tag name: the first tracked error must be reported at
 * position 5, the index of the second '<' in the input.
 */
@Test
public void testOpeningAngleBracketInTagName() {
    ParseErrorList errors = ParseErrorList.tracking(1);

    Parser.parseFragment("<html<", null, "", errors);

    int firstErrorPosition = errors.get(0).getPosition();
    assertEquals(5, firstErrorPosition);
}
}

0 comments on commit c2b1fe7

Please sign in to comment.