Skip to content

Commit

Permalink
Reimplementation of parser and tokeniser, to make jsoup a HTML5 confo…
Browse files Browse the repository at this point in the history
…rmat parser, against the

http://whatwg.org/html spec.
  • Loading branch information
jhy committed Jul 2, 2011
1 parent 481542f commit 8749726
Show file tree
Hide file tree
Showing 27 changed files with 5,295 additions and 816 deletions.
8 changes: 8 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
jsoup changelog

*** Release 1.6.0 [PENDING]
* HTML5 conformant parser. Complete reimplementation of HTML tokenisation and parsing, to implement the
http://whatwg.org/html spec. This ensures jsoup parses HTML identically to current modern browsers.

* When parsing files from disk, files are loaded via memory mapping, to increase parse speed.

* Reduced memory overhead and lowered garbage collector pressure with Attribute, Node and Entity model optimisations.

*** Release 1.5.2 [2011-02-27]
* Fixed issue with selector parser where some boolean AND + OR combined queries (e.g. "meta[http-equiv], meta[content]")
were being parsed incorrectly as OR only queries (e.g. former as "meta, [http-equiv], meta[content]")
Expand Down
7 changes: 5 additions & 2 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -30,10 +32,11 @@ private DataUtil() {}
* @throws IOException on IO error
*/
public static Document load(File in, String charsetName, String baseUri) throws IOException {
InputStream inStream = null;
FileInputStream inStream = null;
try {
inStream = new FileInputStream(in);
return load(inStream, charsetName, baseUri);
MappedByteBuffer byteData = inStream.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, in.length());
return parseByteData(byteData, charsetName, baseUri);
} finally {
if (inStream != null)
inStream.close();
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/org/jsoup/helper/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,12 @@ public static String normaliseWhitespace(String string) {
}
return modified ? sb.toString() : string;
}

/**
 * Tests whether the needle equals any of the haystack entries.
 * @param needle string to look for
 * @param haystack candidate strings, each compared against the needle with {@code equals}
 * @return true if at least one haystack entry equals the needle, false otherwise
 */
public static boolean in(String needle, String... haystack) {
    boolean found = false;
    // stop scanning as soon as a match has been seen
    for (int i = 0; i < haystack.length && !found; i++) {
        found = haystack[i].equals(needle);
    }
    return found;
}
}
27 changes: 27 additions & 0 deletions src/main/java/org/jsoup/helper/Validate.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,25 @@ public static void isTrue(boolean val, String msg) {
throw new IllegalArgumentException(msg);
}

/**
 * Validates that the supplied value is false.
 * @param val the boolean to test
 * @throws IllegalArgumentException if the value is true
 */
public static void isFalse(boolean val) {
    if (!val) {
        return;
    }
    throw new IllegalArgumentException("Must be false");
}

/**
 * Validates that the supplied value is false, failing with a caller-supplied message.
 * @param val the boolean to test
 * @param msg message carried by the exception if validation fails
 * @throws IllegalArgumentException if the value is true
 */
public static void isFalse(boolean val, String msg) {
    if (!val) {
        return;
    }
    throw new IllegalArgumentException(msg);
}

/**
* Validates that the array contains no null elements
* @param objects the array to test
Expand Down Expand Up @@ -82,4 +101,12 @@ public static void notEmpty(String string, String msg) {
if (string == null || string.length() == 0)
throw new IllegalArgumentException(msg);
}

/**
 * Unconditionally signals a validation failure.
 * @param msg message carried by the exception
 * @throws IllegalArgumentException always
 */
public static void fail(String msg) {
    final IllegalArgumentException failure = new IllegalArgumentException(msg);
    throw failure;
}
}
34 changes: 31 additions & 3 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
public class Attributes implements Iterable<Attribute>, Cloneable {
protected static final String dataPrefix = "data-";

private LinkedHashMap<String, Attribute> attributes = new LinkedHashMap<String, Attribute>(2);
private LinkedHashMap<String, Attribute> attributes = null;
// linked hash map to preserve insertion order.
// null by default, as so many elements have no attributes -- saves a good chunk of memory

/**
Get an attribute value by key.
Expand All @@ -28,7 +29,10 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
*/
public String get(String key) {
    Validate.notEmpty(key);

    // The backing map stays null until the first attribute is stored; keys are looked up lower-cased.
    Attribute attr = (attributes == null) ? null : attributes.get(key.toLowerCase());
    if (attr == null) {
        return "";
    }
    return attr.getValue();
}
Expand All @@ -49,6 +53,8 @@ public void put(String key, String value) {
*/
public void put(Attribute attribute) {
    Validate.notNull(attribute);

    // The backing map is created lazily, on the first insertion.
    LinkedHashMap<String, Attribute> target = attributes;
    if (target == null) {
        target = new LinkedHashMap<String, Attribute>(2);
        attributes = target;
    }
    target.put(attribute.getKey(), attribute);
}

Expand All @@ -58,6 +64,8 @@ public void put(Attribute attribute) {
*/
public void remove(String key) {
    Validate.notEmpty(key);

    // Nothing to remove when no attribute has ever been stored.
    if (attributes != null) {
        attributes.remove(key.toLowerCase());
    }
}

Expand All @@ -67,14 +75,16 @@ public void remove(String key) {
@return true if key exists, false otherwise
*/
public boolean hasKey(String key) {
    // The backing map is null until the first attribute is stored, so guard before dereferencing it.
    // The key is lower-cased before the lookup, matching get() and remove().
    return attributes != null && attributes.containsKey(key.toLowerCase());
}

/**
Get the number of attributes in this set.
@return size
*/
public int size() {
    // a null backing map means no attributes have been stored yet
    return (attributes == null) ? 0 : attributes.size();
}

Expand All @@ -83,6 +93,10 @@ public int size() {
@param incoming attributes to add to these attributes.
*/
public void addAll(Attributes incoming) {
    final int incomingCount = incoming.size();
    if (incomingCount > 0) {
        // allocate the backing map only when there is something to copy into it
        if (attributes == null) {
            attributes = new LinkedHashMap<String, Attribute>(incomingCount);
        }
        attributes.putAll(incoming.attributes);
    }
}

Expand All @@ -96,6 +110,9 @@ public Iterator<Attribute> iterator() {
@return an view of the attributes as a List.
*/
public List<Attribute> asList() {
if (attributes == null)
return Collections.emptyList();

List<Attribute> list = new ArrayList<Attribute>(attributes.size());
for (Map.Entry<String, Attribute> entry : attributes.entrySet()) {
list.add(entry.getValue());
Expand Down Expand Up @@ -123,6 +140,9 @@ public String html() {
}

void html(StringBuilder accum, Document.OutputSettings out) {
if (attributes == null)
return;

for (Map.Entry<String, Attribute> entry : attributes.entrySet()) {
Attribute attribute = entry.getValue();
accum.append(" ");
Expand Down Expand Up @@ -153,6 +173,9 @@ public int hashCode() {

@Override
public Attributes clone() {
if (attributes == null)
return new Attributes();

Attributes clone;
try {
clone = (Attributes) super.clone();
Expand All @@ -167,6 +190,11 @@ public Attributes clone() {

private class Dataset extends AbstractMap<String, String> {

private Dataset() {
    // Make sure the enclosing Attributes has a backing map before this view is used.
    if (attributes != null) {
        return;
    }
    attributes = new LinkedHashMap<String, Attribute>(2);
}

public Set<Entry<String, String>> entrySet() {
    // a fresh EntrySet is handed out on every call
    final Set<Entry<String, String>> view = new EntrySet();
    return view;
}
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
@author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
private OutputSettings outputSettings = new OutputSettings();
private QuirksMode quirksMode = QuirksMode.noQuirks;

/**
Create a new, empty Document.
Expand Down Expand Up @@ -332,5 +333,18 @@ public OutputSettings clone() {
public OutputSettings outputSettings() {
return outputSettings;
}

/**
 * The quirks-mode values a Document can hold.
 */
public enum QuirksMode {
    noQuirks,
    quirks,
    limitedQuirks
}

/**
 * Get the quirks mode currently recorded on this document.
 * @return this document's quirks mode
 */
public QuirksMode quirksMode() {
    return this.quirksMode;
}

/**
 * Set the quirks mode recorded on this document.
 * @param quirksMode the new quirks mode
 * @return this document, for chaining
 */
public Document quirksMode(QuirksMode quirksMode) {
    final Document self = this;
    self.quirksMode = quirksMode;
    return self;
}
}

39 changes: 39 additions & 0 deletions src/main/java/org/jsoup/nodes/DocumentType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jsoup.nodes;

import org.jsoup.helper.StringUtil;

/**
 * A {@code <!DOCTYPE>} node.
 */
public class DocumentType extends Node {
    // todo: quirk mode from publicId and systemId

    private DocumentType() {}

    /**
     * Create a new doctype node. The name, public id and system id are stored as the node's
     * {@code name}, {@code publicId} and {@code systemId} attributes.
     * @param name the doctype's name
     * @param publicId the doctype's public identifier
     * @param systemId the doctype's system identifier
     * @param baseUri the base URI passed on to the Node constructor
     */
    public DocumentType(String name, String publicId, String systemId, String baseUri) {
        super(baseUri);

        attr("name", name);
        attr("publicId", publicId);
        attr("systemId", systemId);
    }

    @Override
    public String nodeName() {
        return "#doctype";
    }

    /**
     * Writes the doctype declaration. The public and system identifiers are only emitted when
     * they are not blank, and each is wrapped in a matching pair of double quotes.
     */
    @Override
    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
        // NOTE(review): the doctype name is always written as "html"; the stored "name" attribute is not used here.
        accum.append("<!DOCTYPE html");
        if (!StringUtil.isBlank(attr("publicId")))
            accum.append(" PUBLIC \"").append(attr("publicId")).append("\"");
        if (!StringUtil.isBlank(attr("systemId")))
            // open the quote before the system id as well as closing it after
            accum.append(" \"").append(attr("systemId")).append("\"");
        accum.append('>');
    }

    /** A doctype has no closing markup, so nothing is written. */
    @Override
    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
    }
}
47 changes: 31 additions & 16 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ public Elements select(String query) {
}

/**
* Add a node to the last child of this element.
* Add a child node to this element.
*
* @param child node to add. Must not already have a parent.
* @return this element, so that you can add more child nodes or elements.
Expand Down Expand Up @@ -297,9 +297,9 @@ public Element prependText(String text) {
*/
public Element append(String html) {
    Validate.notNull(html);

    // Parse the html exactly once, passing this element and its base URI to the fragment parser,
    // then add the resulting nodes as children. A second parse via parseBodyFragmentRelaxed would
    // insert the same content twice.
    List<Node> nodes = Parser.parseFragment(html, this, baseUri());
    addChildren(nodes.toArray(new Node[nodes.size()]));
    return this;
}

Expand All @@ -312,8 +312,8 @@ public Element append(String html) {
public Element prepend(String html) {
    Validate.notNull(html);

    // Parse the html exactly once, passing this element and its base URI to the fragment parser,
    // then insert the resulting nodes at index 0. A second parse via parseBodyFragmentRelaxed would
    // insert the same content twice.
    List<Node> nodes = Parser.parseFragment(html, this, baseUri());
    addChildren(0, nodes.toArray(new Node[nodes.size()]));
    return this;
}

Expand All @@ -329,6 +329,17 @@ public Element before(String html) {
return (Element) super.before(html);
}

/**
 * Insert the specified node into the DOM before this node (i.e. as a preceding sibling).
 * @param node node to add before this element
 * @return this Element, for chaining
 * @see #after(Node)
 */
@Override
public Element before(Node node) {
    // delegate to the superclass implementation and narrow the result back to Element
    final Element self = (Element) super.before(node);
    return self;
}

/**
* Insert the specified HTML into the DOM after this element (i.e. as a following sibling).
*
Expand All @@ -341,6 +352,17 @@ public Element after(String html) {
return (Element) super.after(html);
}

/**
 * Insert the specified node into the DOM after this node (i.e. as a following sibling).
 * @param node node to add after this element
 * @return this Element, for chaining
 * @see #before(Node)
 */
@Override
public Element after(Node node) {
    // delegate to the superclass implementation and narrow the result back to Element
    final Element self = (Element) super.after(node);
    return self;
}

/**
* Remove all of the element's child nodes. Any attributes are left as-is.
* @return this element
Expand Down Expand Up @@ -848,7 +870,7 @@ public String data() {
* @return The literal class attribute, or <b>empty string</b> if no class attribute set.
*/
public String className() {
    // Single lookup through attr(); the earlier hasKey/get pair left a second, unreachable return behind.
    // NOTE(review): relies on attr("class") yielding an empty string when no class attribute is set — confirm.
    return attr("class");
}

/**
Expand Down Expand Up @@ -1020,19 +1042,12 @@ public String toString() {

/**
 * Elements are compared by identity: an element is equal only to itself.
 * @param o object to compare with
 * @return true only if {@code o} is this very instance
 */
@Override
public boolean equals(Object o) {
    // The former structural comparison (super.equals plus tag) is dropped; it always returned before
    // this identity check, leaving the check unreachable.
    return this == o;
}

@Override
public int hashCode() {
// todo: fixup, not very useful
int result = super.hashCode();
result = 31 * result + (tag != null ? tag.hashCode() : 0);
return result;
Expand Down

0 comments on commit 8749726

Please sign in to comment.