Permalink
Browse files

Removed unused parameters, further optimization to _string.

  • Loading branch information...
joa committed Apr 3, 2012
1 parent 5e53c73 commit 4cb2c25b0f9cce1489589f7fd604f94d9e213ebe
@@ -31,22 +31,22 @@ final class HtmlEmitterBenchmark extends HectorBenchmark {
def timeHtmlEmitterWithTrimAndStripComments(reps: Int) =
benchmark(reps) {
- HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = true, trim = true, humanReadable = false, omitDocType = false)
+ HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = true, trim = true, humanReadable = false)
}
def timeHtmlEmitterWithTrim(reps: Int) =
benchmark(reps) {
- HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = false, trim = true, humanReadable = false, omitDocType = false)
+ HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = false, trim = true, humanReadable = false)
}
def timeHtmlEmitterWithStripComments(reps: Int) =
benchmark(reps) {
- HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = true, trim = false, humanReadable = false, omitDocType = false)
+ HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = true, trim = false, humanReadable = false)
}
def timeHtmlEmitter(reps: Int) =
benchmark(reps) {
- HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = false, trim = false, humanReadable = false, omitDocType = false)
+ HtmlEmitter.toString(data, DocTypes.`HTML 5`, stripComments = false, trim = false, humanReadable = false)
}
def timeToString(reps: Int) =
@@ -10,11 +10,55 @@ import javax.annotation.concurrent.ThreadSafe
import scala.xml._
-
/**
+ * The HtmlEmitter is used to emit a string for a given Node.
+ *
+ * <p>The actual implementation will make sure that:
+ * <ul>
+ * <li>Text is escaped.</li>
+ * <li>Output consists only of valid characters. Invalid data is ignored.</li>
+ * <li>Consecutive whitespace is trimmed</li>
+ * <li>Invalid tag names are replaced with "invalidName".</li>
+ * <li>Invalid attribute names are replaced with "invalidName".</li>
+ * <li>Invalid prefix names are replaced with "invalidName".</li>
+ * <li>Output contains only valid entity references. Invalid data is ignored.</li>
+ * <li>Output is formatted according to given DTD.</li>
+ * </ul>
+ * </p>
+ *
+ * <p>An invalid tag name would be <code>Elem("foo bar", ...)</code> where "foo bar" is an
+ * invalid name. The same applies to attribute names and prefixes. Although it is impossible
+ * to create such malformed data when using XML literals, this is an extra security check
+ * in order to prevent mistakes like <code>Elem(loadSomethingFromDB(), ...)</code>.</p>
+ *
+ * <p>The HtmlEmitter is also very strict when it comes to entity references such as &amp;amp;. If
+ * a given entity name is not known or not of the form &amp;#UUUU; or &amp;#xHHHH; it is ignored.</p>
+ *
+ * <p>Furthermore the HtmlEmitter can pretty-print any data with little impact on performance. Microbenchmark
+ * scores using real-world data suggest that using <code>stripComments</code> and <code>trim</code>
+ * have little to no performance impact. In fact using <code>trim</code> might increase performance
+ * since less data needs to be checked for escaping.</p>
+ *
+ * <p>When using <code>trim</code> and <code>humanReadable</code> the HtmlEmitter will generate
+ * pretty html output that is easy on the eye, ignoring any indentation in the actual Scala code.</p>
*/
@ThreadSafe
object HtmlEmitter {
+ //
+ // Note:
+ // Please do not try to replace the while-loops in this code with a foreach-equivalent.
+ // HtmlEmitter may contain code duplicates (in _string for instance) which are only there
+ // in order to improve performance and not have any unnecessary object allocations.
+ //
+ // If you make any changes to this file you must run the HtmlEmitterBenchmark for it.
+ // To do so you can use "hector-microbenchmark/run" from within SBT.
+ //
+ // Also check the generated bytecode with "javap -c -p ..." and make sure that no object
+ // allocations appear in any "_method" or "visit".
+ //
+ // You might also be surprised about the fact that _string performs actual escaping and
+ // that we do not rely on
+
private[this] val CharsCommentOpen = "<!--".toCharArray
private[this] val CharsCommentClose = "-->".toCharArray
@@ -37,20 +81,27 @@ object HtmlEmitter {
private[this] val CharsEscapedQuot = "&quot;".toCharArray
- def toString(html: Node, docType: DocType = DocTypes.`HTML 5`, stripComments: Boolean = false, trim: Boolean = false, humanReadable: Boolean = false, omitDocType: Boolean = false): String = {
- val stringBuilder = new StringBuilder()
+ def toString(html: Node, docType: DocType = DocTypes.`HTML 5`, stripComments: Boolean = false, trim: Boolean = false, humanReadable: Boolean = false): String = {
+ //
+ // See https://developers.google.com/speed/articles/web-metrics
+ //
+ // I also read that the average character data is at 20k bytes but I am missing the
+ // source for that information. However, 20,000 lines up closely with other observed pages.
+ // You can try it via "curl http://... | wc -c"
+ //
+ // This size should be part of a configuration with the default being 20k.
+ //
+ val stringBuilder = new StringBuilder(20000) //TODO(joa): make me configurable.
val writer = new TextOutput(stringBuilder, humanReadable)
- if(!omitDocType) {
- _dtd(docType)(writer)
- }
+ _dtd(docType)(writer)
- visit(html, docType, stripComments, trim, humanReadable)(writer)
+ visit(html, docType, stripComments, trim)(writer)
stringBuilder.toString()
}
- private[this] def visit(node: Node, docType: DocType, stripComments: Boolean, trim: Boolean, humanReadable: Boolean)(implicit writer: TextOutput) {
+ private[this] def visit(node: Node, docType: DocType, stripComments: Boolean, trim: Boolean)(implicit writer: TextOutput) {
import scala.xml._
// Subsequent whitespace could be removed. This should be something we have to consider since
@@ -72,15 +123,15 @@ object HtmlEmitter {
if(children.isEmpty) {
_lt()
- _tag(prefix, label, attributes, scope, docType, stripComments, trim, humanReadable)
+ _tag(prefix, label, attributes, scope)
if(docType != DocTypes.`HTML 5` && docType != DocTypes.`XHTML 5`) {
_tagCloseShort()
} else {
_gt()
}
} else {
_lt()
- _tag(prefix, label, attributes, scope, docType, stripComments, trim, humanReadable)
+ _tag(prefix, label, attributes, scope)
_gt()
_newLineOpt()
@@ -89,7 +140,7 @@ object HtmlEmitter {
val iterator = children.iterator
while(iterator.hasNext) {
- visit(iterator.next(), docType, stripComments, trim, humanReadable)
+ visit(iterator.next(), docType, stripComments, trim)
}
_tagCloseLong(prefix, label)
@@ -100,7 +151,7 @@ object HtmlEmitter {
val iterator = nodes.iterator
while(iterator.hasNext) {
- visit(iterator.next(), docType, stripComments, trim, humanReadable)
+ visit(iterator.next(), docType, stripComments, trim)
}
case Unparsed(data)
@@ -110,7 +161,9 @@ object HtmlEmitter {
_entity(name)
case PCData(data)
- // So this is <![CDATA[data]]> but not PCData because PCData is Atom apparently?!
+ //
+ // This is <![CDATA[data]]> but not PCData because PCData is Atom apparently
+ //
_cdataOpen()
writer.print(data)
_cdataClose()
@@ -133,8 +186,10 @@ object HtmlEmitter {
_newLineOpt()
case atom: Atom[_]
+ //
// Apparently someone decided to name PCDATA not PCDATA but Atom.
// So we will have to treat it like parsed character data.
+ //
_string(atom.data.toString, trim)
}
}
@@ -145,7 +200,7 @@ object HtmlEmitter {
_newLine()
}
- private[this] def _tag(@Nullable prefix: String, label: String, @Nullable attributes: MetaData, scope: NamespaceBinding, docType: DocType, stripComments: Boolean, trim: Boolean, humanReadable: Boolean)(implicit writer: TextOutput) {
+ private[this] def _tag(@Nullable prefix: String, label: String, @Nullable attributes: MetaData, scope: NamespaceBinding)(implicit writer: TextOutput) {
if(null != prefix) {
_prefix(prefix)
_colon()
@@ -164,11 +219,10 @@ object HtmlEmitter {
while(null != iterator && Null != iterator) {
//
- // Apparently calling hasNext would lead to an error and instead we must check
- // for Null.
+ // Calling hasNext would lead to an error and instead we must check for Null.
+ //
+ // Because we tested for Null in the header the cast is safe here.
//
-
- // We tested for Null in the header so the cast is safe.
val attribute = iterator.asInstanceOf[Attribute]
iterator = iterator.next
@@ -330,42 +384,103 @@ object HtmlEmitter {
val n = chars.length
var i = 0
- //TODO(joa): to avoid a whole bunch of method calls we should evaluate valid indices and print a batch
+ var indexStart = -1
+
+ //
+ // The escape method is a little bit special but easy to understand.
+ //
+ // 1) Print-Characters Only (' ' <= x <= '~'):
+ //
+ // Example: ['f', 'o', 'o'].
+ //
+ // In the first iteration when looking at 'f' we match a print character and set indexStart
+ // to 0 because it was -1. Now for the following characters 'o' and 'o' nothing happens.
+ //
+ // After the loop the range [indexStart, n) will be printed from the sequence of
+ // characters if indexStart is not -1. This will create only one call to
+ // writer.print().
+ //
+ // 2) Trivial Escape Characters (\n, \r, \t):
+ //
+ // Example: ['f', 'o', 'o', '\n', 'b', 'a', 'r']
+ //
+ // In the first iteration when looking at 'f' we set indexStart to 0. Once we encounter
+ // the escape character '\n' we perform no special action since it is valid and has
+ // no escaped HTML entity. We continue processing like in case (1).
+ // Only one call to writer.print() will be made.
+ //
+ // 3) Non-Trivial Escape Characters (<, >, &, "):
+ //
+ // Example: ['f', 'o', 'o', '&', 'b', 'a', 'r']
+ //
+ // In the first iteration when looking at 'f' we set indexStart to 0. When we reach the
+ // '&' character the following happens:
+ //
+ // - Print all character data in the range [indexStart, i) if indexStart != -1
+ // - Set indexStart to -1 so that the next printable character will mark a new range start
+ // - Print the HTML entity &amp; instead of &.
+ //
+ // This will lead to fewer calls to writer.print. In fact each escape character requires
+ // an additional 2 calls so we have (1 + 2 * numEntities) calls to writer.print in the worst
+ // case.
+ //
while(i < n) {
val char = chars(i)
char match {
case '<'
+ if(-1 != indexStart) {
+ print(chars, indexStart, i - indexStart)
+ indexStart = -1
+ }
+
print(CharsEscapedLt)
case '>'
+ if(-1 != indexStart) {
+ print(chars, indexStart, i - indexStart)
+ indexStart = -1
+ }
+
print(CharsEscapedGt)
case '&'
+ if(-1 != indexStart) {
+ print(chars, indexStart, i - indexStart)
+ indexStart = -1
+ }
+
print(CharsEscapedAmp)
case '"'
- print(CharsEscapedQuot)
-
- case '\n'
- print('\n')
+ if(-1 != indexStart) {
+ print(chars, indexStart, i - indexStart)
+ indexStart = -1
+ }
- case '\r'
- print('\r')
+ print(CharsEscapedQuot)
- case '\t'
- print('\t')
+ case '\n' | '\r' | '\t'
+ if(-1 == indexStart) {
+ indexStart = i
+ }
case printChar if ' ' <= printChar && printChar <= '~'
- print(printChar)
+ if(-1 == indexStart) {
+ indexStart = i
+ }
case _
//TODO(joa): notify developer?
}
i += 1
}
+
+ if(-1 != indexStart) {
+ print(chars, indexStart, n - indexStart)
+ }
}
private[this] def _and()(implicit writer: TextOutput) {
@@ -78,7 +78,7 @@ trait HttpResponse extends Serializable {
}
final case class HtmlResponse(html: Node, docType: DocType = DocTypes.`HTML 5`, status: Int = 200, cookies: Seq[HttpCookie] = Seq.empty, headers: Seq[HttpHeader] = Seq.empty, characterEncoding: Option[JCharset] = None) extends HttpResponse {
- @transient private[this] lazy val htmlAsString = HtmlEmitter.toString(html, docType, stripComments = false, trim = true, humanReadable = false, omitDocType = false)
+ @transient private[this] lazy val htmlAsString = HtmlEmitter.toString(html, docType, stripComments = false, trim = true, humanReadable = false)
override def contentType = MimeType.text.html
@@ -68,6 +68,12 @@ final class TextOutput(
wasNewline = false
}
+ def print(value: Array[Char], offset: Int, length: Int) {
+ maybeIndent()
+ builder.appendAll(value, offset, length)
+ wasNewline = false
+ }
+
def print(value: String) {
print(value.toCharArray)
}

0 comments on commit 4cb2c25

Please sign in to comment.