Skip to content

Commit

Permalink
Spec changes - Tokenizer: BOM should be stripped by the decoder and s…
Browse files Browse the repository at this point in the history
…hould be processed as is by tokenizer

ref: html5lib/html5lib-tests#2
  • Loading branch information
inikulin committed Sep 1, 2015
1 parent 1496660 commit 7804e49
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 54 deletions.
95 changes: 47 additions & 48 deletions lib/common/unicode.js
@@ -1,48 +1,47 @@
'use strict';

exports.REPLACEMENT_CHARACTER = '\uFFFD';

exports.CODE_POINTS = {
EOF: -1,
NULL: 0x00,
TABULATION: 0x09,
CARRIAGE_RETURN: 0x0D,
LINE_FEED: 0x0A,
FORM_FEED: 0x0C,
SPACE: 0x20,
EXCLAMATION_MARK: 0x21,
QUOTATION_MARK: 0x22,
NUMBER_SIGN: 0x23,
AMPERSAND: 0x26,
APOSTROPHE: 0x27,
HYPHEN_MINUS: 0x2D,
SOLIDUS: 0x2F,
DIGIT_0: 0x30,
DIGIT_9: 0x39,
SEMICOLON: 0x3B,
LESS_THAN_SIGN: 0x3C,
EQUALS_SIGN: 0x3D,
GREATER_THAN_SIGN: 0x3E,
QUESTION_MARK: 0x3F,
LATIN_CAPITAL_A: 0x41,
LATIN_CAPITAL_F: 0x46,
LATIN_CAPITAL_X: 0x58,
LATIN_CAPITAL_Z: 0x5A,
GRAVE_ACCENT: 0x60,
LATIN_SMALL_A: 0x61,
LATIN_SMALL_F: 0x66,
LATIN_SMALL_X: 0x78,
LATIN_SMALL_Z: 0x7A,
BOM: 0xFEFF,
REPLACEMENT_CHARACTER: 0xFFFD
};

exports.CODE_POINT_SEQUENCES = {
DASH_DASH_STRING: [0x2D, 0x2D], //--
DOCTYPE_STRING: [0x44, 0x4F, 0x43, 0x54, 0x59, 0x50, 0x45], //DOCTYPE
CDATA_START_STRING: [0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B], //[CDATA[
CDATA_END_STRING: [0x5D, 0x5D, 0x3E], //]]>
SCRIPT_STRING: [0x73, 0x63, 0x72, 0x69, 0x70, 0x74], //script
PUBLIC_STRING: [0x50, 0x55, 0x42, 0x4C, 0x49, 0x43], //PUBLIC
SYSTEM_STRING: [0x53, 0x59, 0x53, 0x54, 0x45, 0x4D] //SYSTEM
};
'use strict';

exports.REPLACEMENT_CHARACTER = '\uFFFD';

exports.CODE_POINTS = {
EOF: -1,
NULL: 0x00,
TABULATION: 0x09,
CARRIAGE_RETURN: 0x0D,
LINE_FEED: 0x0A,
FORM_FEED: 0x0C,
SPACE: 0x20,
EXCLAMATION_MARK: 0x21,
QUOTATION_MARK: 0x22,
NUMBER_SIGN: 0x23,
AMPERSAND: 0x26,
APOSTROPHE: 0x27,
HYPHEN_MINUS: 0x2D,
SOLIDUS: 0x2F,
DIGIT_0: 0x30,
DIGIT_9: 0x39,
SEMICOLON: 0x3B,
LESS_THAN_SIGN: 0x3C,
EQUALS_SIGN: 0x3D,
GREATER_THAN_SIGN: 0x3E,
QUESTION_MARK: 0x3F,
LATIN_CAPITAL_A: 0x41,
LATIN_CAPITAL_F: 0x46,
LATIN_CAPITAL_X: 0x58,
LATIN_CAPITAL_Z: 0x5A,
GRAVE_ACCENT: 0x60,
LATIN_SMALL_A: 0x61,
LATIN_SMALL_F: 0x66,
LATIN_SMALL_X: 0x78,
LATIN_SMALL_Z: 0x7A,
REPLACEMENT_CHARACTER: 0xFFFD
};

exports.CODE_POINT_SEQUENCES = {
DASH_DASH_STRING: [0x2D, 0x2D], //--
DOCTYPE_STRING: [0x44, 0x4F, 0x43, 0x54, 0x59, 0x50, 0x45], //DOCTYPE
CDATA_START_STRING: [0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B], //[CDATA[
CDATA_END_STRING: [0x5D, 0x5D, 0x3E], //]]>
SCRIPT_STRING: [0x73, 0x63, 0x72, 0x69, 0x70, 0x74], //script
PUBLIC_STRING: [0x50, 0x55, 0x42, 0x4C, 0x49, 0x43], //PUBLIC
SYSTEM_STRING: [0x53, 0x59, 0x53, 0x54, 0x45, 0x4D] //SYSTEM
};
7 changes: 1 addition & 6 deletions lib/tokenizer/preprocessor.js
Expand Up @@ -76,14 +76,9 @@ Preprocessor.prototype.write = function (chunk, isLastChunk) {
if (this.html)
this.html += chunk;

else {
else
this.html = chunk;

//NOTE: one leading U+FEFF BYTE ORDER MARK character must be ignored if any are present in the input stream.
if (this.html.charCodeAt(0) === $.BOM)
this.pos = 0;
}

this.lastCharPos = this.html.length - 1;
this.endOfChunkHit = false;
this.lastChunkWritten = isLastChunk;
Expand Down

0 comments on commit 7804e49

Please sign in to comment.