diff --git a/lib/common/unicode.js b/lib/common/unicode.js index a996dda88..8777e97ab 100644 --- a/lib/common/unicode.js +++ b/lib/common/unicode.js @@ -1,48 +1,47 @@ -'use strict'; - -exports.REPLACEMENT_CHARACTER = '\uFFFD'; - -exports.CODE_POINTS = { - EOF: -1, - NULL: 0x00, - TABULATION: 0x09, - CARRIAGE_RETURN: 0x0D, - LINE_FEED: 0x0A, - FORM_FEED: 0x0C, - SPACE: 0x20, - EXCLAMATION_MARK: 0x21, - QUOTATION_MARK: 0x22, - NUMBER_SIGN: 0x23, - AMPERSAND: 0x26, - APOSTROPHE: 0x27, - HYPHEN_MINUS: 0x2D, - SOLIDUS: 0x2F, - DIGIT_0: 0x30, - DIGIT_9: 0x39, - SEMICOLON: 0x3B, - LESS_THAN_SIGN: 0x3C, - EQUALS_SIGN: 0x3D, - GREATER_THAN_SIGN: 0x3E, - QUESTION_MARK: 0x3F, - LATIN_CAPITAL_A: 0x41, - LATIN_CAPITAL_F: 0x46, - LATIN_CAPITAL_X: 0x58, - LATIN_CAPITAL_Z: 0x5A, - GRAVE_ACCENT: 0x60, - LATIN_SMALL_A: 0x61, - LATIN_SMALL_F: 0x66, - LATIN_SMALL_X: 0x78, - LATIN_SMALL_Z: 0x7A, - BOM: 0xFEFF, - REPLACEMENT_CHARACTER: 0xFFFD -}; - -exports.CODE_POINT_SEQUENCES = { - DASH_DASH_STRING: [0x2D, 0x2D], //-- - DOCTYPE_STRING: [0x44, 0x4F, 0x43, 0x54, 0x59, 0x50, 0x45], //DOCTYPE - CDATA_START_STRING: [0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B], //[CDATA[ - CDATA_END_STRING: [0x5D, 0x5D, 0x3E], //]]> - SCRIPT_STRING: [0x73, 0x63, 0x72, 0x69, 0x70, 0x74], //script - PUBLIC_STRING: [0x50, 0x55, 0x42, 0x4C, 0x49, 0x43], //PUBLIC - SYSTEM_STRING: [0x53, 0x59, 0x53, 0x54, 0x45, 0x4D] //SYSTEM -}; +'use strict'; + +exports.REPLACEMENT_CHARACTER = '\uFFFD'; + +exports.CODE_POINTS = { + EOF: -1, + NULL: 0x00, + TABULATION: 0x09, + CARRIAGE_RETURN: 0x0D, + LINE_FEED: 0x0A, + FORM_FEED: 0x0C, + SPACE: 0x20, + EXCLAMATION_MARK: 0x21, + QUOTATION_MARK: 0x22, + NUMBER_SIGN: 0x23, + AMPERSAND: 0x26, + APOSTROPHE: 0x27, + HYPHEN_MINUS: 0x2D, + SOLIDUS: 0x2F, + DIGIT_0: 0x30, + DIGIT_9: 0x39, + SEMICOLON: 0x3B, + LESS_THAN_SIGN: 0x3C, + EQUALS_SIGN: 0x3D, + GREATER_THAN_SIGN: 0x3E, + QUESTION_MARK: 0x3F, + LATIN_CAPITAL_A: 0x41, + LATIN_CAPITAL_F: 0x46, + LATIN_CAPITAL_X: 0x58, + LATIN_CAPITAL_Z: 0x5A, + GRAVE_ACCENT: 0x60, + LATIN_SMALL_A: 0x61, + LATIN_SMALL_F: 0x66, + LATIN_SMALL_X: 0x78, + LATIN_SMALL_Z: 0x7A, + REPLACEMENT_CHARACTER: 0xFFFD +}; + +exports.CODE_POINT_SEQUENCES = { + DASH_DASH_STRING: [0x2D, 0x2D], //-- + DOCTYPE_STRING: [0x44, 0x4F, 0x43, 0x54, 0x59, 0x50, 0x45], //DOCTYPE + CDATA_START_STRING: [0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B], //[CDATA[ + CDATA_END_STRING: [0x5D, 0x5D, 0x3E], //]]> + SCRIPT_STRING: [0x73, 0x63, 0x72, 0x69, 0x70, 0x74], //script + PUBLIC_STRING: [0x50, 0x55, 0x42, 0x4C, 0x49, 0x43], //PUBLIC + SYSTEM_STRING: [0x53, 0x59, 0x53, 0x54, 0x45, 0x4D] //SYSTEM +}; diff --git a/lib/tokenizer/preprocessor.js b/lib/tokenizer/preprocessor.js index 8237b4701..e8d21ce78 100644 --- a/lib/tokenizer/preprocessor.js +++ b/lib/tokenizer/preprocessor.js @@ -76,14 +76,9 @@ Preprocessor.prototype.write = function (chunk, isLastChunk) { if (this.html) this.html += chunk; - else { + else this.html = chunk; - //NOTE: one leading U+FEFF BYTE ORDER MARK character must be ignored if any are present in the input stream. - if (this.html.charCodeAt(0) === $.BOM) - this.pos = 0; - } - this.lastCharPos = this.html.length - 1; this.endOfChunkHit = false; this.lastChunkWritten = isLastChunk;