From 8af00cac5b3d83248bcee92008fcf206a725b7ba Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Thu, 9 Aug 2018 18:10:03 +0100 Subject: [PATCH 1/6] Fix SAXParser text location Fixes #266. --- packages/parse5-sax-parser/lib/index.js | 7 ++++++- .../parse5-sax-parser/test/location-info.test.js | 14 +++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/packages/parse5-sax-parser/lib/index.js b/packages/parse5-sax-parser/lib/index.js index 208789a4b..7b930291e 100644 --- a/packages/parse5-sax-parser/lib/index.js +++ b/packages/parse5-sax-parser/lib/index.js @@ -87,7 +87,12 @@ class SAXParser extends Transform { if (this.pendingText === null) { this.currentTokenLocation = token.location; } else { - this.currentTokenLocation.endOffset = token.location.endOffset; + const { endLine, endCol, endOffset } = token.location; + Object.assign(this.currentTokenLocation, { + endLine, + endCol, + endOffset + }); } } diff --git a/packages/parse5-sax-parser/test/location-info.test.js b/packages/parse5-sax-parser/test/location-info.test.js index fecd31db6..bba6cbab5 100644 --- a/packages/parse5-sax-parser/test/location-info.test.js +++ b/packages/parse5-sax-parser/test/location-info.test.js @@ -29,16 +29,20 @@ exports['Location info (SAX)'] = function() { }); }; -exports['Regression - location info for text (GH-153)'] = function() { +exports['Regression - location info for text (GH-153, GH-266)'] = function() { const html = 'Here is a title'; const parser = new SAXParser({ sourceCodeLocationInfo: true }); - const texts = []; parser.on('text', ({ sourceCodeLocation }) => { - texts.push(html.substring(sourceCodeLocation.startOffset, sourceCodeLocation.endOffset)); + assert.deepStrictEqual(sourceCodeLocation, { + startLine: 1, + startCol: 35, + startOffset: 34, + endLine: 1, + endCol: 50, + endOffset: 49 + }); }); parser.end(html); - - assert.deepEqual(texts, ['Here is a title']); }; From b221f3a7e6462a4ae36c994bcc6eb16d68ad5d2a Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Thu, 9 Aug 2018 18:33:49 +0100 Subject: [PATCH 2/6] Move lazy emission logic to SAXParser --- .../parse5-html-rewriting-stream/lib/index.js | 52 ++------------ packages/parse5-sax-parser/lib/index.js | 72 +++++++++++++------ 2 files changed, 57 insertions(+), 67 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/lib/index.js b/packages/parse5-html-rewriting-stream/lib/index.js index 6de3224c8..3e80a4e5b 100644 --- a/packages/parse5-html-rewriting-stream/lib/index.js +++ b/packages/parse5-html-rewriting-stream/lib/index.js @@ -1,7 +1,6 @@ 'use strict'; const SAXParser = require('parse5-sax-parser'); -const Tokenizer = require('parse5/lib/tokenizer'); const { escapeString } = require('parse5/lib/serializer'); class RewritingStream extends SAXParser { @@ -9,25 +8,6 @@ class RewritingStream extends SAXParser { super({ sourceCodeLocationInfo: true }); this.posTracker = this.locInfoMixin.posTracker; - - this.tokenEmissionHelpers = { - [Tokenizer.START_TAG_TOKEN]: { - eventName: 'startTag', - reshapeToken: token => this._reshapeStartTagToken(token) - }, - [Tokenizer.END_TAG_TOKEN]: { - eventName: 'endTag', - reshapeToken: token => this._reshapeEndTagToken(token) - }, - [Tokenizer.COMMENT_TOKEN]: { - eventName: 'comment', - reshapeToken: token => this._reshapeCommentToken(token) - }, - [Tokenizer.DOCTYPE_TOKEN]: { - eventName: 'doctype', - reshapeToken: token => this._reshapeDoctypeToken(token) - } - }; } _transform(chunk, encoding, callback) { @@ -46,20 +26,8 @@ class RewritingStream extends SAXParser { // Events _handleToken(token) { - if (token.type === Tokenizer.EOF_TOKEN) { - return; - } - - const { eventName, reshapeToken } = this.tokenEmissionHelpers[token.type]; - - this.currentTokenLocation = token.location; - - const raw = this._getCurrentTokenRawHtml(); - - if (this.listenerCount(eventName) > 0) { - this.emit(eventName, reshapeToken(token), raw); - } else { - this.emitRaw(raw); + if (!super._handleToken(token)) { + this.emitRaw(this._getCurrentTokenRawHtml()); } // NOTE: don't skip new lines after
 and other tags,
@@ -67,21 +35,11 @@ class RewritingStream extends SAXParser {
         this.parserFeedbackSimulator.skipNextNewLine = false;
     }
 
-    _emitPendingText() {
-        if (this.pendingText !== null) {
-            const raw = this._getCurrentTokenRawHtml();
-
-            if (this.listenerCount('text') > 0) {
-                this.emit('text', this._createTextToken(), raw);
-            } else {
-                this.emitRaw(raw);
-            }
-
-            this.pendingText = null;
-        }
+    // Emitter API
+    _emitToken(eventName, token) {
+        this.emit(eventName, token, this._getCurrentTokenRawHtml());
     }
 
-    // Emitter API
     emitDoctype(token) {
         let res = `
Date: Thu, 9 Aug 2018 18:39:39 +0100
Subject: [PATCH 3/6] Remove currentTokenLocation state

---
 .../parse5-html-rewriting-stream/lib/index.js     | 10 +++++-----
 packages/parse5-sax-parser/lib/index.js           | 15 +++++----------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/packages/parse5-html-rewriting-stream/lib/index.js b/packages/parse5-html-rewriting-stream/lib/index.js
index 3e80a4e5b..3657a4755 100644
--- a/packages/parse5-html-rewriting-stream/lib/index.js
+++ b/packages/parse5-html-rewriting-stream/lib/index.js
@@ -16,10 +16,10 @@ class RewritingStream extends SAXParser {
         callback();
     }
 
-    _getCurrentTokenRawHtml() {
+    _getRawHtml(location) {
         const droppedBufferSize = this.posTracker.droppedBufferSize;
-        const start = this.currentTokenLocation.startOffset - droppedBufferSize;
-        const end = this.currentTokenLocation.endOffset - droppedBufferSize;
+        const start = location.startOffset - droppedBufferSize;
+        const end = location.endOffset - droppedBufferSize;
 
         return this.tokenizer.preprocessor.html.slice(start, end);
     }
@@ -27,7 +27,7 @@ class RewritingStream extends SAXParser {
     // Events
     _handleToken(token) {
         if (!super._handleToken(token)) {
-            this.emitRaw(this._getCurrentTokenRawHtml());
+            this.emitRaw(this._getRawHtml(token.location));
         }
 
         // NOTE: don't skip new lines after 
 and other tags,
@@ -37,7 +37,7 @@ class RewritingStream extends SAXParser {
 
     // Emitter API
     _emitToken(eventName, token) {
-        this.emit(eventName, token, this._getCurrentTokenRawHtml());
+        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation));
     }
 
     emitDoctype(token) {
diff --git a/packages/parse5-sax-parser/lib/index.js b/packages/parse5-sax-parser/lib/index.js
index e9270eb18..9a91f2b9f 100644
--- a/packages/parse5-sax-parser/lib/index.js
+++ b/packages/parse5-sax-parser/lib/index.js
@@ -28,7 +28,6 @@ class SAXParser extends Transform {
         this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);
 
         this.pendingText = null;
-        this.currentTokenLocation = void 0;
 
         this.lastChunkWritten = false;
         this.stopped = false;
@@ -112,10 +111,6 @@ class SAXParser extends Transform {
 
         const { eventName, reshapeToken } = TOKEN_EMISSION_HELPERS[token.type];
 
-        if (this.options.sourceCodeLocationInfo) {
-            this.currentTokenLocation = token.location;
-        }
-
         if (this.listenerCount(eventName) === 0) {
             return false;
         }
@@ -142,16 +137,16 @@ class SAXParser extends Transform {
             tagName: origToken.tagName,
             attrs: origToken.attrs,
             selfClosing: origToken.selfClosing,
-            sourceCodeLocation: this.currentTokenLocation
+            sourceCodeLocation: origToken.location
         };
     }
 
     _reshapeEndTagToken(origToken) {
-        return { tagName: origToken.tagName, sourceCodeLocation: this.currentTokenLocation };
+        return { tagName: origToken.tagName, sourceCodeLocation: origToken.location };
     }
 
     _reshapeCommentToken(origToken) {
-        return { text: origToken.data, sourceCodeLocation: this.currentTokenLocation };
+        return { text: origToken.data, sourceCodeLocation: origToken.location };
     }
 
     _reshapeDoctypeToken(origToken) {
@@ -159,12 +154,12 @@ class SAXParser extends Transform {
             name: origToken.name,
             publicId: origToken.publicId,
             systemId: origToken.systemId,
-            sourceCodeLocation: this.currentTokenLocation
+            sourceCodeLocation: origToken.location
         };
     }
 
     _reshapeCharToken(origToken) {
-        return { text: origToken.chars, sourceCodeLocation: this.currentTokenLocation };
+        return { text: origToken.chars, sourceCodeLocation: origToken.location };
     }
 }
 

From a4668c35e2bf246a932668a4163186e5b3fd206e Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan 
Date: Thu, 9 Aug 2018 18:56:31 +0100
Subject: [PATCH 4/6] Unbind reshapeToken helpers

Now that location is retrieved directly from tokens, they no longer need to have access to `this` and can be just static helpers.
---
 packages/parse5-sax-parser/lib/index.js | 53 ++++++++-----------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/packages/parse5-sax-parser/lib/index.js b/packages/parse5-sax-parser/lib/index.js
index 9a91f2b9f..fd1aa41ac 100644
--- a/packages/parse5-sax-parser/lib/index.js
+++ b/packages/parse5-sax-parser/lib/index.js
@@ -115,7 +115,7 @@ class SAXParser extends Transform {
             return false;
         }
 
-        this._emitToken(eventName, reshapeToken.call(this, token));
+        this._emitToken(eventName, reshapeToken(token));
 
         return true;
     }
@@ -130,59 +130,38 @@ class SAXParser extends Transform {
             this.pendingText = null;
         }
     }
-
-    // Tokens
-    _reshapeStartTagToken(origToken) {
-        return {
-            tagName: origToken.tagName,
-            attrs: origToken.attrs,
-            selfClosing: origToken.selfClosing,
-            sourceCodeLocation: origToken.location
-        };
-    }
-
-    _reshapeEndTagToken(origToken) {
-        return { tagName: origToken.tagName, sourceCodeLocation: origToken.location };
-    }
-
-    _reshapeCommentToken(origToken) {
-        return { text: origToken.data, sourceCodeLocation: origToken.location };
-    }
-
-    _reshapeDoctypeToken(origToken) {
-        return {
-            name: origToken.name,
-            publicId: origToken.publicId,
-            systemId: origToken.systemId,
-            sourceCodeLocation: origToken.location
-        };
-    }
-
-    _reshapeCharToken(origToken) {
-        return { text: origToken.chars, sourceCodeLocation: origToken.location };
-    }
 }
 
 const TOKEN_EMISSION_HELPERS = {
     [Tokenizer.START_TAG_TOKEN]: {
         eventName: 'startTag',
-        reshapeToken: SAXParser.prototype._reshapeStartTagToken
+        reshapeToken: origToken => ({
+            tagName: origToken.tagName,
+            attrs: origToken.attrs,
+            selfClosing: origToken.selfClosing,
+            sourceCodeLocation: origToken.location
+        })
     },
     [Tokenizer.END_TAG_TOKEN]: {
         eventName: 'endTag',
-        reshapeToken: SAXParser.prototype._reshapeEndTagToken
+        reshapeToken: origToken => ({ tagName: origToken.tagName, sourceCodeLocation: origToken.location })
     },
     [Tokenizer.COMMENT_TOKEN]: {
         eventName: 'comment',
-        reshapeToken: SAXParser.prototype._reshapeCommentToken
+        reshapeToken: origToken => ({ text: origToken.data, sourceCodeLocation: origToken.location })
     },
     [Tokenizer.DOCTYPE_TOKEN]: {
         eventName: 'doctype',
-        reshapeToken: SAXParser.prototype._reshapeDoctypeToken
+        reshapeToken: origToken => ({
+            name: origToken.name,
+            publicId: origToken.publicId,
+            systemId: origToken.systemId,
+            sourceCodeLocation: origToken.location
+        })
     },
     [Tokenizer.CHARACTER_TOKEN]: {
         eventName: 'text',
-        reshapeToken: SAXParser.prototype._reshapeCharToken
+        reshapeToken: origToken => ({ text: origToken.chars, sourceCodeLocation: origToken.location })
     }
 };
 

From 70afee1c009b07cfe6cd9618c2b6f12de4deec85 Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan 
Date: Fri, 10 Aug 2018 12:11:34 +0100
Subject: [PATCH 5/6] Simplify overriding _transform

---
 packages/parse5-html-rewriting-stream/lib/index.js | 8 ++++----
 packages/parse5-sax-parser/lib/index.js            | 8 +++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/packages/parse5-html-rewriting-stream/lib/index.js b/packages/parse5-html-rewriting-stream/lib/index.js
index 3657a4755..2db2bde1c 100644
--- a/packages/parse5-html-rewriting-stream/lib/index.js
+++ b/packages/parse5-html-rewriting-stream/lib/index.js
@@ -10,10 +10,10 @@ class RewritingStream extends SAXParser {
         this.posTracker = this.locInfoMixin.posTracker;
     }
 
-    _transform(chunk, encoding, callback) {
-        this._parseChunk(chunk);
-
-        callback();
+    _transformChunk(chunk) {
+        // NOTE: ignore upstream return value as we want to push to
+        // the Writable part of Transform stream ourselves.
+        super._transformChunk(chunk);
     }
 
     _getRawHtml(location) {
diff --git a/packages/parse5-sax-parser/lib/index.js b/packages/parse5-sax-parser/lib/index.js
index fd1aa41ac..a22452405 100644
--- a/packages/parse5-sax-parser/lib/index.js
+++ b/packages/parse5-sax-parser/lib/index.js
@@ -40,10 +40,7 @@ class SAXParser extends Transform {
 
     //TransformStream implementation
     _transform(chunk, encoding, callback) {
-        this._parseChunk(chunk);
-        this.push(chunk);
-
-        callback();
+        callback(null, this._transformChunk(chunk));
     }
 
     _flush(callback) {
@@ -60,11 +57,12 @@ class SAXParser extends Transform {
     }
 
     //Internals
-    _parseChunk(chunk) {
+    _transformChunk(chunk) {
         if (!this.stopped) {
             this.tokenizer.write(chunk.toString('utf8'), this.lastChunkWritten);
             this._runParsingLoop();
         }
+        return chunk;
     }
 
     _runParsingLoop() {

From b6af06db125d74a9fed0dbadafc151a97b04e98e Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan 
Date: Fri, 10 Aug 2018 12:12:22 +0100
Subject: [PATCH 6/6] Remove unnecessary _flush override

---
 packages/parse5-sax-parser/lib/index.js | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/packages/parse5-sax-parser/lib/index.js b/packages/parse5-sax-parser/lib/index.js
index a22452405..ba6c5a6bf 100644
--- a/packages/parse5-sax-parser/lib/index.js
+++ b/packages/parse5-sax-parser/lib/index.js
@@ -43,10 +43,6 @@ class SAXParser extends Transform {
         callback(null, this._transformChunk(chunk));
     }
 
-    _flush(callback) {
-        callback();
-    }
-
     end(chunk, encoding, callback) {
         this.lastChunkWritten = true;
         super.end(chunk, encoding, callback);