Refactor emission logic of SAXParser and RewritingStream #268

Merged · 6 commits · Aug 10, 2018
packages/parse5-html-rewriting-stream/lib/index.js (12 additions, 54 deletions)

@@ -1,87 +1,45 @@
 'use strict';
 
 const SAXParser = require('parse5-sax-parser');
-const Tokenizer = require('parse5/lib/tokenizer');
 const { escapeString } = require('parse5/lib/serializer');
 
 class RewritingStream extends SAXParser {
     constructor() {
         super({ sourceCodeLocationInfo: true });
 
         this.posTracker = this.locInfoMixin.posTracker;
-
-        this.tokenEmissionHelpers = {
-            [Tokenizer.START_TAG_TOKEN]: {
-                eventName: 'startTag',
-                reshapeToken: token => this._reshapeStartTagToken(token)
-            },
-            [Tokenizer.END_TAG_TOKEN]: {
-                eventName: 'endTag',
-                reshapeToken: token => this._reshapeEndTagToken(token)
-            },
-            [Tokenizer.COMMENT_TOKEN]: {
-                eventName: 'comment',
-                reshapeToken: token => this._reshapeCommentToken(token)
-            },
-            [Tokenizer.DOCTYPE_TOKEN]: {
-                eventName: 'doctype',
-                reshapeToken: token => this._reshapeDoctypeToken(token)
-            }
-        };
     }
 
-    _transform(chunk, encoding, callback) {
-        this._parseChunk(chunk);
-
-        callback();
+    _transformChunk(chunk) {
+        // NOTE: ignore upstream return value as we want to push to
+        // the Writable part of Transform stream ourselves.
+        super._transformChunk(chunk);
     }
 
-    _getCurrentTokenRawHtml() {
+    _getRawHtml(location) {
         const droppedBufferSize = this.posTracker.droppedBufferSize;
-        const start = this.currentTokenLocation.startOffset - droppedBufferSize;
-        const end = this.currentTokenLocation.endOffset - droppedBufferSize;
+        const start = location.startOffset - droppedBufferSize;
+        const end = location.endOffset - droppedBufferSize;
 
         return this.tokenizer.preprocessor.html.slice(start, end);
     }
 
     // Events
     _handleToken(token) {
-        if (token.type === Tokenizer.EOF_TOKEN) {
-            return;
-        }
-
-        const { eventName, reshapeToken } = this.tokenEmissionHelpers[token.type];
-
-        this.currentTokenLocation = token.location;
-
-        const raw = this._getCurrentTokenRawHtml();
-
-        if (this.listenerCount(eventName) > 0) {
-            this.emit(eventName, reshapeToken(token), raw);
-        } else {
-            this.emitRaw(raw);
+        if (!super._handleToken(token)) {
+            this.emitRaw(this._getRawHtml(token.location));
         }
 
         // NOTE: don't skip new lines after <pre> and other tags,
         // otherwise we'll have incorrect raw data.
         this.parserFeedbackSimulator.skipNextNewLine = false;
     }
 
-    _emitPendingText() {
-        if (this.pendingText !== null) {
-            const raw = this._getCurrentTokenRawHtml();
-
-            if (this.listenerCount('text') > 0) {
-                this.emit('text', this._createTextToken(), raw);
-            } else {
-                this.emitRaw(raw);
-            }
-
-            this.pendingText = null;
-        }
+    // Emitter API
+    _emitToken(eventName, token) {
+        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation));
    }
 
-    // Emitter API
     emitDoctype(token) {
         let res = `<!DOCTYPE ${token.name}`;
 
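Taken together, these changes collapse RewritingStream's own dispatch table, transform hook, and text buffering into thin overrides of the SAXParser internals: _handleToken now only decides what to do when the parent class reports, via its new boolean return value, that no listener consumed the token, and _emitToken appends the raw HTML slice to every emitted event. A minimal usage sketch, not part of this PR, assuming the published parse5-html-rewriting-stream API:

// Hypothetical usage sketch: tokens with listeners are reshaped and emitted
// along with their raw HTML; everything else passes through emitRaw() untouched.
const RewritingStream = require('parse5-html-rewriting-stream');
const fs = require('fs');

const rewriter = new RewritingStream();

// Only 'startTag' has a listener here, so all other markup reaches the
// output byte-for-byte via the raw pass-through path.
rewriter.on('startTag', (startTag, rawHtml) => {
    if (startTag.tagName === 'a') {
        startTag.attrs.push({ name: 'rel', value: 'noopener' });
    }

    rewriter.emitStartTag(startTag);
});

fs.createReadStream('page.html', { encoding: 'utf8' })
    .pipe(rewriter)
    .pipe(fs.createWriteStream('page.rewritten.html'));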
packages/parse5-sax-parser/lib/index.js (57 additions, 52 deletions)

@@ -28,7 +28,6 @@ class SAXParser extends Transform {
         this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);
 
         this.pendingText = null;
-        this.currentTokenLocation = void 0;
 
         this.lastChunkWritten = false;
         this.stopped = false;
@@ -41,14 +40,7 @@
 
     //TransformStream implementation
     _transform(chunk, encoding, callback) {
-        this._parseChunk(chunk);
-        this.push(chunk);
-
-        callback();
-    }
-
-    _flush(callback) {
-        callback();
+        callback(null, this._transformChunk(chunk));
     }
 
     end(chunk, encoding, callback) {
@@ -61,11 +53,12 @@
     }
 
     //Internals
-    _parseChunk(chunk) {
+    _transformChunk(chunk) {
         if (!this.stopped) {
             this.tokenizer.write(chunk.toString('utf8'), this.lastChunkWritten);
             this._runParsingLoop();
         }
+        return chunk;
     }
 
     _runParsingLoop() {
@@ -83,15 +76,21 @@
                 token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
                 token.type === Tokenizer.NULL_CHARACTER_TOKEN
             ) {
-                if (this.options.sourceCodeLocationInfo) {
-                    if (this.pendingText === null) {
-                        this.currentTokenLocation = token.location;
-                    } else {
-                        this.currentTokenLocation.endOffset = token.location.endOffset;
+                if (this.pendingText === null) {
+                    token.type = Tokenizer.CHARACTER_TOKEN;
+                    this.pendingText = token;
+                } else {
+                    this.pendingText.chars += token.chars;
+
+                    if (this.options.sourceCodeLocationInfo) {
+                        const { endLine, endCol, endOffset } = token.location;
+                        Object.assign(this.pendingText.location, {
+                            endLine,
+                            endCol,
+                            endOffset
+                        });
                     }
                 }
-
-                this.pendingText = (this.pendingText || '') + token.chars;
             } else {
                 this._emitPendingText();
                 this._handleToken(token);
@@ -100,58 +99,64 @@
     }
 
     _handleToken(token) {
-        if (this.options.sourceCodeLocationInfo) {
-            this.currentTokenLocation = token.location;
+        if (token.type === Tokenizer.EOF_TOKEN) {
+            return true;
         }
 
-        if (token.type === Tokenizer.START_TAG_TOKEN) {
-            this.emit('startTag', this._reshapeStartTagToken(token));
-        } else if (token.type === Tokenizer.END_TAG_TOKEN) {
-            this.emit('endTag', this._reshapeEndTagToken(token));
-        } else if (token.type === Tokenizer.COMMENT_TOKEN) {
-            this.emit('comment', this._reshapeCommentToken(token));
-        } else if (token.type === Tokenizer.DOCTYPE_TOKEN) {
-            this.emit('doctype', this._reshapeDoctypeToken(token));
+        const { eventName, reshapeToken } = TOKEN_EMISSION_HELPERS[token.type];
+
+        if (this.listenerCount(eventName) === 0) {
+            return false;
        }
+
+        this._emitToken(eventName, reshapeToken(token));
+
+        return true;
+    }
+
+    _emitToken(eventName, token) {
+        this.emit(eventName, token);
     }
 
     _emitPendingText() {
         if (this.pendingText !== null) {
-            this.emit('text', this._createTextToken());
+            this._handleToken(this.pendingText);
             this.pendingText = null;
         }
     }
+}
 
-    // Tokens
-    _createTextToken() {
-        return { text: this.pendingText, sourceCodeLocation: this.currentTokenLocation };
-    }
-
-    _reshapeStartTagToken(origToken) {
-        return {
+const TOKEN_EMISSION_HELPERS = {
+    [Tokenizer.START_TAG_TOKEN]: {
+        eventName: 'startTag',
+        reshapeToken: origToken => ({
             tagName: origToken.tagName,
             attrs: origToken.attrs,
             selfClosing: origToken.selfClosing,
-            sourceCodeLocation: this.currentTokenLocation
-        };
-    }
-
-    _reshapeEndTagToken(origToken) {
-        return { tagName: origToken.tagName, sourceCodeLocation: this.currentTokenLocation };
-    }
-
-    _reshapeCommentToken(origToken) {
-        return { text: origToken.data, sourceCodeLocation: this.currentTokenLocation };
-    }
-
-    _reshapeDoctypeToken(origToken) {
-        return {
+            sourceCodeLocation: origToken.location
+        })
+    },
+    [Tokenizer.END_TAG_TOKEN]: {
+        eventName: 'endTag',
+        reshapeToken: origToken => ({ tagName: origToken.tagName, sourceCodeLocation: origToken.location })
+    },
+    [Tokenizer.COMMENT_TOKEN]: {
+        eventName: 'comment',
+        reshapeToken: origToken => ({ text: origToken.data, sourceCodeLocation: origToken.location })
+    },
+    [Tokenizer.DOCTYPE_TOKEN]: {
+        eventName: 'doctype',
+        reshapeToken: origToken => ({
             name: origToken.name,
             publicId: origToken.publicId,
             systemId: origToken.systemId,
-            sourceCodeLocation: this.currentTokenLocation
-        };
+            sourceCodeLocation: origToken.location
+        })
+    },
+    [Tokenizer.CHARACTER_TOKEN]: {
+        eventName: 'text',
+        reshapeToken: origToken => ({ text: origToken.chars, sourceCodeLocation: origToken.location })
     }
-}
+};
 
 module.exports = SAXParser;
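The parent class now owns the full emission pipeline: the module-level TOKEN_EMISSION_HELPERS table maps token types to event names and reshape functions, the listenerCount() check doubles as _handleToken's return value, and pending text is kept as a real character token whose location is extended in place as adjacent character tokens arrive. A small sketch of the externally visible behavior, assuming the parse5-sax-parser package as refactored here:

// Sketch of the refactored SAXParser from the consumer's side.
// Events fire only for registered listeners: _handleToken() checks
// listenerCount() before reshaping a token, so unobserved token types
// cost nothing beyond the dispatch lookup.
const SAXParser = require('parse5-sax-parser');

const parser = new SAXParser({ sourceCodeLocationInfo: true });

parser.on('startTag', ({ tagName, selfClosing, sourceCodeLocation }) => {
    console.log(`<${tagName}>${selfClosing ? ' (self-closing)' : ''} at offset ${sourceCodeLocation.startOffset}`);
});

// Adjacent character tokens are merged into one pending token, so this
// fires once per contiguous text run, with a location spanning the run.
parser.on('text', ({ text, sourceCodeLocation }) => {
    console.log(`text ${JSON.stringify(text)} at ${sourceCodeLocation.startOffset}..${sourceCodeLocation.endOffset}`);
});

// SAXParser is a Transform stream; end() marks the last chunk so the
// tokenizer can flush its remaining tokens.
parser.end('<div class="note">Here is some text</div>');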
packages/parse5-sax-parser/test/location-info.test.js (9 additions, 5 deletions)

@@ -29,16 +29,20 @@ exports['Location info (SAX)'] = function() {
     });
 };
 
-exports['Regression - location info for text (GH-153)'] = function() {
+exports['Regression - location info for text (GH-153, GH-266)'] = function() {
     const html = '<!DOCTYPE html><html><head><title>Here is a title</title></html>';
     const parser = new SAXParser({ sourceCodeLocationInfo: true });
-    const texts = [];
 
     parser.on('text', ({ sourceCodeLocation }) => {
-        texts.push(html.substring(sourceCodeLocation.startOffset, sourceCodeLocation.endOffset));
+        assert.deepStrictEqual(sourceCodeLocation, {
+            startLine: 1,
+            startCol: 35,
+            startOffset: 34,
+            endLine: 1,
+            endCol: 50,
+            endOffset: 49
+        });
     });
 
     parser.end(html);
-
-    assert.deepEqual(texts, ['Here is a title']);
 };
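The rewritten test asserts the exact merged location instead of only the extracted text. Since the pending text token survives tokenizer hibernation between chunks, the same merged location should also come out of a chunked write; the following sketch is hypothetical and not part of this PR's test suite:

// Hypothetical sketch: text arriving across two stream chunks should still
// surface as a single 'text' event whose location spans the whole run.
const SAXParser = require('parse5-sax-parser');
const assert = require('assert');

const parser = new SAXParser({ sourceCodeLocationInfo: true });
const texts = [];

parser.on('text', ({ text, sourceCodeLocation }) => {
    texts.push({
        text,
        start: sourceCodeLocation.startOffset,
        end: sourceCodeLocation.endOffset
    });
});

parser.write('<title>Here is ');   // pending text starts inside this chunk...
parser.write('a title</title>');   // ...and is extended, not restarted, here
parser.end();

// '<title>' is 7 characters, so the text run spans offsets 7..22.
assert.deepStrictEqual(texts, [{ text: 'Here is a title', start: 7, end: 22 }]);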