Skip to content

Commit 1695e10

Browse files
committed
fix: properly handle quotes
1 parent aa9305f commit 1695e10

File tree

5 files changed

+663
-7
lines changed

5 files changed

+663
-7
lines changed

src/parser.ts

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ const SPACE_CHAR = 32 // ' '
3131
const TAB_CHAR = 9 // '\t'
3232
const NEWLINE_CHAR = 10 // '\n'
3333
const CARRIAGE_RETURN_CHAR = 13 // '\r'
34+
const BACKTICK_CHAR = 96 // '`'
35+
const PIPE_CHAR = 124 // '|'
36+
const OPEN_BRACKET_CHAR = 91 // '['
37+
const CLOSE_BRACKET_CHAR = 93 // ']'
3438

3539
// Pre-allocate arrays and objects to reduce allocations
3640
const EMPTY_ATTRIBUTES: Record<string, string> = Object.freeze({})
@@ -79,6 +83,7 @@ export function parseHTML(htmlChunk: string, state: MdreamProcessingState, handl
7983
state.lastCharWasWhitespace ??= true // don't allow subsequent whitespace at start
8084
state.justClosedTag ??= false
8185
state.isFirstTextInElement ??= false
86+
state.lastCharWasBackslash ??= false
8287

8388
// Process chunk character by character
8489
let i = 0
@@ -121,39 +126,59 @@ export function parseHTML(htmlChunk: string, state: MdreamProcessingState, handl
121126
}
122127
state.lastCharWasWhitespace = true
123128
state.textBufferContainsWhitespace = true
129+
// Whitespace characters reset backslash state
130+
state.lastCharWasBackslash = false
124131
}
125132
else {
126133
state.textBufferContainsNonWhitespace = true
127134
state.lastCharWasWhitespace = false
128135
state.justClosedTag = false
129136
// pipe character
130-
if (currentCharCode === 124 && state.depthMap[TAG_TABLE]) {
137+
if (currentCharCode === PIPE_CHAR && state.depthMap[TAG_TABLE]) {
131138
// replace with encoded pipe character
132139
textBuffer += '\\|'
133140
}
134141
// if in code block we need to encode `
135-
else if (currentCharCode === 96 && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
142+
else if (currentCharCode === BACKTICK_CHAR && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
136143
// replace with encoded `
137144
textBuffer += '\\`'
138145
}
139146
// link open
140-
else if (currentCharCode === 91 && state.depthMap[TAG_A]) {
147+
else if (currentCharCode === OPEN_BRACKET_CHAR && state.depthMap[TAG_A]) {
141148
// replace with encoded [
142149
textBuffer += '\\['
143150
}
144151
// link close
145-
else if (currentCharCode === 93 && state.depthMap[TAG_A]) {
152+
else if (currentCharCode === CLOSE_BRACKET_CHAR && state.depthMap[TAG_A]) {
146153
// replace with encoded ]
147154
textBuffer += '\\]'
148155
}
149156
// blockquote
150-
else if (currentCharCode === 62 && state.depthMap[TAG_BLOCKQUOTE]) {
157+
else if (currentCharCode === GT_CHAR && state.depthMap[TAG_BLOCKQUOTE]) {
151158
// replace with encoded >
152159
textBuffer += '\\>'
153160
}
154161
else {
155162
textBuffer += htmlChunk[i]
156163
}
164+
165+
// Track quote state for non-nesting tags (script/style) - inline for performance
166+
if (state.currentNode?.tagHandler?.isNonNesting) {
167+
// Handle backslash escaping using state flag
168+
if (!state.lastCharWasBackslash) {
169+
// Toggle quote states
170+
if (currentCharCode === APOS_CHAR && !state.inDoubleQuote && !state.inBacktick) {
171+
state.inSingleQuote = !state.inSingleQuote
172+
} else if (currentCharCode === QUOTE_CHAR && !state.inSingleQuote && !state.inBacktick) {
173+
state.inDoubleQuote = !state.inDoubleQuote
174+
} else if (currentCharCode === BACKTICK_CHAR && !state.inSingleQuote && !state.inDoubleQuote) {
175+
state.inBacktick = !state.inBacktick
176+
}
177+
}
178+
}
179+
180+
// Update backslash state for next character
181+
state.lastCharWasBackslash = currentCharCode === BACKSLASH_CHAR
157182
}
158183
i++
159184
continue
@@ -188,6 +213,15 @@ export function parseHTML(htmlChunk: string, state: MdreamProcessingState, handl
188213
}
189214
// CLOSING TAG
190215
else if (nextCharCode === SLASH_CHAR) {
216+
// Check if we're inside quotes within a non-nesting tag
217+
const inQuotes = state.inSingleQuote || state.inDoubleQuote || state.inBacktick
218+
if (state.currentNode?.tagHandler?.isNonNesting && inQuotes) {
219+
// Inside quotes, treat this '<' as regular text and continue
220+
textBuffer += htmlChunk[i]
221+
i++
222+
continue
223+
}
224+
191225
// Process any text content before this tag
192226
if (textBuffer.length > 0) {
193227
processTextBuffer(textBuffer, state, handleEvent)
@@ -307,7 +341,7 @@ function processTextBuffer(textBuffer: string, state: MdreamProcessingState, han
307341
if (containsWhitespace && !firstBlockParent?.childTextNodeIndex) {
308342
// Trim leading whitespace if this is the first text node after an opening tag
309343
let start = 0
310-
while (start < text.length && (inPreTag ? (text.charCodeAt(start) === 10 || text.charCodeAt(start) === 13) : isWhitespace(text.charCodeAt(start)))) {
344+
while (start < text.length && (inPreTag ? (text.charCodeAt(start) === NEWLINE_CHAR || text.charCodeAt(start) === CARRIAGE_RETURN_CHAR) : isWhitespace(text.charCodeAt(start)))) {
311345
start++
312346
}
313347
if (start > 0) {
@@ -405,7 +439,7 @@ function processClosingTag(
405439
// Process the closing tag
406440
if (curr) {
407441
// we need to close all of the parent nodes we walked
408-
closeNode(state.currentNode, state, handleEvent)
442+
closeNode(curr, state, handleEvent)
409443
}
410444

411445
state.justClosedTag = true // Mark that we just processed a closing tag
@@ -453,6 +487,15 @@ function closeNode(node: ElementNode | null, state: MdreamProcessingState, handl
453487
state.depthMap[node.tagId] = Math.max(0, state.depthMap[node.tagId] - 1)
454488
}
455489

490+
// Clear non-nesting tag content tracking when closing non-nesting tags
491+
if (node.tagHandler?.isNonNesting) {
492+
// Reset quote state tracking
493+
state.inSingleQuote = false
494+
state.inDoubleQuote = false
495+
state.inBacktick = false
496+
state.lastCharWasBackslash = false
497+
}
498+
456499
// Depth handling now managed by plugins
457500

458501
state.depth--
@@ -610,6 +653,15 @@ function processOpeningTag(
610653
state.currentNode = parentNode
611654
state.hasEncodedHtmlEntity = false
612655

656+
// Reset quote/backslash tracking when opening a non-nesting tag (script/style)
657+
if (tagHandler?.isNonNesting && !result.selfClosing) {
658+
// Initialize quote state tracking
659+
state.inSingleQuote = false
660+
state.inDoubleQuote = false
661+
state.inBacktick = false
662+
state.lastCharWasBackslash = false
663+
}
664+
613665
if (result.selfClosing) {
614666
closeNode(tag, state, handleEvent)
615667
state.justClosedTag = true

src/types.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,14 @@ export interface MdreamProcessingState {
183183
/** Reference to the last processed text node - for context tracking */
184184
lastTextNode?: Node
185185

186+
/** Quote state tracking for non-nesting tags - avoids backward scanning */
187+
inSingleQuote?: boolean
188+
inDoubleQuote?: boolean
189+
inBacktick?: boolean
190+
191+
/** Backslash escaping state tracking - avoids checking previous character */
192+
lastCharWasBackslash?: boolean
193+
186194
/** Plugin instances array for efficient iteration */
187195
plugins?: Plugin[]
188196

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { htmlToMarkdown } from '../../../src/index.ts'
3+
4+
/**
 * Quote-handling suite for script/style content.
 *
 * Every fixture is a <script> or <style> element followed by a single
 * paragraph. Several fixtures place a closing tag inside a quoted string
 * (double quotes, single quotes, backticks, escaped quotes); the others use
 * ordinary balanced quotes. In all cases the converted output is expected to
 * be exactly the paragraph text.
 */
describe('quote handling in script/style tags', () => {
  // The single markdown string every fixture below is expected to produce.
  const EXPECTED_MARKDOWN = 'This should be rendered'

  // [test title, HTML input] pairs, registered in this order.
  // Fixture text is kept flush-left so the template literals contain no
  // additional leading whitespace.
  const fixtures: ReadonlyArray<readonly [title: string, html: string]> = [
    [
      'should not close script tag when closing tag is inside double quotes',
      `<script>
const html = "<script>alert('test')</script>";
console.log(html);
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should not close script tag when closing tag is inside single quotes',
      `<script>
const html = '<script>alert("test")</script>';
console.log(html);
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should not close script tag when closing tag is inside backticks',
      `<script>
const template = \`<script>alert("test")</script>\`;
console.log(template);
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should handle escaped quotes properly',
      `<script>
const html = "He said \\"<script>alert('test')</script>\\" to me";
console.log(html);
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should handle complex JSON with nested quotes in script tag',
      `<script type="application/json">
{"message": "He said \\"<script>alert('test')</script>\\" to me"}
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should properly close script tag when quotes are balanced',
      `<script>
const message = "Hello world";
console.log(message);
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should handle mixed quote types correctly',
      `<script>
const outer = "He said 'hello' to me";
const inner = 'She replied "goodbye" back';
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should handle style tags with quotes in CSS content',
      `<style>
.class:before { content: "</style>"; }
.other { color: red; }
</style>
<p>This should be rendered</p>`,
    ],
    [
      'should handle empty quotes',
      `<script>
const empty = "";
const alsoempty = '';
</script>
<p>This should be rendered</p>`,
    ],
    [
      'should handle multiline strings with closing tags',
      `<script>
const multiline = \`
<script>
alert('nested');
</script>
\`;
</script>
<p>This should be rendered</p>`,
    ],
  ]

  // One test per fixture: convert the HTML and compare against the paragraph text.
  for (const [title, html] of fixtures) {
    it(title, () => {
      expect(htmlToMarkdown(html)).toBe(EXPECTED_MARKDOWN)
    })
  }
})

0 commit comments

Comments
 (0)