Skip to content

Commit ed515ef

Browse files
committed
fix: allow hooking into script content
1 parent ba2dded commit ed515ef

File tree

5 files changed

+33
-22
lines changed

5 files changed

+33
-22
lines changed

CLAUDE.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ The plugin system allows you to customize HTML to Markdown conversion by hooking
7070

7171
- `beforeNodeProcess`: Called before any node processing, can skip nodes
7272
- `onNodeEnter`: Called when entering an element node
73-
- `onNodeExit`: Called when exiting an element node
73+
- `onNodeExit`: Called when exiting an element node
7474
- `processTextNode`: Called for each text node
7575
- `processAttributes`: Called to process element attributes
7676

@@ -88,7 +88,7 @@ export function myPlugin() {
8888
return '**Custom content:** '
8989
}
9090
},
91-
91+
9292
processTextNode(textNode) {
9393
// Transform text content
9494
if (textNode.value.includes('TODO')) {
@@ -104,14 +104,14 @@ export function myPlugin() {
104104
```typescript
105105
export function headerExtractPlugin() {
106106
const headers: string[] = []
107-
107+
108108
return createPlugin({
109109
onNodeEnter(element) {
110110
if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(element.tagName)) {
111111
// Will collect text in processTextNode
112112
}
113113
},
114-
114+
115115
processTextNode(textNode) {
116116
const parent = textNode.parent
117117
if (parent && parent.tagName?.match(/^h[1-6]$/)) {
@@ -129,13 +129,13 @@ export function adBlockPlugin() {
129129
return createPlugin({
130130
beforeNodeProcess(event) {
131131
const { node } = event
132-
132+
133133
if (node.type === ELEMENT_NODE) {
134134
const element = node as ElementNode
135-
135+
136136
// Skip ads and promotional content
137-
if (element.attributes?.class?.includes('ad') ||
138-
element.attributes?.id?.includes('promo')) {
137+
if (element.attributes?.class?.includes('ad')
138+
|| element.attributes?.id?.includes('promo')) {
139139
return { skip: true }
140140
}
141141
}
@@ -180,4 +180,3 @@ export function adBlockPlugin() {
180180
- Real-world test fixtures in `test/fixtures/` (GitHub, Wikipedia HTML)
181181
- Template tests for complex HTML structures (navigation, tables, etc.)
182182
- Always run tests after making changes to ensure backward compatibility
183-

README.md

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ The plugin system allows you to customize HTML to Markdown conversion by hooking
143143
#### Plugin Hooks
144144

145145
- `beforeNodeProcess`: Called before any node processing, can skip nodes
146-
- `onNodeEnter`: Called when entering an element node
146+
- `onNodeEnter`: Called when entering an element node
147147
- `onNodeExit`: Called when exiting an element node
148148
- `processTextNode`: Called for each text node
149149
- `processAttributes`: Called to process element attributes
@@ -153,23 +153,23 @@ The plugin system allows you to customize HTML to Markdown conversion by hooking
153153
Use `createPlugin()` to create a plugin with type safety:
154154

155155
```ts
156+
import type { ElementNode, TextNode } from 'mdream'
156157
import { htmlToMarkdown } from 'mdream'
157158
import { createPlugin } from 'mdream/plugins'
158-
import type { ElementNode, TextNode } from 'mdream'
159159

160160
const myPlugin = createPlugin({
161161
onNodeEnter(node: ElementNode): string | undefined {
162162
if (node.name === 'h1') {
163163
return '🔥 '
164164
}
165165
},
166-
167-
processTextNode(textNode: TextNode): { content: string; skip: boolean } | undefined {
166+
167+
processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
168168
// Transform text content
169169
if (textNode.parent?.attributes?.id === 'highlight') {
170-
return {
171-
content: `**${textNode.value}**`,
172-
skip: false
170+
return {
171+
content: `**${textNode.value}**`,
172+
skip: false
173173
}
174174
}
175175
}
@@ -183,19 +183,19 @@ const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
183183
#### Example: Content Filter Plugin
184184

185185
```ts
186+
import type { ElementNode, NodeEvent } from 'mdream'
186187
import { ELEMENT_NODE } from 'mdream'
187188
import { createPlugin } from 'mdream/plugins'
188-
import type { NodeEvent, ElementNode } from 'mdream'
189189

190190
const adBlockPlugin = createPlugin({
191191
beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
192192
const { node } = event
193-
193+
194194
if (node.type === ELEMENT_NODE && node.name === 'div') {
195195
const element = node as ElementNode
196196
// Skip ads and promotional content
197-
if (element.attributes?.class?.includes('ad') ||
198-
element.attributes?.id?.includes('promo')) {
197+
if (element.attributes?.class?.includes('ad')
198+
|| element.attributes?.id?.includes('promo')) {
199199
return { skip: true }
200200
}
201201
}
@@ -208,8 +208,8 @@ const adBlockPlugin = createPlugin({
208208
Extract specific elements and their content during HTML processing for data analysis or content discovery:
209209

210210
```ts
211-
import { extractionPlugin, htmlToMarkdown } from 'mdream'
212211
import type { ExtractedElement } from 'mdream/plugins'
212+
import { extractionPlugin, htmlToMarkdown } from 'mdream'
213213

214214
const html: string = `
215215
<article>

src/markdown.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,12 @@ export function processHtmlEventToMarkdown(
9393
textNode.value = pluginResult.content
9494
}
9595
}
96+
97+
// Skip text nodes that are excluded from markdown output (e.g., script/style content)
98+
if (textNode.excludedFromMarkdown) {
99+
return
100+
}
101+
96102
// Skip leading spaces after newlines
97103
if (textNode.value === ' ' && lastChar === '\n') {
98104
return

src/parser.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,10 +280,13 @@ function processTextBuffer(textBuffer: string, state: MdreamProcessingState, han
280280
const containsWhitespace = state.textBufferContainsWhitespace
281281
state.textBufferContainsNonWhitespace = false
282282
state.textBufferContainsWhitespace = false
283-
if (!state.currentNode || state.currentNode?.tagHandler?.excludesTextNodes) {
283+
if (!state.currentNode) {
284284
return
285285
}
286286

287+
// Check if this element excludes text nodes from markdown output
288+
const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes
289+
287290
// Check if parent is a <pre> tag to handle whitespace properly
288291
const inPreTag = state.depthMap[TAG_PRE] > 0
289292

@@ -325,6 +328,7 @@ function processTextBuffer(textBuffer: string, state: MdreamProcessingState, han
325328
index: state.currentNode.currentWalkIndex!++,
326329
depth: state.depth,
327330
containsWhitespace,
331+
excludedFromMarkdown: excludesTextNodes,
328332
}
329333

330334
for (const parent of parentsToIncrement) {

src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ export interface TextNode extends Node {
9393
value: string
9494
/** Custom data added by plugins */
9595
context?: Record<string, any>
96+
/** Whether this text node should be excluded from markdown output (for script/style elements) */
97+
excludedFromMarkdown?: boolean
9698
}
9799

98100
/**

0 commit comments

Comments
 (0)