Skip to content

Commit 0780418

Browse files
committed
fix: better spacing
1 parent b9c2637 commit 0780418

File tree

15 files changed

+250
-175
lines changed

15 files changed

+250
-175
lines changed

scripts/crawl.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ const crawler = new PlaywrightCrawler({
1313
const html = await page.innerHTML('html')
1414
log.info('HTML length', { url: request.loadedUrl, length: html.length })
1515
const now = new Date()
16-
const md = syncHtmlToMarkdown(html, withMinimalPreset())
16+
const md = syncHtmlToMarkdown(html, withMinimalPreset({
17+
origin: new URL(request.loadedUrl).origin,
18+
}))
1719
log.info('Processed html -> md in', { url: request.loadedUrl, time: new Date() - now })
1820
// mkdir
1921
if (!existsSync('./output')) {

src/buffer-region.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ export function assembleBufferedContent(
8585
const fragments: string[] = []
8686

8787
// Then process all other regions (excluding frontmatter)
88-
for (const [regionId, content] of state.regionContentBuffers) {
88+
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) {
8989
// Check if region should be included
9090
const include = state.regionToggles.get(regionId)
9191
if (include) {

src/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ export function syncHtmlToMarkdown(
66
options: HTMLToMarkdownOptions = {},
77
): string {
88
// Initialize state
9-
const state: MdreamRuntimeState = {
9+
const state = {
1010
options,
11-
}
11+
} as MdreamRuntimeState
1212
const result = processPartialHTMLToMarkdown(html, state).chunk
1313
return result.trimEnd()
1414
}

src/markdown.ts

Lines changed: 93 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,27 @@
11
import type { ElementNode, HandlerContext, MdreamRuntimeState, NodeEvent, TextNode } from './types'
2+
3+
/** Characters after which no separating space is inserted (e.g. whitespace, `[`, `(`). */
const NO_SPACE_AFTER_CHARS: ReadonlySet<string> = new Set(['\n', ' ', '[', '>', '_', '*', '`', '|', '#', '<', '('])

/** Characters before which no separating space is inserted (e.g. whitespace, `*`, `_`). */
const NO_SPACE_BEFORE_CHARS: ReadonlySet<string> = new Set([' ', '\n', '\t', '_', '*', '`', '|', '>', '#'])

/**
 * Determines if spacing is needed between two characters.
 *
 * @param lastChar - Last character of the content already emitted.
 * @param firstChar - First character of the content about to be emitted.
 * @returns `true` when both characters are present, `lastChar` is not in the
 * "no space after" set and `firstChar` is not in the "no space before" set.
 */
function needsSpacing(lastChar: string, firstChar: string): boolean {
  // With nothing on one side there is nothing to separate.
  if (!lastChar || !firstChar) {
    return false
  }

  // The lookup sets are module-level so they are not rebuilt on every call.
  return !NO_SPACE_AFTER_CHARS.has(lastChar) && !NO_SPACE_BEFORE_CHARS.has(firstChar)
}
12+
13+
/**
 * Determines if spacing should be added before text content.
 *
 * @param lastChar - Last character already written to the output buffer, or `''` when there is none.
 * @param lastNode - Previously processed node; only `tagHandler.isInline` is read from it.
 * @param textNode - Text node about to be emitted; only the first character of `value` is inspected.
 * @returns `true` when a single space should be prepended to the text, otherwise `false`.
 */
function shouldAddSpacingBeforeText(lastChar: string, lastNode: any, textNode: TextNode): boolean {
  // An explicit guard (rather than `lastChar && ...`) keeps the result a real
  // boolean: `'' && x` evaluates to `''`, which is not assignable to `boolean`.
  if (!lastChar) {
    return false
  }

  // No space after existing whitespace or after an opening `[` / `>`.
  if (lastChar === '\n' || lastChar === ' ' || lastChar === '[' || lastChar === '>') {
    return false
  }

  // No space when the previous node's tag handler is flagged as inline.
  if (lastNode?.tagHandler?.isInline) {
    return false
  }

  // The text already starts with its own space.
  return textNode.value[0] !== ' '
}
225
import { collectNodeContent } from './buffer-region'
326
import {
427
DEFAULT_BLOCK_SPACING,
@@ -41,11 +64,23 @@ export function processHtmlEventToMarkdown(
4164
state: MdreamRuntimeState,
4265
): void {
4366
const { type: eventType, node } = event
44-
67+
const lastNode = state.lastNode
68+
state.lastNode = event.node
69+
const buff = state.regionContentBuffers.get(node.regionId || 0) || []
70+
const lastBuffEntry = buff[buff.length - 1]
71+
const lastChar = lastBuffEntry?.charAt(lastBuffEntry.length - 1) || ''
72+
// we need to see if it exists within buff[lastIndex] or buff[lastIndex - 1]
73+
let secondLastChar
74+
if (lastBuffEntry?.length > 1) {
75+
secondLastChar = lastBuffEntry.charAt(lastBuffEntry.length - 2)
76+
}
77+
else {
78+
secondLastChar = buff[buff.length - 2]?.charAt(buff[buff.length - 2].length - 1)
79+
}
4580
// Handle text nodes
4681
if (node.type === TEXT_NODE && eventType === NodeEventEnter) {
4782
const textNode = node as TextNode
48-
state.lastNewLines = 0
83+
4984
if (textNode.value) {
5085
// Process text node with plugins
5186
if (state.plugins?.length) {
@@ -58,6 +93,16 @@ export function processHtmlEventToMarkdown(
5893
textNode.value = pluginResult.content
5994
}
6095
}
96+
// Skip leading spaces after newlines
97+
if (textNode.value === ' ' && lastChar === '\n') {
98+
return
99+
}
100+
101+
// Add spacing before text if needed
102+
if (shouldAddSpacingBeforeText(lastChar, lastNode, textNode)) {
103+
textNode.value = ` ${textNode.value}`
104+
}
105+
61106
collectNodeContent(textNode, textNode.value, state)
62107
}
63108
state.lastTextNode = textNode
@@ -89,6 +134,14 @@ export function processHtmlEventToMarkdown(
89134
output.push(...results)
90135
}
91136

137+
let lastNewLines = 0
138+
if (lastChar === '\n') {
139+
lastNewLines++
140+
}
141+
if (secondLastChar === '\n') {
142+
lastNewLines++
143+
}
144+
92145
const eventFn = eventType === NodeEventEnter ? 'enter' : 'exit'
93146
// Use the cached tag handler directly from the node
94147
const handler = node.tagHandler
@@ -99,82 +152,58 @@ export function processHtmlEventToMarkdown(
99152
}
100153
}
101154

102-
// Trim trailing whitespace from the last text node
103-
if (!state.lastNewLines && lastFragment && state.lastTextNode?.containsWhitespace && !!node.parent && 'value' in state.lastTextNode && typeof state.lastTextNode.value === 'string') {
104-
if (!node.parent.depthMap[TAG_PRE] || node.parent.tagId === TAG_PRE) {
105-
const originalLength = lastFragment.length
106-
const trimmed = lastFragment.trimEnd()
107-
const trimmedChars = originalLength - trimmed.length
108-
109-
// Update the last content in buffer regions with trimmed content
110-
if (trimmedChars > 0) {
111-
for (const buffer of Array.from(state.regionContentBuffers.values())) {
112-
if (buffer.length > 0 && buffer[buffer.length - 1] === lastFragment) {
113-
buffer[buffer.length - 1] = trimmed
114-
break
115-
}
116-
}
117-
}
118-
119-
state.lastTextNode = undefined
120-
}
121-
}
122-
123155
// Handle newlines
124156
const newLineConfig = calculateNewLineConfig(node as ElementNode)
125-
let newLines = newLineConfig[eventType] || 0
157+
const newLines = Math.max(0, (newLineConfig[eventType] || 0) - lastNewLines)
126158

127159
if (newLines > 0) {
128-
// Initialize lastNewLines if undefined
129-
state.lastNewLines ??= 0
130-
131-
// Adjust count based on existing newlines
132-
newLines = Math.max(0, newLines - state.lastNewLines)
160+
// if the region has no content, add the current content (without new lines)
161+
if (!buff.length) {
162+
for (const fragment of output) {
163+
collectNodeContent(node, fragment, state)
164+
}
165+
return
166+
}
133167

134-
// Handle enter events with content
135-
if (eventType === NodeEventEnter && output.length) {
136-
state.lastNewLines = 0
168+
// Add newlines
169+
const newlinesStr = '\n'.repeat(newLines)
170+
// trim only whitespace
171+
if (lastChar === ' ' && buff?.length) {
172+
buff[buff.length - 1] = buff[buff.length - 1].substring(0, buff[buff.length - 1].length - 1)
137173
}
138-
if (newLines > 0) {
139-
if (!state.regionContentBuffers.get(event.node.regionId || 0)?.length) {
140-
for (const fragment of output) {
141-
collectNodeContent(node, fragment, state)
142-
}
143-
return
144-
}
145-
// Update state for non-enter events
146-
if (eventType !== NodeEventEnter || !output.length) {
147-
state.lastNewLines = newLines
148-
}
149174

150-
// Add newlines
151-
const newlinesStr = '\n'.repeat(newLines)
152-
// trim only whitespace
153-
if (lastFragment && typeof lastFragment === 'string' && lastFragment.length > 0) {
154-
const lastChar = lastFragment.charAt(lastFragment.length - 1)
155-
if (lastChar === ' ') {
156-
// Update the last content in buffer regions with trimmed content
157-
for (const buffer of Array.from(state.regionContentBuffers.values())) {
158-
if (buffer.length > 0 && buffer[buffer.length - 1] === lastFragment) {
159-
buffer[buffer.length - 1] = lastFragment.substring(0, lastFragment.length - 1)
160-
break
161-
}
175+
if (eventType === NodeEventEnter) {
176+
output.unshift(newlinesStr)
177+
}
178+
else {
179+
output.push(newlinesStr)
180+
}
181+
}
182+
else {
183+
// trim whitespace between inline output
184+
// Trim trailing whitespace from the last text node
185+
if (lastFragment && state.lastTextNode?.containsWhitespace && !!node.parent && 'value' in state.lastTextNode && typeof state.lastTextNode.value === 'string') {
186+
if (!node.parent.depthMap[TAG_PRE] || node.parent.tagId === TAG_PRE) {
187+
const originalLength = lastFragment.length
188+
const trimmed = lastFragment.trimEnd()
189+
const trimmedChars = originalLength - trimmed.length
190+
191+
// Update the last content in buffer regions with trimmed content
192+
if (trimmedChars > 0) {
193+
if (buff?.length && buff[buff.length - 1] === lastFragment) {
194+
buff[buff.length - 1] = trimmed
162195
}
163196
}
164-
}
165197

166-
if (eventType === NodeEventEnter) {
167-
output.unshift(newlinesStr)
168-
}
169-
else {
170-
output.push(newlinesStr)
198+
state.lastTextNode = undefined
171199
}
172200
}
173201
}
174-
else {
175-
state.lastNewLines = 0
176-
}
177202

203+
// Add spacing between inline elements if needed
204+
if (output[0]?.[0] && eventType === NodeEventEnter && lastChar && needsSpacing(lastChar, output[0][0])) {
205+
collectNodeContent(node, ' ', state)
206+
}
178207
// Calculate total length of output fragments before adding to the main fragments
179208
for (const fragment of output) {
180209
collectNodeContent(node, fragment, state)

0 commit comments

Comments
 (0)