Skip to content

Commit 857938d

Browse files
committed
fix(mdream): avoid chunk splitting words
1 parent 7d21dad commit 857938d

File tree

2 files changed

+373
-9
lines changed

2 files changed

+373
-9
lines changed

packages/mdream/src/splitter.ts

Lines changed: 43 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -102,13 +102,18 @@ export function htmlToMarkdownSplitChunks(
102102
let currentHeaderText = ''
103103
let lineNumber = 1
104104
let lastChunkEndPosition = 0
105+
let lastSplitPosition = 0 // Track where we last split to avoid re-splitting
105106

106-
function flushChunk() {
107+
function flushChunk(endPosition?: number) {
107108
const currentMd = getCurrentMarkdown(processor.state)
108-
const chunkContent = currentMd.slice(lastChunkEndPosition)
109+
const chunkEnd = endPosition ?? currentMd.length
110+
const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd)
109111

110-
if (!chunkContent.trim())
112+
if (!chunkContent.trim()) {
113+
// Still update position to avoid infinite loop
114+
lastChunkEndPosition = chunkEnd
111115
return
116+
}
112117

113118
const chunk: MarkdownChunk = {
114119
content: chunkContent.trimEnd(),
@@ -141,13 +146,18 @@ export function htmlToMarkdownSplitChunks(
141146
// Reset code language for next chunk
142147
currentChunkCodeLanguage = ''
143148

144-
// Handle overlap
149+
// Track where we split (before applying overlap)
150+
lastSplitPosition = chunkEnd
151+
152+
// Handle overlap - ensure we always advance by at least 1 char
145153
if (opts.chunkOverlap > 0) {
146-
const overlapText = chunkContent.slice(-opts.chunkOverlap)
147-
lastChunkEndPosition = currentMd.length - overlapText.length
154+
// Cap overlap to (chunkContent.length - 1) to ensure forward progress
155+
const maxOverlap = Math.max(0, chunkContent.length - 1)
156+
const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap)
157+
lastChunkEndPosition = chunkEnd - actualOverlap
148158
}
149159
else {
150-
lastChunkEndPosition = currentMd.length
160+
lastChunkEndPosition = chunkEnd
151161
}
152162

153163
lineNumber += (chunkContent.match(/\n/g) || []).length
@@ -226,7 +236,30 @@ export function htmlToMarkdownSplitChunks(
226236
const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition))
227237

228238
if (currentChunkSize > opts.chunkSize) {
229-
flushChunk()
239+
// Find optimal split point using hierarchy of separators (like RecursiveCharacterTextSplitter)
240+
const idealSplitPos = lastChunkEndPosition + opts.chunkSize
241+
242+
// Ordered by preference: paragraph > code block > line > word
243+
const separators = ['\n\n', '```\n', '\n', ' ']
244+
let splitPosition = -1
245+
246+
for (const sep of separators) {
247+
// Find last occurrence of separator before/at ideal position
248+
const idx = currentMd.lastIndexOf(sep, idealSplitPos)
249+
const candidateSplitPos = idx + sep.length
250+
// Only use separator if split position would be beyond our last split
251+
if (idx >= 0 && candidateSplitPos > lastSplitPosition) {
252+
splitPosition = candidateSplitPos
253+
break
254+
}
255+
}
256+
257+
// If no separator found before ideal position, use current length (split now)
258+
if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) {
259+
splitPosition = currentMd.length
260+
}
261+
262+
flushChunk(splitPosition)
230263
}
231264
}
232265
})
@@ -275,7 +308,8 @@ export function htmlToMarkdownSplitChunks(
275308
}
276309
}
277310

278-
return chunks
311+
// Filter out empty chunks (can happen after header stripping)
312+
return chunks.filter(chunk => chunk.content.length > 0)
279313
}
280314

281315
export type { MarkdownChunk, SplitterOptions } from './types'

0 commit comments

Comments
 (0)