Skip to content

Commit d1fa990

Browse files
committed
fix(mdream): broken chunking overlap apply
1 parent 38d524f commit d1fa990

File tree

3 files changed

+551
-134
lines changed

3 files changed

+551
-134
lines changed

packages/mdream/src/splitter.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ export function htmlToMarkdownSplitChunks(
104104
let lastChunkEndPosition = 0
105105
let lastSplitPosition = 0 // Track where we last split to avoid re-splitting
106106

107-
function flushChunk(endPosition?: number) {
107+
function flushChunk(endPosition?: number, applyOverlap = false) {
108108
const currentMd = getCurrentMarkdown(processor.state)
109109
const chunkEnd = endPosition ?? currentMd.length
110110
const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd)
@@ -149,8 +149,8 @@ export function htmlToMarkdownSplitChunks(
149149
// Track where we split (before applying overlap)
150150
lastSplitPosition = chunkEnd
151151

152-
// Handle overlap - ensure we always advance by at least 1 char
153-
if (opts.chunkOverlap > 0) {
152+
// Handle overlap - only for size-based splits, not structural splits
153+
if (applyOverlap && opts.chunkOverlap > 0) {
154154
// Cap overlap to (chunkContent.length - 1) to ensure forward progress
155155
const maxOverlap = Math.max(0, chunkContent.length - 1)
156156
const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap)
@@ -276,7 +276,7 @@ export function htmlToMarkdownSplitChunks(
276276
splitPosition = currentMd.length
277277
}
278278

279-
flushChunk(splitPosition)
279+
flushChunk(splitPosition, true)
280280
}
281281
}
282282
})

0 commit comments

Comments
 (0)