@@ -102,13 +102,18 @@ export function htmlToMarkdownSplitChunks(
102102 let currentHeaderText = ''
103103 let lineNumber = 1
104104 let lastChunkEndPosition = 0
105+ let lastSplitPosition = 0 // Track where we last split to avoid re-splitting
105106
106- function flushChunk ( ) {
107+ function flushChunk ( endPosition ?: number ) {
107108 const currentMd = getCurrentMarkdown ( processor . state )
108- const chunkContent = currentMd . slice ( lastChunkEndPosition )
109+ const chunkEnd = endPosition ?? currentMd . length
110+ const chunkContent = currentMd . slice ( lastChunkEndPosition , chunkEnd )
109111
110- if ( ! chunkContent . trim ( ) )
112+ if ( ! chunkContent . trim ( ) ) {
113+ // Still update position to avoid infinite loop
114+ lastChunkEndPosition = chunkEnd
111115 return
116+ }
112117
113118 const chunk : MarkdownChunk = {
114119 content : chunkContent . trimEnd ( ) ,
@@ -141,13 +146,18 @@ export function htmlToMarkdownSplitChunks(
141146 // Reset code language for next chunk
142147 currentChunkCodeLanguage = ''
143148
144- // Handle overlap
149+ // Track where we split (before applying overlap)
150+ lastSplitPosition = chunkEnd
151+
152+ // Handle overlap - ensure we always advance by at least 1 char
145153 if ( opts . chunkOverlap > 0 ) {
146- const overlapText = chunkContent . slice ( - opts . chunkOverlap )
147- lastChunkEndPosition = currentMd . length - overlapText . length
154+ // Cap overlap to (chunkContent.length - 1) to ensure forward progress
155+ const maxOverlap = Math . max ( 0 , chunkContent . length - 1 )
156+ const actualOverlap = Math . min ( opts . chunkOverlap , maxOverlap )
157+ lastChunkEndPosition = chunkEnd - actualOverlap
148158 }
149159 else {
150- lastChunkEndPosition = currentMd . length
160+ lastChunkEndPosition = chunkEnd
151161 }
152162
153163 lineNumber += ( chunkContent . match ( / \n / g) || [ ] ) . length
@@ -226,7 +236,30 @@ export function htmlToMarkdownSplitChunks(
226236 const currentChunkSize = opts . lengthFunction ( currentMd . slice ( lastChunkEndPosition ) )
227237
228238 if ( currentChunkSize > opts . chunkSize ) {
229- flushChunk ( )
239+ // Find optimal split point using hierarchy of separators (like RecursiveCharacterTextSplitter)
240+ const idealSplitPos = lastChunkEndPosition + opts . chunkSize
241+
242+ // Ordered by preference: paragraph > code block > line > word
243+ const separators = [ '\n\n' , '```\n' , '\n' , ' ' ]
244+ let splitPosition = - 1
245+
246+ for ( const sep of separators ) {
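247+ // Find the last occurrence of the separator that starts at or before the ideal position (the split point after it may land up to sep.length past the ideal position)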
248+ const idx = currentMd . lastIndexOf ( sep , idealSplitPos )
249+ const candidateSplitPos = idx + sep . length
250+ // Only use separator if split position would be beyond our last split
251+ if ( idx >= 0 && candidateSplitPos > lastSplitPosition ) {
252+ splitPosition = candidateSplitPos
253+ break
254+ }
255+ }
256+
257+ // If no usable separator was found, or the split would not advance past the last chunk end, split at the current end of the markdown (split now)
258+ if ( splitPosition === - 1 || splitPosition <= lastChunkEndPosition ) {
259+ splitPosition = currentMd . length
260+ }
261+
262+ flushChunk ( splitPosition )
230263 }
231264 }
232265 } )
@@ -275,7 +308,8 @@ export function htmlToMarkdownSplitChunks(
275308 }
276309 }
277310
278- return chunks
311+ // Filter out empty chunks (can happen after header stripping)
312+ return chunks . filter ( chunk => chunk . content . length > 0 )
279313}
280314
281315export type { MarkdownChunk , SplitterOptions } from './types'
0 commit comments