@@ -54,7 +54,7 @@ function isWhitespace(charCode: number): boolean {
5454 */
5555export function parseHTML ( htmlChunk : string , state : MdreamProcessingState ) : {
5656 events : NodeEvent [ ]
57- partialHTML : string
57+ unprocessedHtml : string
5858} {
5959 const events : NodeEvent [ ] = [ ]
6060 let textBuffer = '' // Buffer to accumulate text content
@@ -186,7 +186,7 @@ export function parseHTML(htmlChunk: string, state: MdreamProcessingState): {
186186
187187 return {
188188 events,
189- partialHTML : textBuffer ,
189+ unprocessedHtml : textBuffer ,
190190 }
191191}
192192
@@ -806,16 +806,18 @@ export function processPartialHTMLToMarkdown(
806806 state . buffer ??= ''
807807
808808 // Check for DOCTYPE at the beginning (optimized)
809- if ( ! state . buffer && partialHtml . charCodeAt ( 0 ) === LT_CHAR
810- && partialHtml . charCodeAt ( 1 ) === EXCLAMATION_CHAR ) {
811- state . processingHTMLDocument = true
809+ if ( ! state . buffer ) {
810+ partialHtml = partialHtml . trimStart ( )
811+ if ( partialHtml . charCodeAt ( 0 ) === LT_CHAR && partialHtml . charCodeAt ( 1 ) === EXCLAMATION_CHAR ) {
812+ state . processingHTMLDocument = true
813+ }
812814 }
813815
814816 state . options ??= { chunkSize }
815817
816818 // Parse HTML into a DOM tree with events
817819 // @ts -expect-error untyped
818- const { events, partialHTML } = parseHTML ( partialHtml , state )
820+ const { events, unprocessedHtml } = parseHTML ( partialHtml , state )
819821
820822 // Process events from the parser
821823 let chunk = ''
@@ -858,5 +860,5 @@ export function processPartialHTMLToMarkdown(
858860 }
859861 }
860862
861- return { chunk, remainingHTML : partialHTML }
863+ return { chunk, remainingHTML : unprocessedHtml }
862864}
0 commit comments