In [12]:
def create_sections_Markdown(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
):
    """Split the page texts on Markdown headers and build one record per chunk.

    The texts in ``page_map`` are concatenated (one newline appended per page),
    split with ``MarkdownHeaderTextSplitter`` on header levels 1-6, and each
    resulting chunk is turned into a dict describing that section.

    Args:
        category_id: Value stored under the ``'category'`` key of every record.
        blob_name: Source name. Used as ``'title'``, as the prefix of
            ``'sourcepage'``, and (with spaces and dots replaced by
            underscores) as the prefix of ``'id'``.
        page_map: Iterable of 3-tuples ``(page_num, _, text)``. The middle
            element is ignored by this function.
        mode, language, blob_Connection_String, blob_container_name,
        base_threshold, buffer_percent, overlap_sent_count: Not used by this
            function; kept so existing positional callers keep working.

    Returns:
        A list of dicts with the keys ``'id'``, ``'title'``, ``'category'``,
        ``'sourcepage'`` and ``'content'``. The page label inside ``'id'`` and
        ``'sourcepage'`` is ``"<first>-<last>"`` for the pages the chunk's text
        was found on, or a single page number when both are the same.
        Returns ``[]`` if splitting or record building raises; the error is
        reported through ``queue_logger``.
    """
    # Function-scope import so this cell does not rely on an import elsewhere.
    from bisect import bisect_right

    chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")

    # Concatenate the pages once and remember the character offset at which
    # each page starts, so any position in all_text can be mapped to a page.
    page_numbers = []
    page_offsets = []
    text_parts = []
    offset = 0
    for page_num, _, text in page_map:
        page_numbers.append(page_num)
        page_offsets.append(offset)
        text_parts.append(text + "\n")
        offset += len(text) + 1
    all_text = "".join(text_parts)

    def page_at(position):
        """Return the number of the page whose text covers ``position``."""
        slot = bisect_right(page_offsets, position) - 1
        return page_numbers[max(slot, 0)]

    # Header markers "#" .. "######" mapped to metadata keys "Header 1" .. "Header 6".
    headers_to_split_on = [
        ("#" * level, f"Header {level}") for level in range(1, 7)
    ]

    input_data = []
    try:
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = markdown_splitter.split_text(all_text)

        # Chunks come back in document order, so searching forward from where
        # the previous match ended keeps repeated text from resolving to an
        # earlier occurrence.
        cursor = 0
        for idx, chunk in enumerate(chunks):
            metadata = chunk.metadata

            # NOTE(review): the splitter may return text that is not a verbatim
            # substring of all_text (e.g. whitespace-normalised lines), so the
            # chunk is located line by line instead of with a single find().
            first_pos = None
            last_pos = None
            for raw_line in chunk.page_content.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                found = all_text.find(line, cursor)
                if found == -1:
                    continue  # line was altered by the splitter; skip it
                if first_pos is None:
                    first_pos = found
                cursor = found + len(line)
                last_pos = cursor - 1

            if not page_numbers:
                page_range = "1"  # no pages to attribute to; keep the historical default
            else:
                if first_pos is None:
                    # Nothing could be matched: attribute the chunk to the page
                    # at which the previous chunk ended.
                    first_pos = last_pos = min(cursor, len(all_text) - 1)
                first_page = page_at(first_pos)
                last_page = page_at(last_pos)
                if first_page != last_page:
                    page_range = f"{first_page}-{last_page}"
                else:
                    page_range = str(first_page)

            # Re-attach the header trail (outermost first) in front of the body.
            content = ""
            for header_level in range(1, 7):
                header_key = f"Header {header_level}"
                if header_key in metadata:
                    content += "#" * header_level + " " + metadata[header_key] + "\n"
            content += chunk.page_content

            input_data.append({
                'id': f"{chunk_id_prefix}_{page_range}_{idx}",
                'title': blob_name,
                'category': category_id,
                'sourcepage': f"{blob_name}::{page_range}",
                'content': content.strip()
            })

    except Exception as e:
        queue_logger.error(f"Error processing file '{blob_name}': {str(e)}")
        return []

    return input_data


In [13]:
import ast
import json

# Metadata and tuning values handed to create_sections_Markdown for this run.
category_id = "FormRechonizer"
blob_name = "TestCase1"
mode = "search"
language = "en"
blob_Connection_String = "dummy"
blob_container_name = "dummy"
base_threshold = 10000
buffer_percent = 10
overlap_sent_count = 2

# Load the whole Markdown file as one string; the utf-8-sig codec drops a
# leading byte-order mark if the file has one.
with open("page_map_FormRechonizer.md", "r", encoding="utf-8-sig") as file:
    page_map_content = file.read()

# Treat the entire file as a single page (page number 1, offset 0).
# If the file instead holds a literal list such as [(1, 0, 'text'), ...],
# it could be parsed with ast.literal_eval(page_map_content).
page_map = [(1, 0, page_map_content)]

# Build the header-based sections for this test document.
result_markdown = create_sections_Markdown(
    category_id=category_id,
    blob_name=blob_name,
    page_map=page_map,
    mode=mode,
    language=language,
    blob_Connection_String=blob_Connection_String,
    blob_container_name=blob_container_name,
    base_threshold=base_threshold,
    buffer_percent=buffer_percent,
    overlap_sent_count=overlap_sent_count,
)

In [14]:
# Dump the generated sections beneath a banner so they are easy to spot
# in the cell output.
print(
    f"************************************* result_markdown *************************************\n{result_markdown}"
)


************************************* result_markdown *************************************
[{'id': 'TestCase1_1_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1', 'content': '<div align="center">\n<img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&height=150&section=header&text=Hemant%20Kumar&fontSize=40&fontColor=FFFFFF&animation=fadeIn&gradientColors=FF8B3D,00C4CC" alt="header"/>  \n<h1 style="color: #FF8B3D; font-family: \'JetBrains Mono\', monospace;">\n<img src="https://readme-typing-svg.herokuapp.com?font=JetBrains+Mono&weight=700&size=30&duration=2500&color=FF8B3D&center=true&vCenter=true&width=600&lines=Hey%2C+I’m+Hemant!+👋;MERN+Stack+Maestro+✨;Crafting+Web+Magic+💻;Let’s+Build+Something+Great!+🤝" alt="typing"/>\n</h1>  \n<div style="margin: 20px 0;">\n<a href="https://x.com/HemantK66009549" target="_blank">\n<img src="https://img.shields.io/badge/X-FF8B3D?style=for-the-badge&logo=x&logoColor=white&labelColor=00C4CC" al