In [1]:
import re

def split_into_sentences(text):
	sentences = re.split(r'(?<=[.!?])\s+', text.strip())
	return [s.strip() for s in sentences if s.strip()]

def chunk_sentences_by_char_limit(sentences, limit):
	chunks = []
	current_chunk = ""
	for sentence in sentences:
		if len(current_chunk) + len(sentence) + 1 <= limit:
			current_chunk += " " + sentence if current_chunk else sentence
		else:
			chunks.append(current_chunk)
			current_chunk = sentence
	if current_chunk:
		chunks.append(current_chunk)
	return chunks



def create_sections_default(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	upper_limit = int(base_threshold + (base_threshold * buffer_percent / 100))
	lower_limit = int(base_threshold - (base_threshold * buffer_percent / 100))
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			print(f"Image error for '{blob_name}': {e}")
	else:
		# Normalize page_map: reduce (page_num, offset, text) → (page_num, text)
		normalized_map = []
		for item in page_map:
			if len(item) == 3:
				page_num, _, text = item
			else:
				raise ValueError(f"Unexpected page_map format: {item}")
			normalized_map.append((page_num, text))
		page_map = normalized_map

		i = 0
		total_pages = len(page_map)
		while i < total_pages:
			merged_pages = []
			merged_text = ""
			current_length = 0

			# Merge as many pages as needed to hit lower_limit (unless last page)
			merge_start = i
			while i < total_pages and current_length < lower_limit:
				page_num, page_text = page_map[i]
				sentences = split_into_sentences(page_text)
				clean_text = ' '.join(sentences)
				merged_text += (" " + clean_text if merged_text else clean_text)
				current_length = len(merged_text)
				merged_pages.append(page_num)
				i += 1

			# Special: If we're at the last page and merged_text still isn't enough, just use whatever is left

			if current_length == 0:
				break

			merged_sentences = split_into_sentences(merged_text)
			start_page = merged_pages[0]
			end_page = merged_pages[-1]
			page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)

			if current_length <= upper_limit:
				# Only one chunk, use merged range
				input_data.append({
					'id': f"{chunk_id_prefix}_{page_range}_0",
					'title': blob_name,
					'category': category_id,
					'sourcepage': f"{blob_name}::{page_range}",
					'content': merged_text
				})
			else:
				# Too big: chunk by sentences
				chunks = chunk_sentences_by_char_limit(merged_sentences, base_threshold)
				for idx, chunk in enumerate(chunks):
					if idx == 0:
						# First chunk gets merged range
						input_data.append({
							'id': f"{chunk_id_prefix}_{page_range}_0",
							'title': blob_name,
							'category': category_id,
							'sourcepage': f"{blob_name}::{page_range}",
							'content': chunk
						})
					else:
						# Subsequent chunks use only last page and increment id
						input_data.append({
							'id': f"{chunk_id_prefix}_{end_page}_{idx}",
							'title': blob_name,
							'category': category_id,
							'sourcepage': f"{blob_name}::{end_page}",
							'content': chunk
						})

	# --- Overlap logic here ---
	if overlap_sent_count > 0:
		for idx in range(1, len(input_data)):
			prev_sentences = split_into_sentences(input_data[idx - 1]['content'])
			to_prepend = ' '.join(prev_sentences[-overlap_sent_count:]) if overlap_sent_count <= len(prev_sentences) else ' '.join(prev_sentences)
			if not input_data[idx]['content'].startswith(to_prepend):
				input_data[idx]['content'] = to_prepend + " " + input_data[idx]['content']
	with open("input_data.txt", "w") as f:
		f.write(json.dumps(input_data, indent=4))
	return input_data


In [2]:
def create_sections_hybrid(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold=10000, buffer_percent=20, overlap_sent_count=3
):
    """
    Hybrid chunking method that combines fixed-size and semantic chunking.
    Args:
        category_id: Category ID for the document
        blob_name: Name of the blob file
        page_map: List of tuples containing (page_num, offset, text)
        mode: Processing mode
        language: Document language
        blob_Connection_String: Azure blob connection string
        blob_container_name: Azure blob container name
        base_threshold: Base size threshold for chunks
        buffer_percent: Percentage buffer for chunk size flexibility
        overlap_sent_count: Number of sentences to overlap between chunks
    """
    chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
    input_data = []

    # Handle image files
    if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
        try:
            image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
            for idx, content in enumerate(image_descriptions):
                input_data.append({
                    'id': f"{chunk_id_prefix}_{idx}",
                    'title': blob_name,
                    'category': category_id,
                    'sourcepage': blob_name_from_file_page(blob_name),
                    'content': content
                })
        except Exception as e:
            queue_logger.error(f"Image error for '{blob_name}': {e}")
        return input_data

    # Normalize page_map to (page_num, text) format
    normalized_map = []
    for item in page_map:
        if len(item) == 3:
            page_num, _, text = item
        else:
            raise ValueError(f"Unexpected page_map format: {item}")
        normalized_map.append((page_num, text))

    def is_semantic_boundary(text):
        """Check if text contains semantic boundaries like headers, lists, or tables."""
        semantic_markers = [
            r'^#+\s',  # Markdown headers
            r'^\d+\.\s',  # Numbered lists
            r'^[•\-\*]\s',  # Bullet points
            r'<table',  # HTML tables
            r'^\s*[A-Z][A-Z\s]+:',  # Section headers
            r'^\s*[IVX]+\.',  # Roman numerals
        ]
        return any(re.search(pattern, text, re.MULTILINE) for pattern in semantic_markers)

    def split_into_semantic_units(text):
        """Split text into semantic units based on natural boundaries."""
        # Split by double newlines first
        units = re.split(r'\n\s*\n', text)
        
        # Further split units that are too long
        result = []
        for unit in units:
            if len(unit) > base_threshold:
                # Split by single newlines if unit is too long
                sub_units = unit.split('\n')
                current_unit = ""
                for sub in sub_units:
                    if len(current_unit) + len(sub) + 1 <= base_threshold:
                        current_unit += "\n" + sub if current_unit else sub
                    else:
                        if current_unit:
                            result.append(current_unit)
                        current_unit = sub
                if current_unit:
                    result.append(current_unit)
            else:
                result.append(unit)
        return result

    def create_chunk_with_overlap(unit, prev_chunk=None):
        """Create a chunk with overlap from previous chunk if available."""
        if prev_chunk and overlap_sent_count > 0:
            prev_sentences = split_into_sentences(prev_chunk)
            overlap_text = ' '.join(prev_sentences[-overlap_sent_count:])
            if not unit.startswith(overlap_text):
                unit = overlap_text + " " + unit
        return unit

    # Process each page and create chunks
    current_chunk = ""
    current_pages = []
    prev_chunk = None

    for page_num, page_text in normalized_map:
        semantic_units = split_into_semantic_units(page_text)
        
        for unit in semantic_units:
            if is_semantic_boundary(unit):
                # If we have accumulated content, create a chunk
                if current_chunk:
                    chunk = create_chunk_with_overlap(current_chunk, prev_chunk)
                    if current_pages:
                        page_range = f"{current_pages[0]}-{current_pages[-1]}" if len(current_pages) > 1 else str(current_pages[0])
                        input_data.append({
                            'id': f"{chunk_id_prefix}_{page_range}_{len(input_data)}",
                            'title': blob_name,
                            'category': category_id,
                            'sourcepage': f"{blob_name}::{page_range}",
                            'content': chunk
                        })
                        prev_chunk = chunk
                    current_chunk = ""
                    current_pages = []

                # Create a new chunk for the semantic unit
                chunk = create_chunk_with_overlap(unit, prev_chunk)
                input_data.append({
                    'id': f"{chunk_id_prefix}_{page_num}_{len(input_data)}",
                    'title': blob_name,
                    'category': category_id,
                    'sourcepage': f"{blob_name}::{page_num}",
                    'content': chunk
                })
                prev_chunk = chunk
            else:
                if len(current_chunk) + len(unit) + 1 <= base_threshold:
                    current_chunk += "\n" + unit if current_chunk else unit
                    if page_num not in current_pages:
                        current_pages.append(page_num)
                else:
                    # Create chunk from accumulated content
                    if current_chunk:
                        chunk = create_chunk_with_overlap(current_chunk, prev_chunk)
                        page_range = f"{current_pages[0]}-{current_pages[-1]}" if len(current_pages) > 1 else str(current_pages[0])
                        input_data.append({
                            'id': f"{chunk_id_prefix}_{page_range}_{len(input_data)}",
                            'title': blob_name,
                            'category': category_id,
                            'sourcepage': f"{blob_name}::{page_range}",
                            'content': chunk
                        })
                        prev_chunk = chunk
                    
                    # Start new chunk with current unit
                    current_chunk = unit
                    current_pages = [page_num]

    # Handle any remaining content
    if current_chunk:
        chunk = create_chunk_with_overlap(current_chunk, prev_chunk)
        page_range = f"{current_pages[0]}-{current_pages[-1]}" if len(current_pages) > 1 else str(current_pages[0])
        input_data.append({
            'id': f"{chunk_id_prefix}_{page_range}_{len(input_data)}",
            'title': blob_name,
            'category': category_id,
            'sourcepage': f"{blob_name}::{page_range}",
            'content': chunk
        })

    return input_data


In [3]:
from langchain.text_splitter import TokenTextSplitter

def create_sections_token(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		# Combine all text from page_map into a single string
		all_text = ""
		page_ranges = []
		current_page = None
		start_page = None
		
		for page_num, _, text in page_map:
			if current_page is None:
				current_page = page_num
				start_page = page_num
			elif page_num != current_page + 1:
				# If there's a gap in page numbers, add the current range
				page_ranges.append((start_page, current_page))
				start_page = page_num
			current_page = page_num
			all_text += text + "\n"
		
		# Add the final page range
		if start_page is not None:
			page_ranges.append((start_page, current_page))

		# Initialize TokenTextSplitter
		text_splitter = TokenTextSplitter(
			chunk_size=base_threshold,
			chunk_overlap=int(base_threshold * buffer_percent / 100),
			encoding_name="cl100k_base"  # This is the encoding used by GPT models
		)

		# Split the text into chunks
		chunks = text_splitter.split_text(all_text)

		# Create input data entries for each chunk
		for idx, chunk in enumerate(chunks):
			# Find which page range this chunk belongs to
			chunk_start = all_text.find(chunk)
			chunk_end = chunk_start + len(chunk)
			
			# Find the corresponding page range
			current_pos = 0
			chunk_page_range = None
			for start_page, end_page in page_ranges:
				range_text = ""
				for page_num, _, text in page_map:
					if start_page <= page_num <= end_page:
						range_text += text + "\n"
						current_pos += len(text) + 1
						if current_pos >= chunk_start:
							chunk_page_range = (start_page, end_page)
							break
				if chunk_page_range:
					break

			if chunk_page_range:
				start_page, end_page = chunk_page_range
				page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
			else:
				page_range = "1"  # Default to page 1 if we can't determine the range

			input_data.append({
				'id': f"{chunk_id_prefix}_{page_range}_{idx}",
				'title': blob_name,
				'category': category_id,
				'sourcepage': f"{blob_name}::{page_range}",
				'content': chunk.strip()
			})

	return input_data


In [4]:
import nltk
from nltk.tokenize import TextTilingTokenizer

def create_sections_TextTilling(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
):
    chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
    input_data = []

    if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
        try:
            image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
            for idx, content in enumerate(image_descriptions):
                input_data.append({
                    'id': f"{chunk_id_prefix}_{idx}",
                    'title': blob_name,
                    'category': category_id,
                    'sourcepage': blob_name_from_file_page(blob_name),
                    'content': content
                })
        except Exception as e:
            print(f"Image error for '{blob_name}': {e}")
    else:
        # Combine all text from page_map into a single string
        all_text = ""
        page_ranges = []
        current_page = None
        start_page = None
        
        for page_num, _, text in page_map:
            if current_page is None:
                current_page = page_num
                start_page = page_num
            elif page_num != current_page + 1:
                page_ranges.append((start_page, current_page))
                start_page = page_num
            current_page = page_num
            all_text += text + "\n\n"  # Ensure paragraph breaks are present
        
        if start_page is not None:
            page_ranges.append((start_page, current_page))

        try:
            # Ensure there's enough text to process
            if len(all_text.strip()) < 50:
                print(f"Not enough text to split for '{blob_name}'")
                return []

            tt = TextTilingTokenizer(w=20, k=10)

            chunks = tt.tokenize(all_text)

            for idx, chunk in enumerate(chunks):
                chunk_start = all_text.find(chunk)
                chunk_end = chunk_start + len(chunk)
                
                current_pos = 0
                chunk_page_range = None
                for start_page, end_page in page_ranges:
                    range_text = ""
                    for page_num, _, text in page_map:
                        if start_page <= page_num <= end_page:
                            range_text += text + "\n"
                            current_pos += len(text) + 1
                            if current_pos >= chunk_start:
                                chunk_page_range = (start_page, end_page)
                                break
                    if chunk_page_range:
                        break

                if chunk_page_range:
                    start_page, end_page = chunk_page_range
                    page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
                else:
                    page_range = "1"

                cleaned_chunk = chunk.strip()

                if len(cleaned_chunk) > base_threshold:
                    sentences = nltk.sent_tokenize(cleaned_chunk)
                    current_chunk = ""
                    for sentence in sentences:
                        if len(current_chunk) + len(sentence) + 1 <= base_threshold:
                            current_chunk += " " + sentence if current_chunk else sentence
                        else:
                            if current_chunk:
                                input_data.append({
                                    'id': f"{chunk_id_prefix}_{page_range}_{idx}_sub",
                                    'title': blob_name,
                                    'category': category_id,
                                    'sourcepage': f"{blob_name}::{page_range}",
                                    'content': current_chunk.strip()
                                })
                            current_chunk = sentence
                    if current_chunk:
                        input_data.append({
                            'id': f"{chunk_id_prefix}_{page_range}_{idx}_sub",
                            'title': blob_name,
                            'category': category_id,
                            'sourcepage': f"{blob_name}::{page_range}",
                            'content': current_chunk.strip()
                        })
                else:
                    input_data.append({
                        'id': f"{chunk_id_prefix}_{page_range}_{idx}",
                        'title': blob_name,
                        'category': category_id,
                        'sourcepage': f"{blob_name}::{page_range}",
                        'content': cleaned_chunk
                    })

        except StopIteration:
            print(f"TextTilingTokenizer failed due to insufficient data in '{blob_name}'.")
            return []
        except Exception as e:
            print(f"Error processing file '{blob_name}' with TextTilingTokenizer: {str(e)}")
            return []

    return input_data


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def create_sections_recursive(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		try:
			# Combine all text from page_map
			all_text = ""
			page_ranges = {}  # Track which text belongs to which pages
			current_position = 0
			
			for page_num, _, text in page_map:
				# Clean and normalize text
				cleaned_text = re.sub(r'\s+', ' ', text).strip()
				if cleaned_text:
					all_text += cleaned_text + "\n\n"
					# Track the page range for this text
					end_position = current_position + len(cleaned_text)
					page_ranges[(current_position, end_position)] = page_num
					current_position = end_position + 2  # +2 for the "\n\n"

			if not all_text.strip():
				return []

			# Define regex patterns for PDF content
			patterns = [
				# Headers (numbered and unnumbered)
				r'(?:^|\n)(?:\d+\.\s*)?[A-Z][A-Za-z\s]{2,}(?:\n|$)',  # Headers starting with capital letters
				r'(?:^|\n)(?:\d+\.\d+\.\s*)?[A-Z][A-Za-z\s]{2,}(?:\n|$)',  # Sub-headers with numbers
				
				# Section breaks
				r'\n\s*[-–—]{3,}\s*\n',  # Horizontal lines
				r'\n\s*={3,}\s*\n',  # Double lines
				
				# Paragraph breaks
				r'\n\s*\n',  # Multiple newlines
				
				# Lists
				r'(?:^|\n)(?:\d+\.|\*|\-|\•)\s+',  # Numbered and bulleted lists
				
				# Tables (basic pattern)
				r'\n\s*\|.*\|\s*\n',  # Table rows
				
				# Page breaks (if preserved in text)
				r'\f',  # Form feed character
				
				# Common PDF artifacts
				r'Page \d+ of \d+',  # Page numbers
				r'©.*?All rights reserved',  # Copyright notices
			]

			# Initialize the text splitter with our patterns
			text_splitter = RecursiveCharacterTextSplitter(
				separators=patterns,
				chunk_size=base_threshold,
				chunk_overlap=int(base_threshold * buffer_percent / 100),
				length_function=len,
				is_separator_regex=True
			)

			# Split the text into chunks
			chunks = text_splitter.split_text(all_text)

			# Process each chunk and determine its page range
			for idx, chunk in enumerate(chunks):
				# Find the page range for this chunk
				chunk_start = all_text.find(chunk)
				chunk_end = chunk_start + len(chunk)
				
				# Find which pages this chunk spans
				chunk_pages = set()
				for (start, end), page in page_ranges.items():
					if (chunk_start <= end and chunk_end >= start):
						chunk_pages.add(page)
				
				if chunk_pages:
					start_page = min(chunk_pages)
					end_page = max(chunk_pages)
					page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
					
					# Clean the chunk text
					cleaned_chunk = re.sub(r'\s+', ' ', chunk).strip()
					
					input_data.append({
						'id': f"{chunk_id_prefix}_{page_range}_{idx}",
						'title': blob_name,
						'category': category_id,
						'sourcepage': f"{blob_name}::{page_range}",
						'content': cleaned_chunk
					})

		except Exception as e:
			queue_logger.error(f"Error processing file '{blob_name}' with regex-based chunking: {str(e)}")
			return []

	return input_data

In [6]:
from typing import List, Tuple, Dict
from dataclasses import dataclass
from langchain.schema import Document

@dataclass
class ChunkConfig:
    window_size: int
    stride: int
    min_chunk_size: int
    max_chunk_size: int

class SlidingWindowChunker:
    def __init__(self, config: ChunkConfig):
        self.config = config

    def split_text(self, text: str, page_info: Dict[int, Tuple[int, int]]) -> List[Tuple[str, List[int]]]:
        """
        Split text using sliding window approach while maintaining page information
        Returns list of (chunk_text, page_numbers) tuples
        """
        chunks = []
        text_length = len(text)
        start = 0

        while start < text_length:
            # Calculate window end position
            end = min(start + self.config.window_size, text_length)
            
            # If we're not at the end, try to find a good break point
            if end < text_length:
                # Look for sentence endings or paragraph breaks
                break_chars = ['. ', '! ', '? ', '\n\n']
                for char in break_chars:
                    last_break = text.rfind(char, start, end)
                    if last_break != -1 and last_break > start + self.config.min_chunk_size:
                        end = last_break + 1
                        break

            # Extract chunk
            chunk = text[start:end].strip()
            
            # Get pages this chunk spans
            chunk_pages = self._get_chunk_pages(chunk, start, end, page_info)
            
            if chunk_pages:
                chunks.append((chunk, chunk_pages))

            # Move window forward by stride
            start = start + self.config.stride

        return chunks

    def _get_chunk_pages(self, chunk: str, start: int, end: int, page_info: Dict[int, Tuple[int, int]]) -> List[int]:
        """Determine which pages the chunk spans"""
        pages = set()
        for page_num, (page_start, page_end) in page_info.items():
            if (start <= page_end and end >= page_start):
                pages.add(page_num)
        return sorted(list(pages))

def create_sections_sliding_window(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		try:
			# Combine all text and track page positions
			all_text = ""
			page_info = {}  # Maps page numbers to (start, end) positions in all_text
			current_position = 0

			for page_num, _, text in page_map:
				# Clean and normalize text
				cleaned_text = re.sub(r'\s+', ' ', text).strip()
				if cleaned_text:
					# Add page separator
					if all_text:
						all_text += "\n\n"
						current_position += 2
					
					# Record page start position
					page_start = current_position
					
					# Add page text
					all_text += cleaned_text
					current_position += len(cleaned_text)
					
					# Record page end position
					page_info[page_num] = (page_start, current_position)

			if not all_text.strip():
				return []

			# Configure sliding window chunker
			chunk_config = ChunkConfig(
				window_size=base_threshold,
				stride=int(base_threshold * (1 - buffer_percent/100)),  # Overlap based on buffer_percent
				min_chunk_size=int(base_threshold * 0.5),  # Minimum chunk size
				max_chunk_size=base_threshold
			)

			# Initialize chunker and split text
			chunker = SlidingWindowChunker(chunk_config)
			chunks = chunker.split_text(all_text, page_info)

			# Process chunks
			for idx, (chunk_text, pages) in enumerate(chunks):
				if not pages:
					continue

				start_page = min(pages)
				end_page = max(pages)
				page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)

				# Clean the chunk text
				cleaned_chunk = re.sub(r'\s+', ' ', chunk_text).strip()

				input_data.append({
					'id': f"{chunk_id_prefix}_{page_range}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': f"{blob_name}::{page_range}",
					'content': cleaned_chunk
				})

		except Exception as e:
			queue_logger.error(f"Error processing file '{blob_name}' with sliding window chunking: {str(e)}")
			return []

	return input_data

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def create_sections_recursive(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		# Combine all text from page_map into a single string
		all_text = ""
		page_ranges = []
		current_page = None
		start_page = None
		
		for page_num, _, text in page_map:
			if current_page is None:
				current_page = page_num
				start_page = page_num
			elif page_num != current_page + 1:
				# If there's a gap in page numbers, add the current range
				page_ranges.append((start_page, current_page))
				start_page = page_num
			current_page = page_num
			all_text += text + "\n"
		
		# Add the final page range
		if start_page is not None:
			page_ranges.append((start_page, current_page))

		# Initialize RecursiveCharacterTextSplitter
		text_splitter = RecursiveCharacterTextSplitter(
			chunk_size=base_threshold,
			chunk_overlap=int(base_threshold * buffer_percent / 100),
			length_function=len,
			separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
		)

		# Split the text into chunks
		chunks = text_splitter.split_text(all_text)

		# Create input data entries for each chunk
		for idx, chunk in enumerate(chunks):
			# Find which page range this chunk belongs to
			chunk_start = all_text.find(chunk)
			chunk_end = chunk_start + len(chunk)
			
			# Find the corresponding page range
			current_pos = 0
			chunk_page_range = None
			for start_page, end_page in page_ranges:
				range_text = ""
				for page_num, _, text in page_map:
					if start_page <= page_num <= end_page:
						range_text += text + "\n"
						current_pos += len(text) + 1
						if current_pos >= chunk_start:
							chunk_page_range = (start_page, end_page)
							break
				if chunk_page_range:
					break

			if chunk_page_range:
				start_page, end_page = chunk_page_range
				page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
			else:
				page_range = "1"  # Default to page 1 if we can't determine the range

			input_data.append({
				'id': f"{chunk_id_prefix}_{page_range}_{idx}",
				'title': blob_name,
				'category': category_id,
				'sourcepage': f"{blob_name}::{page_range}",
				'content': chunk.strip()
			})

	return input_data

In [8]:
import spacy
from langchain.text_splitter import SpacyTextSplitter
import re

def create_sections_spacy(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
):
    chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
    input_data = []

    print(f"\n=== 🏁 Start Processing: {blob_name} ===")
    print(f"🔍 Category ID: {category_id}")
    print(f"🔍 Mode: {mode}, Language: {language}")
    print(f"🔍 Thresholds: base={base_threshold}, buffer%={buffer_percent}, overlap_sents={overlap_sent_count}")
    print(f"🔍 page_map: {page_map}")

    # Fallback for empty page_map
    if not page_map:
        print("⚠️ Fallback: Empty page_map detected. Adding placeholder chunk.")
        input_data.append({
            'id': f"{chunk_id_prefix}_empty",
            'title': blob_name,
            'category': category_id,
            'sourcepage': blob_name,
            'content': "[No content extracted from page_map]"
        })
        return input_data

    # Load SpaCy model with fallback
    print("🧠 Loading SpaCy model...")
    try:
        nlp = spacy.load(f"{language}_core_news_sm")
        print(f"✅ SpaCy model loaded: {nlp.meta['name']}")
    except Exception as e:
        print(f"⚠️ Failed to load {language} model. Falling back to English. Error: {e}")
        nlp = spacy.load("en_core_web_sm")
        print(f"✅ Fallback SpaCy model loaded: {nlp.meta['name']}")

    # Initialize text splitter
    text_splitter = SpacyTextSplitter(
        pipeline="sentencizer",
        chunk_size=base_threshold,
        chunk_overlap=int(base_threshold * buffer_percent / 100)
    )
    print(f"🧩 Text Splitter initialized with chunk_size={base_threshold}, overlap={int(base_threshold * buffer_percent / 100)}")

    current_chunk = []
    current_pages = set()
    current_length = 0

    # 🚀 Process each page
    for idx, (page_num, _, text) in enumerate(page_map):
        print(f"\n📄 --- Processing page {page_num} (index {idx}) ---")
        if text is None:
            print(f"⚠️ Page {page_num} has no text (None). Skipping.")
            continue

        cleaned_text = re.sub(r'\s+', ' ', text).strip()
        print(f"📝 Original text: {text}")
        print(f"🧹 Cleaned text: {cleaned_text}")

        if not cleaned_text:
            print("⚠️ Cleaned text is empty. Skipping.")
            continue

        try:
            doc = nlp(cleaned_text)
            sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
            print(f"✂️ Extracted sentences: {sentences}")
        except Exception as e:
            print(f"❌ Error during SpaCy sentence splitting on page {page_num}: {e}")
            continue

        if not sentences:
            print(f"⚠️ No sentences extracted from page {page_num}.")
            continue

        for sentence in sentences:
            sentence_length = len(sentence)
            if current_length + sentence_length > base_threshold and current_chunk:
                chunk_text = " ".join(current_chunk)
                if chunk_text.strip():
                    start_page = min(current_pages)
                    end_page = max(current_pages)
                    page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
                    print(f"📦 Adding chunk ({page_range}): {chunk_text[:50]}...")
                    input_data.append({
                        'id': f"{chunk_id_prefix}_{page_range}_{len(input_data)}",
                        'title': blob_name,
                        'category': category_id,
                        'sourcepage': f"{blob_name}::{page_range}",
                        'content': chunk_text.strip()
                    })

                overlap_sentences = current_chunk[-overlap_sent_count:] if overlap_sent_count > 0 else []
                current_chunk = overlap_sentences
                current_length = sum(len(s) for s in overlap_sentences)
                current_pages = {end_page}

            current_chunk.append(sentence)
            current_length += sentence_length
            current_pages.add(page_num)

    # 📌 Add final chunk if any
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        if chunk_text.strip():
            start_page = min(current_pages)
            end_page = max(current_pages)
            page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
            print(f"📦 Adding FINAL chunk ({page_range}): {chunk_text[:50]}...")
            input_data.append({
                'id': f"{chunk_id_prefix}_{page_range}_{len(input_data)}",
                'title': blob_name,
                'category': category_id,
                'sourcepage': f"{blob_name}::{page_range}",
                'content': chunk_text.strip()
            })

    # ✅ Summary
    print(f"\n=== 🏁 Finished Processing {blob_name} ===")
    print(f"📊 Total Chunks Created: {len(input_data)}")
    for item in input_data:
        print(f"🧾 ID: {item['id']}, Source Page: {item['sourcepage']}, Content Preview: {item['content'][:50]}...")

    return input_data


In [9]:
%pip install -qU langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)
from typing import List, Dict, Tuple

def create_sections_code(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		try:
			# Get OpenAI API key from environment or Azure Key Vault
			try:
				openai_api_key = os.getenv("OPENAI_API_KEY")
				if not openai_api_key:
					# Try to get from Azure Key Vault
					openai_api_key = KeyVaultManager.fetch_tag_value_from_key_and_tag(
						DOCUAGENT_KEYVAULT_NAME, 
						'AZURE_OPENAI_API_KEY'
					)
			except Exception as e:
				queue_logger.error(f"Failed to get OpenAI API key: {str(e)}")
				return []

			# Map language to LangChain Language enum
			language_map = {
				'python': Language.PYTHON,
				'javascript': Language.JS,
				'typescript': Language.TS,
				'java': Language.JAVA,
				'cpp': Language.CPP,
				'csharp': Language.CSHARP,
				'php': Language.PHP,
				'ruby': Language.RUBY,
				'swift': Language.SWIFT,
				'go': Language.GO,
				'rust': Language.RUST,
				'scala': Language.SCALA,
				'kotlin': Language.KOTLIN,
				'html': Language.HTML,
				'css': Language.CSS,
				'markdown': Language.MARKDOWN,
				'latex': Language.LATEX,
				'sql': Language.SQL,
				'shell': Language.SHELL,
				'text': Language.TEXT
			}

			# Get the appropriate language enum or default to TEXT
			lang_enum = language_map.get(language.lower(), Language.TEXT)

			# Initialize RecursiveCharacterTextSplitter with language-specific settings
			text_splitter = RecursiveCharacterTextSplitter(
				language=lang_enum,
				chunk_size=base_threshold,
				chunk_overlap=int(base_threshold * buffer_percent / 100),
				separators=lang_enum.separators if hasattr(lang_enum, 'separators') else ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
			)

			# Combine all text while maintaining page tracking
			all_text = ""
			page_positions = []  # List of (start_pos, end_pos, page_num)
			current_pos = 0

			for page_num, _, text in page_map:
				cleaned_text = re.sub(r'\s+', ' ', text).strip()
				if cleaned_text:
					start_pos = current_pos
					all_text += cleaned_text + "\n\n"  # Add double newline for better separation
					current_pos = len(all_text)
					page_positions.append((start_pos, current_pos, page_num))

			# Split text using recursive splitter
			chunks = text_splitter.split_text(all_text)

			# Process each chunk and determine its page range
			for chunk_idx, chunk in enumerate(chunks):
				chunk_start = all_text.find(chunk)
				chunk_end = chunk_start + len(chunk)

				# Find pages that overlap with this chunk
				chunk_pages = set()
				for start_pos, end_pos, page_num in page_positions:
					if (chunk_start <= end_pos and chunk_end >= start_pos):
						chunk_pages.add(page_num)

				if chunk_pages:
					start_page = min(chunk_pages)
					end_page = max(chunk_pages)
					page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)

					# Add overlap from previous chunk if needed
					if overlap_sent_count > 0 and chunk_idx > 0:
						prev_chunk = chunks[chunk_idx - 1]
						prev_sentences = split_into_sentences(prev_chunk)
						overlap_text = ' '.join(prev_sentences[-overlap_sent_count:])
						if overlap_text and not chunk.startswith(overlap_text):
							chunk = overlap_text + " " + chunk

					# Clean up the chunk text
					cleaned_chunk = re.sub(r'\n{3,}', '\n\n', chunk.strip())  # Normalize newlines

					input_data.append({
						'id': f"{chunk_id_prefix}_{page_range}_{chunk_idx}",
						'title': blob_name,
						'category': category_id,
						'sourcepage': f"{blob_name}::{page_range}",
						'content': cleaned_chunk
					})

		except Exception as e:
			queue_logger.error(f"Error processing file '{blob_name}' with RecursiveCharacterTextSplitter: {str(e)}")
			return []

	return input_data


In [11]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
def create_sections_Markdown(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	# Check if file is a markdown file
	if not blob_name.lower().endswith('.md'):
		queue_logger.error(f"File '{blob_name}' is not a markdown file. Only .md files are supported.")
		return []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		queue_logger.error(f"Image files are not supported for markdown processing.")
		return []
	else:
		# Combine all text from page_map into a single string
		all_text = ""
		page_ranges = []
		current_page = None
		start_page = None
		
		for page_num, _, text in page_map:
			if current_page is None:
				current_page = page_num
				start_page = page_num
			elif page_num != current_page + 1:
				# If there's a gap in page numbers, add the current range
				page_ranges.append((start_page, current_page))
				start_page = page_num
			current_page = page_num
			all_text += text + "\n"
		
		# Add the final page range
		if start_page is not None:
			page_ranges.append((start_page, current_page))

		# Define headers to split on
		headers_to_split_on = [
			("#", "Header 1"),
			("##", "Header 2"),
			("###", "Header 3"),
			("####", "Header 4"),
			("#####", "Header 5"),
			("######", "Header 6"),
		]

		try:
			# Initialize MarkdownHeaderTextSplitter
			markdown_splitter = MarkdownHeaderTextSplitter(
				headers_to_split_on=headers_to_split_on
			)

			# Split the text into chunks based on headers
			chunks = markdown_splitter.split_text(all_text)

			# Create input data entries for each chunk
			for idx, chunk in enumerate(chunks):
				# Get the metadata (headers) for this chunk
				metadata = chunk.metadata
				
				# Find which page range this chunk belongs to
				chunk_start = all_text.find(chunk.page_content)
				chunk_end = chunk_start + len(chunk.page_content)
				
				# Find the corresponding page range
				current_pos = 0
				chunk_page_range = None
				for start_page, end_page in page_ranges:
					range_text = ""
					for page_num, _, text in page_map:
						if start_page <= page_num <= end_page:
							range_text += text + "\n"
							current_pos += len(text) + 1
							if current_pos >= chunk_start:
								chunk_page_range = (start_page, end_page)
								break
					if chunk_page_range:
						break

				if chunk_page_range:
					start_page, end_page = chunk_page_range
					page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
				else:
					page_range = "1"  # Default to page 1 if we can't determine the range

				# Create the content with headers included
				content = ""
				for header_level in range(1, 7):
					header_key = f"Header {header_level}"
					if header_key in metadata:
						content += "#" * header_level + " " + metadata[header_key] + "\n"
				content += chunk.page_content

				input_data.append({
					'id': f"{chunk_id_prefix}_{page_range}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': f"{blob_name}::{page_range}",
					'content': content.strip()
				})

		except Exception as e:
			print(f"Error processing markdown file '{blob_name}': {str(e)}")
			return []

	return input_data

In [12]:
category_id = "FormRechonizer" 
blob_name = "TestCase1"
mode = "search" 
language = "en"
blob_Connection_String = "dummy"
blob_container_name = "dummy"
base_threshold = 10000
buffer_percent = 10
overlap_sent_count = 2


import ast
import json

with open("page_map_FormRechonizer.txt", "r") as file:
    page_map_content = file.read()

# If your file is like [(1, 0, 'text'), ...]
page_map = ast.literal_eval(page_map_content)

# Then pass it to your function
results_default = create_sections_default(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
)

# Then pass it to your function
results_hybrid = create_sections_hybrid(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
)

result_token = create_sections_token(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

result_TextTilling = create_sections_TextTilling(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

result_regex = create_sections_recursive(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

result_sliding_window = create_sections_sliding_window(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

result_recursive = create_sections_recursive(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

result_spacy= create_sections_spacy(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

#result_AI21 = create_sections_AI21(
	#category_id, blob_name, page_map, mode, language,
	#blob_Connection_String, blob_container_name,
	#base_threshold, buffer_percent, overlap_sent_count)

#result_code = create_sections_code(
	#category_id, blob_name, page_map, mode, language,
	#blob_Connection_String, blob_container_name,
	#base_threshold, buffer_percent, overlap_sent_count
#)

result_markdown= create_sections_Markdown(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)


=== 🏁 Start Processing: TestCase1 ===
🔍 Category ID: FormRechonizer
🔍 Mode: search, Language: en
🔍 Thresholds: base=10000, buffer%=10, overlap_sents=2
🔍 page_map: [(1, 0, 'Tall Tales\nA short story collection\nby Grade 10 '), (2, 48, 'Contents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Selection, by John Kim</td><td>4

NameError: name 'queue_logger' is not defined

In [None]:
print(f"************************************* Page Map *************************************\n{page_map}")


************************************* Page Map *************************************
[(1, 0, 'Tall Tales\nA short story collection\nby Grade 10 '), (2, 48, 'Contents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Selection, by John Kim</td><td>43</td></tr><tr><td>Growing Up, by William Ye.</td><td>46</td></tr><tr><td>The B

In [None]:
print(f"************************************* results_default *************************************\n{results_default}")


************************************* results_default *************************************
[{'id': 'TestCase1_1-6_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-6', 'content': 'Tall Tales\nA short story collection\nby Grade 10 Contents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Sel

In [None]:
print(f"************************************* results_hybrid *************************************\n{results_hybrid}")


************************************* results_hybrid *************************************
[{'id': 'TestCase1_1_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1', 'content': 'Tall Tales\nA short story collection\nby Grade 10 '}, {'id': 'TestCase1_2_1', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::2', 'content': 'Tall Tales\nA short story collection\nby Grade 10 Contents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaerte

In [None]:
print(f"************************************* result_token *************************************\n{result_token}")


************************************* result_token *************************************
[{'id': 'TestCase1_1-10_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-10', 'content': 'Tall Tales\nA short story collection\nby Grade 10 \nContents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Se

In [None]:
print(f"************************************* result_TextTilling *************************************\n{result_TextTilling}")


************************************* result_TextTilling *************************************
[{'id': 'TestCase1_1-10_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-10', 'content': 'Tall Tales\nA short story collection\nby Grade 10 \n\nContents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><t

In [None]:
print(f"************************************* result_regex *************************************\n{result_regex}")


************************************* result_regex *************************************
[{'id': 'TestCase1_1-10_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-10', 'content': 'Tall Tales\nA short story collection\nby Grade 10 \nContents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Se

In [None]:
print(f"************************************* result_sliding_window *************************************\n{result_sliding_window}")


************************************* result_sliding_window *************************************
[{'id': 'TestCase1_1-6_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-6', 'content': 'Tall Tales A short story collection by Grade 10 Contents <table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The 

In [None]:
print(f"************************************* result_recursive *************************************\n{result_recursive}")


************************************* result_recursive *************************************
[{'id': 'TestCase1_1-10_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-10', 'content': 'Tall Tales\nA short story collection\nby Grade 10 \nContents\n<table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>Th

In [None]:
print(f"************************************* result_spacy *************************************\n{result_spacy}")


************************************* result_spacy *************************************
[{'id': 'TestCase1_1-6_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-6', 'content': 'Tall Tales A short story collection by Grade 10 Contents <table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td> The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td> The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Selecti

In [None]:
print(f"************************************* result_AI21 *************************************\n{result_AI21}")


In [None]:
print(f"************************************* result_markdown *************************************\n{result_markdown}")


NameError: name 'result_code' is not defined