In [1]:
import os

In [2]:
# Script to split text by pages
def split_text_by_pages(file_name, output_directory, marker="--- Page"):
    """
    Splits a text file into chunks based on the page marker and saves them in a specified directory.

    Each chunk is written to ``page_<i>.txt`` (numbered from 1) inside
    ``output_directory``. Chunks that contain only whitespace are skipped.
    Text that appears before the first marker is kept as its own chunk,
    without a marker prepended, because it does not belong to any page.

    :param file_name: Name of the text file to process (read as UTF-8).
    :param output_directory: Directory where the chunks will be saved; created if missing.
    :param marker: Marker to separate pages. Must be a non-empty string.
    :return: List of text chunks by page, in file order.
    :raises ValueError: If ``marker`` is empty.
    """
    if not marker:
        # str.split("") would raise ValueError anyway; fail with a clearer message.
        raise ValueError("marker must be a non-empty string")

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    with open(file_name, 'r', encoding='utf-8') as file:
        text = file.read()

    # The first segment is whatever precedes the first marker (possibly empty);
    # every following segment is the text that came right after a marker.
    preamble, *bodies = text.split(marker)

    pages = []
    if preamble.strip():
        pages.append(preamble.strip())

    # Re-attach the marker that split() consumed. Only trailing whitespace is
    # trimmed: stripping the front would glue the marker to the page header
    # (e.g. "--- Page 1 ---" would become "--- Page1 ---").
    pages.extend(marker + body.rstrip() for body in bodies if body.strip())

    # Save each page to the output directory
    for i, page in enumerate(pages, start=1):
        output_path = os.path.join(output_directory, f'page_{i}.txt')
        with open(output_path, 'w', encoding='utf-8') as output:
            output.write(page)

    return pages

# Run the splitter on the extracted text and write one file per page.
file_name = 'data/extracted_text.txt'  # Input text containing page markers
output_directory = 'chunks'  # Destination folder for the per-page files
pages = split_text_by_pages(
    file_name=file_name,
    output_directory=output_directory,
)

# Report how many page chunks were produced and where they were written.
summary = f"Total pages processed and saved in '{output_directory}': {len(pages)}"
print(summary)


Total pages processed and saved in 'chunks': 14
