In [2]:
import os
import re

# Longest prefix of a line that is not part of a comment. A backslash always
# consumes the character after it, so "\%" is kept as text while the "%" in
# "\\%" (a line break followed by a comment) correctly starts a comment.
_UNCOMMENTED_PREFIX = re.compile(r'(?:[^\\%]|\\.?)*', re.DOTALL)

# Start of a caption argument: \caption{, \caption*{ or \caption[short]{.
_CAPTION_OPEN = re.compile(r'\\caption\*?\s*(?:\[[^\]]*\]\s*)?\{')


def _strip_latex_comment(line):
    """Return `line` cut off at its first unescaped percent sign."""
    return _UNCOMMENTED_PREFIX.match(line).group(0)


def _figure_captions(figure_text):
    r"""Return the brace-balanced argument of every \caption in `figure_text`."""
    captions = []
    search_from = 0
    while True:
        opening = _CAPTION_OPEN.search(figure_text, search_from)
        if opening is None:
            return captions

        # Walk forward until the brace opened by \caption{ is closed again.
        depth = 1
        pos = opening.end()
        while pos < len(figure_text) and depth:
            char = figure_text[pos]
            if char == '\\':
                pos += 2  # skip escaped characters such as \{ and \}
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            pos += 1

        if depth:
            # Unbalanced braces: keep whatever text follows the opening brace.
            body = figure_text[opening.end():]
        else:
            body = figure_text[opening.end():pos - 1]

        caption = body.replace(r'\%', '%').strip()
        if caption:
            captions.append(caption)
        search_from = pos


def _emit_captions(outfile, figure_parts):
    """Write one "Figure: ..." line per caption found in the buffered figure text."""
    for caption in _figure_captions(' '.join(figure_parts)):
        outfile.write(f"Figure: {caption}\n")


def clean_latex_file(input_path):
    r"""
    Write a plain-text version of the body of a LaTeX file.

    Only lines strictly between the line containing \begin{document} and the
    line containing \end{document} are considered. Comments (everything from
    an unescaped % to the end of the line) are removed *before* any marker is
    looked for, so commented-out \begin{figure}, \caption or \end{document}
    commands are ignored. Body lines are stripped, "\%" is unescaped to "%",
    and empty lines are dropped.

    figure / figure* environments are replaced by one "Figure: <caption>"
    line per \caption they contain. Captions may span several lines, contain
    nested braces, or carry an optional short form (\caption[short]{long});
    "\%" is unescaped in them as well. Any other text on the lines that open
    or close a figure is discarded, as is the rest of the figure body.

    Parameters:
    - input_path (str): Path to the input .tex file.

    Output:
    - A .txt file with the cleaned content in the same directory as the input
      file (same base name).

    Raises:
    - ValueError: if input_path does not end with ".tex".

    A missing input file and any other error raised while reading or writing
    are reported with print() rather than propagated.
    """
    if not input_path.endswith('.tex'):
        raise ValueError("Input file must have a .tex extension.")

    # Determine output path
    base, _ = os.path.splitext(input_path)
    output_path = f"{base}.txt"

    # Regular expressions
    begin_doc_pattern = re.compile(r'\\begin\{document\}')
    end_doc_pattern = re.compile(r'\\end\{document\}')
    begin_fig_pattern = re.compile(r'\\begin\{figure(?:\*?)\}')
    end_fig_pattern = re.compile(r'\\end\{figure(?:\*?)\}')

    try:
        with open(input_path, 'r', encoding='utf-8') as infile, \
             open(output_path, 'w', encoding='utf-8') as outfile:

            in_document = False
            in_figure = False
            figure_parts = []  # comment-free text of the figure being read

            for line in infile:
                # Drop the comment first so commented-out commands are inert.
                text = _strip_latex_comment(line)

                # Skip the preamble, including the \begin{document} line.
                if not in_document:
                    if begin_doc_pattern.search(text):
                        in_document = True
                    continue

                # Stop processing at \end{document}
                if end_doc_pattern.search(text):
                    break

                if not in_figure:
                    fig_start = begin_fig_pattern.search(text)
                    if fig_start is None:
                        # Ordinary body line.
                        code_line = text.replace(r'\%', '%').strip()
                        if code_line:
                            outfile.write(code_line + '\n')
                        continue
                    in_figure = True
                    figure_parts = []
                    # Keep scanning the rest of this line: the figure may
                    # carry a caption, or even end, on its opening line.
                    text = text[fig_start.end():]

                fig_end = end_fig_pattern.search(text)
                if fig_end is not None:
                    text = text[:fig_end.start()]

                piece = text.strip()
                if piece:
                    figure_parts.append(piece)

                if fig_end is not None:
                    _emit_captions(outfile, figure_parts)
                    in_figure = False
                    figure_parts = []

            # A figure left open at \end{document} or end of file still
            # contributes the captions collected so far.
            if in_figure:
                _emit_captions(outfile, figure_parts)

        print(f"Cleaned file has been written to: {output_path}")

    except FileNotFoundError:
        print(f"Error: The file {input_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [8]:
# Path of the LaTeX source to clean; replace with your own .tex file path.
input_file = './maintext.tex'
clean_latex_file(input_file)

Cleaned file has been written to: ./maintext.txt
