In [109]:
import os
import re

# ---------------------------------------------------------------------------
# Patterns shared by the LaTeX -> Markdown helpers below.
# ---------------------------------------------------------------------------
# A line holding nothing but a comment: LaTeX discards it together with its
# newline, so it must not turn into a blank line (i.e. a paragraph break).
_COMMENT_LINE = re.compile(r'^[ \t]*%.*\n?', flags=re.MULTILINE)
# A '%' preceded by an even number of backslashes starts a comment
# ("\\%" is a line break followed by a comment, "\%" is a literal percent).
_TRAILING_COMMENT = re.compile(r'(?<!\\)((?:\\\\)*)%.*')
_DOCUMENT_BODY = re.compile(r'\\begin\{document\}(.*?)\\end\{document\}', flags=re.DOTALL)

_INLINE_STYLES = (
    (re.compile(r'\\(?:emph|textit)\{'), '*'),
    (re.compile(r'\\textbf\{'), '**'),
    (re.compile(r'\\textsc\{'), '`'),
)
_TEXTCOLOR_OPENER = re.compile(r'\\textcolor\{[^}]*\}\{')

_CAPTION_OPENER = re.compile(r'\\caption\*?(?:\[[^\]]*\])?\{')
_LABEL = re.compile(r'\\label\{([^}]+)\}')

_FIGURE_ENV = re.compile(r'\\begin\{figure.*?\}(.*?)\\end\{figure.*?\}', flags=re.DOTALL)
_TABLE_ENV = re.compile(r'(\\begin\{table.*?\}.*?\\end\{table.*?\})', flags=re.DOTALL)
_TABULAR_OPENER = re.compile(r'\\begin\{tabular\}(?:\[[^\]]*\])?\{')
_TABLE_RULES = re.compile(
    r'\\(?:toprule|midrule|bottomrule|hline)(?![A-Za-z])'
    r'|\\(?:cmidrule|cline)(?:\([^)]*\))?\{[^}]*\}'
)
_ROW_BREAK = re.compile(r'\\\\\*?(?:\[[^\]]*\])?')
_CELL_SEPARATOR = re.compile(r'(?<!\\)&')

_EQUATION_ENV = re.compile(r'\\begin\{equation(\*?)\}(.*?)\\end\{equation\1\}', flags=re.DOTALL)

_HEADING_OPENER = re.compile(
    r'\\(section|subsection|subsubsection|paragraph|runningtitle)\*?(?:\[[^\]]*\])?\{'
)
_HEADING_LABEL = re.compile(r'\s*\\label\{([^}]+)\}')
_HEADING_LEVELS = {
    "section": 1,
    "subsection": 2,
    "subsubsection": 3,
    "paragraph": 4,
    "runningtitle": 1,
}

# Matches only list environments that contain no further list, so repeated
# substitution converts nested lists from the inside out.
_INNERMOST_LIST_ENV = re.compile(
    r'\\begin\{(itemize|enumerate)\}(?:\[[^\]]*\])?'
    r'((?:(?!\\begin\{(?:itemize|enumerate)\}).)*?)'
    r'\\end\{\1\}',
    flags=re.DOTALL,
)
_ITEM_SPLIT = re.compile(r'\\item(?![A-Za-z])')  # do not split on \itemsep etc.

_SPACING_CMD = re.compile(
    r'\\(?:[vh]space(?![A-Za-z])\*?(?:\{[^}]*\})?'
    r'|(?:bigskip|smallskip|medskip)(?![A-Za-z]))'
)
_PAGE_BREAK_CMD = re.compile(
    r'\\(?:newpage|clearpage|cleardoublepage|pagebreak|linebreak)(?![A-Za-z])(?:\[[^\]]*\])?'
)


def _find_closing_brace(text, start):
    """Return the index of the '}' closing the group opened just before `start`, or -1."""
    depth = 1
    i = start
    while i < len(text):
        ch = text[i]
        if ch == '\\':
            i += 2  # skip the escaped character so \{ and \} are not counted
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def _sub_braced(opener, render, text, trailer=None):
    """Replace each `opener` match plus its brace-balanced argument.

    `opener` must end right after the argument's opening '{'. `render` is
    called as render(opener_match, argument_text, trailer_match) and returns
    the replacement. If `trailer` is given and matches directly after the
    closing brace, that text is consumed too. Openers without a matching
    closing brace are left untouched.
    """
    pieces = []
    cursor = 0
    while True:
        found = opener.search(text, cursor)
        if found is None:
            break
        close = _find_closing_brace(text, found.end())
        if close == -1:
            pieces.append(text[cursor:found.end()])
            cursor = found.end()
            continue
        end = close + 1
        tail = trailer.match(text, end) if trailer is not None else None
        if tail is not None:
            end = tail.end()
        pieces.append(text[cursor:found.start()])
        pieces.append(render(found, text[found.end():close], tail))
        cursor = end
    pieces.append(text[cursor:])
    return ''.join(pieces)


def _convert_style(opener, mark, text):
    """Wrap the argument of every `opener` command in `mark`, handling self-nesting."""
    def render(_match, arg, _tail):
        return mark + _convert_style(opener, mark, arg).strip() + mark

    return _sub_braced(opener, render, text)


def _apply_inline_formats(text):
    r"""Convert \emph/\textit -> *..*, \textbf -> **..**, \textsc -> `..`."""
    for opener, mark in _INLINE_STYLES:
        text = _convert_style(opener, mark, text)
    return text


def _strip_comments(text):
    """Remove LaTeX comments while keeping escaped percent signs."""
    text = _COMMENT_LINE.sub('', text)
    return _TRAILING_COMMENT.sub(r'\1', text)


def _normalize_whitespace(text):
    """Collapse each paragraph onto one line; paragraphs are separated by one blank line."""
    paragraphs = (' '.join(chunk.split()) for chunk in re.split(r'\n\s*\n', text))
    return '\n\n'.join(p for p in paragraphs if p)


def _extract_captions(tex, kind):
    r"""Return the text of every \caption{...} in `tex` (nested braces supported)."""
    captions = []
    cursor = 0
    while True:
        found = _CAPTION_OPENER.search(tex, cursor)
        if found is None:
            break
        close = _find_closing_brace(tex, found.end())
        if close == -1:
            # Best effort: keep everything after the opener as the caption.
            print(f"Warning: Unbalanced braces in {kind} caption.")
            captions.append(tex[found.end():].strip())
            break
        captions.append(tex[found.end():close].strip())
        cursor = close + 1
    return captions


def _strip_captions_and_labels(tex):
    r"""Remove every \caption{...} (nested braces supported) and \label{...} from `tex`."""
    kept = []
    cursor = 0
    while True:
        found = _CAPTION_OPENER.search(tex, cursor)
        if found is None:
            break
        kept.append(tex[cursor:found.start()])
        close = _find_closing_brace(tex, found.end())
        # An unbalanced caption swallows the rest of the text.
        cursor = len(tex) if close == -1 else close + 1
    kept.append(tex[cursor:])
    return _LABEL.sub('', ''.join(kept))


def _caption_line(kind, tex):
    r"""Build '**Kind:** caption \label{..}' from a float's LaTeX source."""
    # Labels written inside the caption are listed once, after the caption.
    captions = [_LABEL.sub('', c).strip() for c in _extract_captions(tex, kind.lower())]
    caption = _apply_inline_formats(' '.join(c for c in captions if c))
    labels = [f"\\label{{{name}}}" for name in _LABEL.findall(tex)]
    return ' '.join(part for part in [f"**{kind}:**", caption, *labels] if part)


def _replace_figures(text):
    """Replace each figure environment by its caption line."""
    return _FIGURE_ENV.sub(
        lambda m: "\n\n" + _caption_line("Figure", m.group(1)) + "\n\n", text
    )


def _tabular_to_markdown(tex):
    """Convert the first tabular in `tex` to a Markdown table; return '' on failure."""
    try:
        opener = _TABULAR_OPENER.search(tex)
        if opener is None:
            return ""
        # The column spec may itself contain braces (p{3cm}, @{}), so it is
        # skipped by brace matching rather than by a regex.
        spec_end = _find_closing_brace(tex, opener.end())
        if spec_end == -1:
            return ""
        body_end = tex.find(r'\end{tabular}', spec_end)
        if body_end == -1:
            return ""
        body = _TABLE_RULES.sub('', tex[spec_end + 1:body_end])

        rows = []
        for raw_row in _ROW_BREAK.split(body):
            raw_row = raw_row.strip()
            if not raw_row:
                continue
            cells = []
            # Split on unescaped '&' only; '\&' is a literal ampersand.
            for cell in _CELL_SEPARATOR.split(raw_row):
                cell = _sub_braced(_TEXTCOLOR_OPENER, lambda _m, arg, _t: arg, cell.strip())
                cell = _apply_inline_formats(cell)
                cells.append(cell.replace(r'\&', '&'))
            rows.append(cells)
        if not rows:
            return ""

        width = len(rows[0])
        grid = [rows[0], ["---"] * width]
        # Pad short rows and truncate long ones to the header width.
        grid.extend((row + [""] * width)[:width] for row in rows[1:])
        lines = ''.join("| " + " | ".join(row) + " |\n" for row in grid)
        return "\n\n" + lines + "\n\n"
    except Exception as e:  # deliberate best effort: caller falls back to raw LaTeX
        print(f"Error during tabular conversion: {e}")
        return ""


def _replace_tables(text):
    """Replace each table environment by a caption line plus a Markdown table."""
    def render(match):
        table_tex = match.group(1)
        header = _caption_line("Table", table_tex)
        grid = _tabular_to_markdown(table_tex)
        if not grid:
            print("Warning: Table conversion failed. Retaining original LaTeX table without captions and labels.")
            cleaned = _strip_captions_and_labels(table_tex)
            return f"\n\n{header}\n\n```latex\n{cleaned}\n```\n\n"
        return f"\n\n{header}\n\n{grid}\n"

    return _TABLE_ENV.sub(render, text)


def _replace_equations(text):
    """Wrap equation / equation* bodies in $$ ... $$ with each label on its own line."""
    def render(match):
        body = re.sub(r'(\\label\{[^}]+\})\s*', r'\1\n', match.group(2).strip())
        # strip() avoids a blank line before the closing $$ when a label is last.
        return "\n\n$$\n" + body.strip() + "\n$$\n\n"

    return _EQUATION_ENV.sub(render, text)


def _replace_headings(text):
    r"""Turn sectioning commands into '#' headings, keeping a directly following \label."""
    def render(match, title, label):
        heading = "\n\n" + "#" * _HEADING_LEVELS[match.group(1)] + " " + title.strip()
        if label is not None:
            heading += "\n\\label{" + label.group(1) + "}"
        return heading + "\n\n"

    return _sub_braced(_HEADING_OPENER, render, text, trailer=_HEADING_LABEL)


def _replace_lists(text):
    """Convert itemize/enumerate environments to Markdown lists, innermost first."""
    def render(match):
        ordered = match.group(1) == 'enumerate'
        # Text before the first \item is list set-up (\itemsep, \setlength, ...).
        chunks = _ITEM_SPLIT.split(match.group(2))[1:]
        items = [_apply_inline_formats(c.strip()) for c in chunks if c.strip()]
        lines = [
            (f"{number}. " if ordered else "- ") + item + "\n"
            for number, item in enumerate(items, start=1)
        ]
        return "\n\n" + ''.join(lines) + "\n"

    replaced = 1
    while replaced:  # each pass removes at least one \begin, so this terminates
        text, replaced = _INNERMOST_LIST_ENV.subn(render, text)
    # Stray \item outside any recognised environment.
    return re.sub(r'\\item\s+', '\n- ', text)


def _remove_layout_commands(text):
    """Drop spacing commands and turn page/line breaks into paragraph breaks."""
    text = _SPACING_CMD.sub('', text)
    return _PAGE_BREAK_CMD.sub('\n\n', text)


def _final_cleanup(text):
    """Trim trailing/leading blanks, collapse space runs and blank-line runs."""
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = re.sub(r'\n\n\s+', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip() + "\n"


def _latex_to_markdown(content):
    """Run the full conversion pipeline on LaTeX source text and return Markdown-like text."""
    content = content.replace('\r\n', '\n')
    # Comments go first so a commented-out \end{document} cannot truncate the body.
    content = _strip_comments(content)
    content = content.replace(r'\begin{abstract}', r'\section*{Abstract}')
    content = content.replace(r'\end{abstract}', '')

    body = _DOCUMENT_BODY.search(content)
    if body:
        content = body.group(1)

    content = _normalize_whitespace(content)
    # Floats are converted before headings/inline styles so their captions are
    # extracted from untouched LaTeX.
    content = _replace_figures(content)
    content = _replace_tables(content)
    content = _replace_equations(content)
    content = _replace_headings(content)
    content = _replace_lists(content)
    content = _apply_inline_formats(content)
    content = _remove_layout_commands(content)
    return _final_cleanup(content)


def clean_latex_file(input_path):
    r"""
    Convert a LaTeX file into a Markdown-friendly text file written next to it.

    The text between \begin{document} and \end{document} (or the whole file if
    those are absent) is stripped of comments, and sections, figures, tables,
    equations, itemize/enumerate lists and inline emphasis are rewritten as
    Markdown. \label{...} commands of figures and tables are kept on the same
    line as the caption. If a table cannot be converted, its LaTeX source is
    emitted in a ```latex code block with captions and labels removed.

    Args:
        input_path: Path (str or os.PathLike) of the input file; must end in '.tex'.

    Returns:
        None. The result is written to the same path with the extension
        replaced by '.txt'. Read/write failures are reported on stdout and
        do not raise.

    Raises:
        ValueError: If `input_path` does not end in '.tex'.
    """
    input_path = os.fspath(input_path)
    if not input_path.endswith('.tex'):
        raise ValueError("Input file must have a .tex extension.")

    output_path = os.path.splitext(input_path)[0] + ".txt"

    try:
        with open(input_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
    except FileNotFoundError:
        print(f"Error: The file {input_path} does not exist.")
        return
    except (OSError, UnicodeError) as e:
        print(f"An error occurred: {e}")
        return

    content = _latex_to_markdown(content)

    try:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(content)
        print(f"Cleaned file has been written to: {output_path}")
    except (OSError, UnicodeError) as e:
        print(f"An error occurred while writing the output file: {e}")


In [110]:
# Path of the LaTeX source to convert; clean_latex_file raises ValueError
# unless the path ends in '.tex'.
input_file = './maintext.tex'  # Replace with your .tex file path
# Writes the converted text next to the input, with '.tex' replaced by '.txt',
# and prints the output path (or an error message) to stdout.
clean_latex_file(input_file)

Cleaned file has been written to: ./maintext.txt
