### Load arxiv text and clean it

In [7]:
import re
def read_latex_as_txt(filepath):
    """
    Load a LaTeX (or LaTeX-derived) text file and hand back its raw contents.

    The file is decoded as UTF-8. Failures are NOT raised: they are reported
    through the return value, so callers that need to distinguish success
    from failure must inspect the returned string.

    Args:
        filepath (str): The path to the file to read.

    Returns:
        str: The full file contents on success. If the file does not exist,
        the string ``"Error: File not found at <filepath>"``; for any other
        exception, the string ``"An error occurred: <exception>"``.
    """
    try:
        with open(filepath, mode='r', encoding='utf-8') as source:
            return source.read()
    except FileNotFoundError:
        message = f"Error: File not found at {filepath}"
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported as text too.
        message = f"An error occurred: {err}"
    return message
    
# Load the DPO paper text from the local data directory and print it in full.
# read_latex_as_txt reports failures as a returned message string rather than
# raising, so the printed value should be checked before it is cleaned.
latex_content = read_latex_as_txt('../data/arxiv/DPO.txt')
print(latex_content)

# Introduction

Large unsupervised language models (LMs) trained on very large datasets
acquire surprising
capabilities [@chowdhery2022palm; @brown2020language; @touvron2023llama; @bubeck2023sparks].
However, these models are trained on data generated by humans with a
wide variety of goals, priorities, and skillsets. Some of these goals
and skillsets may not be desirable to imitate; for example, while we may
want our AI coding assistant to *understand* common programming mistakes
in order to correct them, nevertheless, when generating code, we would
like to bias our model toward the (potentially rare) high-quality coding
ability present in its training data. Similarly, we might want our
language model to be *aware* of a common misconception believed by 50%
of people, but we certainly do not want the model to claim this
misconception to be true in 50% of queries about it! In other words,
selecting the model's *desired responses and behavior* from its very
wide *knowledge and abilities* 

In [3]:
# Clean the arXiv paper text using LLM-generated regex-filtering code

import re

def clean_latex_to_markdown(text):
    """
    Clean LaTeX formatting and convert to markdown while preserving structure.

    The rules below are regex substitutions applied in a fixed order. Order
    matters: the definition/lemma conversion needs to see the block's
    ``{#id .lemma}`` attribute, so generic ``{#...}`` attribute stripping is
    performed only after that conversion.

    Args:
        text (str): Markdown-like text that still contains LaTeX / pandoc
            markup such as ``[@key]`` citations, ``{reference-type=...}``
            attributes and ``::: ... :::`` fenced blocks.

    Returns:
        str: The cleaned text with leading/trailing whitespace stripped.
    """
    cleaned_text = text

    # 1. Remove citation brackets like [@author2020paper; @another2021paper]
    cleaned_text = re.sub(r'\[@[^\]]+\]', '', cleaned_text)

    # 2. Remove LaTeX labels
    cleaned_text = re.sub(r'\\label\{[^}]+\}', '', cleaned_text)

    # Shared tail of every cross-reference pattern, e.g.
    #   [\[eq:RL\]](#eq:RL){reference-type="ref" reference="eq:RL"}
    # The label is captured lazily and an optional backslash in front of the
    # closing "]]" is consumed outside the group, so the escaped bracket
    # (\]) does not leak a trailing backslash into the output.
    reference_tail = r'\[\\?\[([^\]]+?)\\?\]\]\([^)]*\)\{[^}]*\}'

    # 3. Clean up equation references - convert complex refs to simple format
    cleaned_text = re.sub(r'Eq\.' + reference_tail, r'Eq. (\1)', cleaned_text)

    # 4-7. Clean up section / appendix / figure / table references.
    # Appendix references drop the label entirely; the others keep it.
    reference_rewrites = (
        ('Section', r'Section \1'),
        ('Appendix', r'Appendix'),
        ('Figure', r'Figure \1'),
        ('Table', r'Table \1'),
    )
    for keyword, replacement in reference_rewrites:
        cleaned_text = re.sub(keyword + reference_tail, replacement, cleaned_text)

    # 8. Remove complex figure blocks but preserve descriptive text
    # Pattern like: ![ **optimizes for human preferences...]
    figure_pattern = r'!\[\s*\*\*([^*]+)\*\*([^\]]*)\]'
    cleaned_text = re.sub(figure_pattern, r'**Figure: \1**\2', cleaned_text)

    # 9. Clean up LaTeX text formatting commands (plain, italic, bold)
    text_commands = (
        ('text', r'\1'),
        ('textrm', r'\1'),
        ('textit', r'*\1*'),
        ('textbf', r'**\1**'),
        ('emph', r'*\1*'),
    )
    for command, replacement in text_commands:
        cleaned_text = re.sub(r'\\' + command + r'\{([^}]+)\}', replacement, cleaned_text)

    # 10. Replace \mid with "|" unless a "$$" follows before any other "$"
    # (heuristic for "not inside a display-math block").
    cleaned_text = re.sub(r'\\mid(?![^$]*\$\$)', '|', cleaned_text)

    # 11. Clean up reference artifacts like {reference-type="ref" reference="..."}
    cleaned_text = re.sub(r'\{[^}]*reference-type[^}]*\}', '', cleaned_text)

    # 12. Clean up footnote markers like [^2], [^3] etc.
    cleaned_text = re.sub(r'\[\^[0-9]+\]', '', cleaned_text)

    # 13. Remove \begin{...} and \end{...} for non-math environments
    cleaned_text = re.sub(r'\\begin\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)
    cleaned_text = re.sub(r'\\end\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)

    # 14. Convert definition and lemma blocks (::: ... :::) to markdown.
    # Must run BEFORE step 15: the lemma pattern matches on "{#id .lemma}".
    # The block body may not contain ":" (it is matched with [^:]+).
    cleaned_text = re.sub(r':::\s*definition\s*\n\*\*Definition\s+(\d+)\*\*\.([^:]+):::',
                          r'**Definition \1:** \2', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r':::\s*\{#[^}]+\s+\.lemma\}\s*\n\*\*Lemma\s+(\d+)\*\*\.([^:]+):::',
                          r'**Lemma \1:** \2', cleaned_text, flags=re.DOTALL)

    # 15. Remove remaining {#...} attribute artifacts (e.g. on section headers)
    cleaned_text = re.sub(r'\{#[^}]+\}', '', cleaned_text)

    # 16. Remove remaining LaTeX artifacts.
    # NOTE: the command regex strips every backslash-word, including ones
    # that appear inside math.
    cleaned_text = re.sub(r'\\[a-zA-Z]+\*?', '', cleaned_text)  # Remove LaTeX commands
    cleaned_text = re.sub(r'\{[^}]*\}(?![^$]*\$\$)', '', cleaned_text)  # Remove remaining braces outside math

    # 17. Clean up multiple spaces and empty lines
    cleaned_text = re.sub(r' +', ' ', cleaned_text)  # Multiple spaces to single
    cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)  # Multiple newlines to double

    # 18. Drop markdown header lines that consist only of '#' characters
    cleaned_text = re.sub(r'^#+\s*$', '', cleaned_text, flags=re.MULTILINE)

    return cleaned_text.strip()

# Clean the loaded paper text and print the result for visual inspection
cleaned_paper = clean_latex_to_markdown(latex_content)
print(cleaned_paper)




numbers, compressnatbib











 
    
       
            
       
       
       
      
         










theoremTheorem
corollaryCorollary
propositionProposition
lemmaLemma
definitionDefinition








 


















Direct Preference Optimization:
 Your Language Model is Secretly a Reward Model
    
  Rafael RafailovEqual contribution; more junior authors listed earlier.  [2] Archit Sharma[1]  [2] Eric Mitchell[1]  [2] 

  Stefano Ermon[2]  [3] Christopher D. Manning[2] Chelsea Finn[2] 

  [2]  Stanford University [3]  CZ Biohub 

  

    June 19, 2025




While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training.
Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with re

In [13]:
def clean_latex_to_markdown(text):
    """
    Clean LaTeX formatting and convert to markdown while preserving structure.

    The rules below are regex substitutions applied in a fixed order. After
    markup removal, single newlines inside a paragraph are replaced by spaces
    while blank-line paragraph breaks are kept.

    Args:
        text (str): Markdown-like text that still contains LaTeX / pandoc
            markup such as ``[@key]`` citations, ``{reference-type=...}``
            attributes and ``::: ... :::`` fenced blocks.

    Returns:
        str: The cleaned text with leading/trailing whitespace stripped.
    """
    cleaned_text = text

    # 1. Remove citation brackets like [@author2020paper; @another2021paper]
    cleaned_text = re.sub(r'\[@[^\]]+\]', '', cleaned_text)

    # 2. Remove LaTeX labels
    cleaned_text = re.sub(r'\\label\{[^}]+\}', '', cleaned_text)

    # Shared tail of every cross-reference pattern, e.g.
    #   [\[eq:RL\]](#eq:RL){reference-type="ref" reference="eq:RL"}
    # The label is captured lazily and an optional backslash in front of the
    # closing "]]" is consumed outside the group, so the escaped bracket
    # (\]) does not leak a trailing backslash into the output.
    reference_tail = r'\[\\?\[([^\]]+?)\\?\]\]\([^)]*\)\{[^}]*\}'

    # 3. Clean up equation references - convert complex refs to simple format
    cleaned_text = re.sub(r'Eq\.' + reference_tail, r'Eq. (\1)', cleaned_text)

    # 4-7. Clean up section / appendix / figure / table references.
    # Appendix references drop the label entirely; the others keep it.
    reference_rewrites = (
        ('Section', r'Section \1'),
        ('Appendix', r'Appendix'),
        ('Figure', r'Figure \1'),
        ('Table', r'Table \1'),
    )
    for keyword, replacement in reference_rewrites:
        cleaned_text = re.sub(keyword + reference_tail, replacement, cleaned_text)

    # 8. Remove complex figure blocks but preserve descriptive text
    # Pattern like: ![ **optimizes for human preferences...]
    figure_pattern = r'!\[\s*\*\*([^*]+)\*\*([^\]]*)\]'
    cleaned_text = re.sub(figure_pattern, r'**Figure: \1**\2', cleaned_text)

    # 9. Clean up LaTeX text formatting commands (plain, italic, bold)
    text_commands = (
        ('text', r'\1'),
        ('textrm', r'\1'),
        ('textit', r'*\1*'),
        ('textbf', r'**\1**'),
        ('emph', r'*\1*'),
    )
    for command, replacement in text_commands:
        cleaned_text = re.sub(r'\\' + command + r'\{([^}]+)\}', replacement, cleaned_text)

    # 10. Replace \mid with "|" unless a "$$" follows before any other "$"
    # (heuristic for "not inside a display-math block").
    cleaned_text = re.sub(r'\\mid(?![^$]*\$\$)', '|', cleaned_text)

    # 11. Clean up reference artifacts like {reference-type="ref" reference="..."}
    cleaned_text = re.sub(r'\{[^}]*reference-type[^}]*\}', '', cleaned_text)

    # 12. Clean up footnote markers like [^2], [^3] etc.
    cleaned_text = re.sub(r'\[\^[0-9]+\]', '', cleaned_text)

    # 13. Remove \begin{...} and \end{...} for non-math environments
    cleaned_text = re.sub(r'\\begin\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)
    cleaned_text = re.sub(r'\\end\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)

    # 14. Convert definition and lemma blocks (::: ... :::) to markdown.
    # The block body may not contain ":" (it is matched with [^:]+).
    cleaned_text = re.sub(r':::\s*definition\s*\n\*\*Definition\s+(\d+)\*\*\.([^:]+):::',
                          r'**Definition \1:** \2', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r':::\s*\{#[^}]+\s+\.lemma\}\s*\n\*\*Lemma\s+(\d+)\*\*\.([^:]+):::',
                          r'**Lemma \1:** \2', cleaned_text, flags=re.DOTALL)

    # 15. Convert theorem blocks the same way
    cleaned_text = re.sub(r':::\s*\{#[^}]+\s+\.theorem\}\s*\n\*\*Theorem\s+(\d+)\*\*\.([^:]+):::',
                          r'**Theorem \1:** \2', cleaned_text, flags=re.DOTALL)

    # 16. Remove pandoc header attributes like `{#...}` at the end of a line.
    # Only horizontal whitespace is consumed around the attribute: using \s
    # here would also swallow a newline of the blank line after a heading,
    # and step 18 would then merge the heading into the next paragraph.
    cleaned_text = re.sub(r'[ \t]*\{#[^}]+\}[ \t]*$', '', cleaned_text, flags=re.MULTILINE)

    # 17. Remove remaining LaTeX artifacts.
    # NOTE: the command regex strips every backslash-word, including ones
    # that appear inside math.
    cleaned_text = re.sub(r'\\[a-zA-Z]+\*?', '', cleaned_text)  # Remove LaTeX commands
    cleaned_text = re.sub(r'\{[^}]*\}(?![^$]*\$\$)', '', cleaned_text)  # Remove remaining braces outside math

    # 18. Join paragraphs by replacing single newlines with spaces, but
    # preserving paragraph breaks (double newlines).
    cleaned_text = re.sub(r'(?<!\n)\n(?!\n)', ' ', cleaned_text)

    # 19. Clean up multiple spaces and empty lines
    cleaned_text = re.sub(r' +', ' ', cleaned_text)  # Multiple spaces to single
    cleaned_text = re.sub(r'\n\s*\n+', '\n\n', cleaned_text)  # Multiple newlines to double

    # 20. Drop markdown header lines that consist only of '#' characters
    cleaned_text = re.sub(r'^#+\s*$', '', cleaned_text, flags=re.MULTILINE)

    return cleaned_text.strip()

import textwrap

def print_wrapped(text, width=100):
    """
    Print `text` re-flowed so that no output line is longer than `width`.

    The text is split on blank lines (two consecutive newlines); each piece
    is wrapped on its own and followed by one empty line, so paragraph
    breaks survive the re-flow.
    """
    wrapper = textwrap.TextWrapper(width=width)
    for block in text.split('\n\n'):
        # One trailing newline ends the paragraph, the second is the spacer line.
        print(wrapper.fill(block), end='\n\n')
        
# Clean the loaded paper text with the cleaner defined in this cell.
cleaned_paper = clean_latex_to_markdown(latex_content)
# Keep only the text that precedes the '# Acknowledgements' heading.
cleaned_paper = cleaned_paper.split('# Acknowledgements')[0].strip()
print(cleaned_paper)

# Introduction

Large unsupervised language models (LMs) trained on very large datasets acquire surprising capabilities . However, these models are trained on data generated by humans with a wide variety of goals, priorities, and skillsets. Some of these goals and skillsets may not be desirable to imitate; for example, while we may want our AI coding assistant to *understand* common programming mistakes in order to correct them, nevertheless, when generating code, we would like to bias our model toward the (potentially rare) high-quality coding ability present in its training data. Similarly, we might want our language model to be *aware* of a common misconception believed by 50% of people, but we certainly do not want the model to claim this misconception to be true in 50% of queries about it! In other words, selecting the model's *desired responses and behavior* from its very wide *knowledge and abilities* is crucial to building AI systems that are safe, performant, and controllable 

Let's improve this.

In [26]:
import re
import textwrap

def remove_special_blocks(text):
    """
    Strip markdown image links of the form ``![alt](target)`` from `text`.

    Matching is non-greedy and does not cross line breaks, so an image whose
    alt text or target spans several lines is left untouched. Stripping of
    non-math ``\\begin{...}`` / ``\\end{...}`` markers is intentionally not
    performed here.
    """
    image_link = re.compile(r'!\[.*?\]\(.*?\)')
    return image_link.sub('', text)

def clean_inline_formatting(text):
    """
    Clean inline LaTeX commands and pandoc artifacts from `text`.

    Handles, in order: ``[@key]`` citations, ``\\label{...}``, cross
    references (Eq./Section/Figure/Table/Appendix), text-formatting commands,
    leftover attribute blocks, footnote markers, and empty braces/brackets.
    Line breaks are preserved except where a citation is removed together
    with one adjacent newline.

    Args:
        text (str): Markdown-like text containing inline LaTeX/pandoc markup.

    Returns:
        str: The text with the markup above removed or converted to markdown.
        Whitespace is not normalized here.
    """
    cleaned_text = text

    # Remove citation brackets like [@author2020paper; @another2021paper].
    # A citation alone on its own line collapses to a single newline so the
    # words on the neighbouring lines are not glued together.
    cleaned_text = re.sub(r'\n\[@[^\]]+\]\n', '\n', cleaned_text)
    # A citation at the start of a line is pulled up together with its
    # preceding newline (so following punctuation attaches to the prior word).
    cleaned_text = re.sub(r'\n\[@[^\]]+\]', '', cleaned_text)
    cleaned_text = re.sub(r'\[@[^\]]+\]\n', '', cleaned_text)
    cleaned_text = re.sub(r'\[@[^\]]+\]', '', cleaned_text)

    # Remove LaTeX labels
    cleaned_text = re.sub(r'\\label\{[^}]+\}', '', cleaned_text)

    # Shared tail of every cross-reference pattern, e.g.
    #   [\[eq:RL\]](#eq:RL){reference-type="ref" reference="eq:RL"}
    # The label is captured lazily and an optional backslash in front of the
    # closing "]]" is consumed outside the group, so the escaped bracket
    # (\]) does not leak a trailing backslash into the output.
    reference_tail = r'\[\\?\[([^\]]+?)\\?\]\]\([^)]*\)\{[^}]*\}'

    # Clean up equation references
    cleaned_text = re.sub(r'Eq\.' + reference_tail, r'Eq. (\1)', cleaned_text)

    # Clean up section, figure, and table references (label is kept)
    for ref_type in ['Section', 'Figure', 'Table']:
        cleaned_text = re.sub(ref_type + reference_tail, ref_type + r' \1', cleaned_text)

    # Clean up appendix references (label is dropped)
    cleaned_text = re.sub(r'Appendix' + reference_tail, r'Appendix', cleaned_text)

    # Clean up LaTeX text formatting commands (plain, italic, bold)
    text_commands = (
        ('text', r'\1'),
        ('textrm', r'\1'),
        ('textit', r'*\1*'),
        ('textbf', r'**\1**'),
        ('emph', r'*\1*'),
    )
    for command, replacement in text_commands:
        cleaned_text = re.sub(r'\\' + command + r'\{([^}]+)\}', replacement, cleaned_text)

    # Clean up various artifacts
    cleaned_text = re.sub(r'\{[^}]*reference-type[^}]*\}', '', cleaned_text)
    cleaned_text = re.sub(r'\[\^[0-9]+\]', '', cleaned_text)  # Footnotes
    # Pandoc header attributes at end of line. Only horizontal whitespace is
    # consumed: \s would also eat a newline of the blank line after a
    # heading, removing the paragraph break between heading and body.
    cleaned_text = re.sub(r'[ \t]*\{#[^}]+\}[ \t]*$', '', cleaned_text, flags=re.MULTILINE)

    # Remove empty braces, image width attributes, and empty brackets.
    # Remaining LaTeX commands are deliberately left in place.
    cleaned_text = re.sub(r'\{\s*\}', '', cleaned_text)
    cleaned_text = re.sub(r'\{width=[^}]*\}', '', cleaned_text)
    cleaned_text = re.sub(r'\[\s*\]', '', cleaned_text)

    return cleaned_text

def normalize_whitespace(text):
    """
    Collapse redundant whitespace and tidy paragraph breaks.

    Applied in order: runs of spaces become one space; a newline followed by
    any whitespace and at least one more newline becomes exactly one blank
    line; header lines made only of '#' characters are emptied. The result
    is stripped of leading and trailing whitespace.
    """
    substitutions = (
        (re.compile(r' +'), ' '),
        (re.compile(r'\n\s*\n+'), '\n\n'),
        (re.compile(r'^#+\s*$', re.MULTILINE), ''),
    )
    result = text
    for pattern, replacement in substitutions:
        result = pattern.sub(replacement, result)
    return result.strip()

def clean_latex_to_markdown(text):
    """
    Clean LaTeX formatting and convert to markdown while preserving structure.

    Runs three stages in order: remove_special_blocks (image links),
    clean_inline_formatting (citations, references, inline commands) and
    normalize_whitespace (spaces, blank lines, empty headers).

    Args:
        text (str): The raw paper text to clean.

    Returns:
        str: The cleaned text. Line breaks inside paragraphs are not joined
        by this function.
    """
    # A newline-joining substitution used to be computed here, but its result
    # was overwritten by the next statement and never used; the dead
    # assignment is removed so the code matches what actually runs.
    cleaned_text = remove_special_blocks(text)
    cleaned_text = clean_inline_formatting(cleaned_text)
    cleaned_text = normalize_whitespace(cleaned_text)
    return cleaned_text
        
# Clean the loaded paper text with the staged cleaner defined in this cell.
cleaned_paper = clean_latex_to_markdown(latex_content)
# Remove acknowledgements and everything after
cleaned_paper = cleaned_paper.split('# Acknowledgements')[0].strip()
# Print with line wrapping; print_wrapped is defined in an earlier cell.
print_wrapped(cleaned_paper)

# Introduction

Large unsupervised language models (LMs) trained on very large datasets acquire surprising
capabilities . However, these models are trained on data generated by humans with a wide variety of
goals, priorities, and skillsets. Some of these goals and skillsets may not be desirable to imitate;
for example, while we may want our AI coding assistant to *understand* common programming mistakes
in order to correct them, nevertheless, when generating code, we would like to bias our model toward
the (potentially rare) high-quality coding ability present in its training data. Similarly, we might
want our language model to be *aware* of a common misconception believed by 50% of people, but we
certainly do not want the model to claim this misconception to be true in 50% of queries about it!
In other words, selecting the model's *desired responses and behavior* from its very wide *knowledge
and abilities* is crucial to building AI systems that are safe, performant, and controllable 

### Final version of the clean function used in experiments

In [None]:
def clean_latex_to_markdown(text):
    """
    Clean LaTeX formatting and convert to markdown while preserving structure.

    The rules below are regex substitutions applied in a fixed order. Order
    matters: the definition/lemma conversion needs to see the block's
    ``{#id .lemma}`` attribute, so generic ``{#...}`` attribute stripping is
    performed only after that conversion. Everything from the first
    ``# Acknowledgements`` heading onward is discarded.

    Args:
        text (str): Markdown-like text that still contains LaTeX / pandoc
            markup such as ``[@key]`` citations, ``{reference-type=...}``
            attributes and ``::: ... :::`` fenced blocks.

    Returns:
        str: The cleaned text preceding ``# Acknowledgements`` (or all of it
        if that heading is absent), stripped of surrounding whitespace.
    """
    cleaned_text = text

    # 1. Remove citation brackets like [@author2020paper; @another2021paper]
    cleaned_text = re.sub(r'\[@[^\]]+\]', '', cleaned_text)

    # 2. Remove LaTeX labels
    cleaned_text = re.sub(r'\\label\{[^}]+\}', '', cleaned_text)

    # Shared tail of every cross-reference pattern, e.g.
    #   [\[eq:RL\]](#eq:RL){reference-type="ref" reference="eq:RL"}
    # The label is captured lazily and an optional backslash in front of the
    # closing "]]" is consumed outside the group, so the escaped bracket
    # (\]) does not leak a trailing backslash into the output.
    reference_tail = r'\[\\?\[([^\]]+?)\\?\]\]\([^)]*\)\{[^}]*\}'

    # 3. Clean up equation references - convert complex refs to simple format
    cleaned_text = re.sub(r'Eq\.' + reference_tail, r'Eq. (\1)', cleaned_text)

    # 4-7. Clean up section / appendix / figure / table references.
    # Appendix references drop the label entirely; the others keep it.
    reference_rewrites = (
        ('Section', r'Section \1'),
        ('Appendix', r'Appendix'),
        ('Figure', r'Figure \1'),
        ('Table', r'Table \1'),
    )
    for keyword, replacement in reference_rewrites:
        cleaned_text = re.sub(keyword + reference_tail, replacement, cleaned_text)

    # 8. Remove complex figure blocks but preserve descriptive text
    # Pattern like: ![ **optimizes for human preferences...]
    figure_pattern = r'!\[\s*\*\*([^*]+)\*\*([^\]]*)\]'
    cleaned_text = re.sub(figure_pattern, r'**Figure: \1**\2', cleaned_text)

    # 9. Clean up LaTeX text formatting commands (plain, italic, bold)
    text_commands = (
        ('text', r'\1'),
        ('textrm', r'\1'),
        ('textit', r'*\1*'),
        ('textbf', r'**\1**'),
        ('emph', r'*\1*'),
    )
    for command, replacement in text_commands:
        cleaned_text = re.sub(r'\\' + command + r'\{([^}]+)\}', replacement, cleaned_text)

    # 10. Replace \mid with "|" unless a "$$" follows before any other "$"
    # (heuristic for "not inside a display-math block").
    cleaned_text = re.sub(r'\\mid(?![^$]*\$\$)', '|', cleaned_text)

    # 11. Clean up reference artifacts like {reference-type="ref" reference="..."}
    cleaned_text = re.sub(r'\{[^}]*reference-type[^}]*\}', '', cleaned_text)

    # 12. Clean up footnote markers like [^2], [^3] etc.
    cleaned_text = re.sub(r'\[\^[0-9]+\]', '', cleaned_text)

    # 13. Remove \begin{...} and \end{...} for non-math environments
    cleaned_text = re.sub(r'\\begin\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)
    cleaned_text = re.sub(r'\\end\{(?!equation|align|gather)[^}]+\}', '', cleaned_text)

    # 14. Convert definition and lemma blocks (::: ... :::) to markdown.
    # Must run BEFORE step 15: the lemma pattern matches on "{#id .lemma}".
    # The block body may not contain ":" (it is matched with [^:]+).
    cleaned_text = re.sub(r':::\s*definition\s*\n\*\*Definition\s+(\d+)\*\*\.([^:]+):::',
                          r'**Definition \1:** \2', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r':::\s*\{#[^}]+\s+\.lemma\}\s*\n\*\*Lemma\s+(\d+)\*\*\.([^:]+):::',
                          r'**Lemma \1:** \2', cleaned_text, flags=re.DOTALL)

    # 15. Remove remaining {#...} attribute artifacts (e.g. on section headers)
    cleaned_text = re.sub(r'\{#[^}]+\}', '', cleaned_text)

    # 16. Remove remaining LaTeX artifacts.
    # NOTE: the command regex strips every backslash-word, including ones
    # that appear inside math.
    cleaned_text = re.sub(r'\\[a-zA-Z]+\*?', '', cleaned_text)  # Remove LaTeX commands
    cleaned_text = re.sub(r'\{[^}]*\}(?![^$]*\$\$)', '', cleaned_text)  # Remove remaining braces outside math

    # 17. Clean up multiple spaces and empty lines
    cleaned_text = re.sub(r' +', ' ', cleaned_text)  # Multiple spaces to single
    cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)  # Multiple newlines to double

    # 18. Drop markdown header lines that consist only of '#' characters
    cleaned_text = re.sub(r'^#+\s*$', '', cleaned_text, flags=re.MULTILINE)

    # Keep only the text that precedes the acknowledgements heading.
    return cleaned_text.strip().split('# Acknowledgements')[0].strip()
