In [1]:
import re 

# Read the DPO paper's LaTeX source from the data/arxiv directory.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (the cleaned output is likewise written as UTF-8 further down).
with open('../data/arxiv/DPO.tex', 'r', encoding='utf-8') as f:
    main_tex = f.read()

In [2]:

def _extract_braced_argument(latex: str, command: str) -> str:
    r"""
    Returns the text inside the first ``\<command>{...}`` group of ``latex``.

    Braces are matched by depth, so nested groups such as
    ``\title{A \textbf{B} C}`` are returned whole instead of being cut at the
    first closing brace. A backslash escapes the following character, so
    ``\{``, ``\}`` and ``\\`` do not affect the depth count.

    Returns an empty string when the command is absent. If the braces never
    balance, falls back to the text up to the first ``}`` (the simple
    non-greedy match), or an empty string if there is no ``}`` at all.
    """
    opening = re.search(r'\\' + re.escape(command) + r'\{', latex)
    if opening is None:
        return ""

    depth = 1
    pos = opening.end()
    while pos < len(latex):
        char = latex[pos]
        if char == '\\':
            # Skip the backslash and the character it escapes.
            pos += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return latex[opening.end():pos]
        pos += 1

    # Unbalanced braces: keep the simple "up to the first }" behaviour.
    fallback = re.search(r'\\' + re.escape(command) + r'\{(.*?)\}', latex, re.DOTALL)
    return fallback.group(1) if fallback else ""


def clean_latex(latex_content: str) -> str:
    r"""
    Reduces a LaTeX document to its title, abstract, and main body.

    The result is itself LaTeX-like text of the form
    ``\title{...}``, ``\begin{abstract}...\end{abstract}``, the body, and a
    closing ``\end{document}``. LaTeX commands inside the kept parts are left
    untouched.

    Args:
        latex_content: Full LaTeX source of the paper.

    Returns:
        The reassembled text, with runs of three or more newlines collapsed to
        a blank line and surrounding whitespace stripped. Missing parts
        (no title, no abstract, no ``\section`` after ``\begin{document}``)
        are emitted as empty strings rather than raising.
    """
    # 1. Extract title (brace-balanced, so nested groups in the title survive),
    #    then turn LaTeX line breaks (\\) into single spaces.
    title = _extract_braced_argument(latex_content, 'title')
    title = re.sub(r'\\\\\s*', ' ', title).strip()

    # 2. Extract abstract
    abstract_match = re.search(r'\\begin\{abstract\}(.*?)\\end\{abstract\}', latex_content, re.DOTALL)
    abstract = abstract_match.group(1).strip() if abstract_match else ""

    # 3. Find start of main body (first section after document environment)
    body_start_index = -1
    doc_start_match = re.search(r'\\begin\{document\}', latex_content)
    if doc_start_match:
        # Find the first section after \begin{document}
        section_match = re.search(r'\\section', latex_content[doc_start_match.end():])
        if section_match:
            body_start_index = doc_start_match.end() + section_match.start()

    if body_start_index == -1:
        # Fallback if structure is unexpected: keep title/abstract, empty body
        body = ""
    else:
        body = latex_content[body_start_index:]

    # 4. Find end of main body (before references, appendix, etc.):
    #    cut at whichever end marker appears first.
    end_markers = [
        r'\\begin\{thebibliography\}', r'\\bibliography', r'\\appendix',
        r'\\section\*?\{Acknowledgements\}', r'\\section\*?\{Author Contributions\}'
    ]
    end_index = len(body)
    for marker_regex in end_markers:
        end_match = re.search(marker_regex, body)
        if end_match:
            end_index = min(end_index, end_match.start())

    body = body[:end_index]

    # Combine the parts we want to keep
    full_text = f'\\title{{{title}}}\n\n\\begin{{abstract}}\n{abstract}\n\\end{{abstract}}\n\n{body}\n\\end{{document}}'

    # Remove excessive new lines
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)

    return full_text.strip()

In [3]:
# Run the cleaner on the LaTeX source held in `main_tex` and print the result.
cleaned_text = clean_latex(main_tex)
print(cleaned_text)

# Save the cleaned text
# (written explicitly as UTF-8; later cells open this same path with encoding='utf-8')
with open('../data/arxiv/cleaned_DPO.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_text)

\title{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}

\begin{abstract}
While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training.
Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with reinforcement learning from human feedback (RLHF).
However, RLHF is a complex and often unstable procedure, first fitting a reward model that reflects the human preferences, and then fine-tuning the large unsupervised LM using reinforcement learning to maximize this estimated reward without drifting too far from the original model.
\rev{In this paper, we leverage a mapping between reward functions and optimal policies to show that this constrained reward maximization proble

### Can we clean it more?

In [65]:
import re

def fix_newlines_final(text: str) -> str:
    """
    Joins words that were split across lines, leaving everything else alone.

    A candidate is any pair of non-whitespace chunks separated by whitespace
    that contains at least one newline. The pair is merged (separator replaced
    by a single space) only when BOTH chunks consist purely of letters; chunks
    containing a backslash, punctuation, digits, '&', etc. are left exactly as
    they were. Passes are repeated until the text stops changing, because one
    pass cannot join three or more consecutive single-word lines.
    """
    candidate = re.compile(r'(\S+)([ \t]*\n[ \t\n]*)(\S+)')

    def _join_if_plain_words(m: re.Match) -> str:
        """Return the joined pair for two alphabetic chunks, else the match unchanged."""
        left, right = m.group(1), m.group(3)
        if not (left.isalpha() and right.isalpha()):
            # At least one side is not a plain word: keep the original text.
            return m.group(0)
        return left + ' ' + right

    # Re-apply the substitution until a pass produces no change.
    while True:
        updated = candidate.sub(_join_if_plain_words, text)
        if updated == text:
            return updated
        text = updated

# --- Final, Comprehensive Test Cases ---

# Raw string on purpose: the LaTeX backslashes must reach the function verbatim.
# In a plain string, `\\` would collapse to a single backslash (so the table rows
# would not end in a real LaTeX line break) and `\s` / `\m` are invalid escapes.
text_to_fix = r"""
# 1. Simple case (SHOULD BE FIXED)
This is a sentence
that should be joined.

# 2. Word with backslash (SHOULD BE SKIPPED)
The command \section
is followed by \subsection.

# 3. LaTeX table (SHOULD BE SKIPPED)
\midrule
DPO & 0.36 & 0.31 \\
PPO & 0.26 & 0.23 \\
\midrule

# 4. Multi-line paragraph (SHOULD BE FULLY FIXED)
This is another
multi
line paragraph.
"""

# Apply the final, correct function
fixed_text = fix_newlines_final(text_to_fix)

# Check each case explicitly instead of relying on eyeballing the printout.
assert "This is a sentence that should be joined." in fixed_text
assert r"The command \section" + "\n" + r"is followed by \subsection." in fixed_text
assert (
    r"\midrule" + "\n"
    + r"DPO & 0.36 & 0.31 \\" + "\n"
    + r"PPO & 0.26 & 0.23 \\" + "\n"
    + r"\midrule"
) in fixed_text
assert "This is another multi line paragraph." in fixed_text

print("--- Original Text ---")
print(text_to_fix)
print("\n--- After Final Correct Fix ---")
print(fixed_text)

--- Original Text ---

# 1. Simple case (SHOULD BE FIXED)
This is a sentence
that should be joined.

# 2. Word with backslash (SHOULD BE SKIPPED)
The command \section
is followed by \subsection.

# 3. LaTeX table (SHOULD BE SKIPPED)
\midrule
DPO & 0.36 & 0.31 \
PPO & 0.26 & 0.23 \
\midrule

# 4. Multi-line paragraph (SHOULD BE FULLY FIXED)
This is another
multi
line paragraph.


--- After Final Correct Fix ---

# 1. Simple case (SHOULD BE FIXED)
This is a sentence that should be joined.

# 2. Word with backslash (SHOULD BE SKIPPED)
The command \section
is followed by \subsection.

# 3. LaTeX table (SHOULD BE SKIPPED)
\midrule
DPO & 0.36 & 0.31 \
PPO & 0.26 & 0.23 \
\midrule

# 4. Multi-line paragraph (SHOULD BE FULLY FIXED)
This is another multi line paragraph.



In [41]:
# Load the cleaned DPO text from disk (decoded as UTF-8).
with open('../data/arxiv/cleaned_DPO.txt', 'r', encoding='utf-8') as f:
    arxiv_paper = f.read()
# Second name for the same string; cells that rebind `text` leave `arxiv_paper` as loaded.
text = arxiv_paper

In [44]:
import re

# Step 1: resolve \rev{..}{..} markup by keeping only the second argument
# (only matches when neither argument contains braces).
text = re.sub(r'\\rev\{[^{}]*\}\{([^{}]*)\}', r'\1', text)

# Step 2: replace every whitespace run (spaces, newlines, blank lines) between two
# ASCII-letter words with a single space, re-applying until a pass changes nothing.
pattern = r'\b([a-zA-Z]+)\b\s+\b([a-zA-Z]+)\b'
new = re.sub(pattern, r'\1 \2', text)
while new != text:
    text = new
    new = re.sub(pattern, r'\1 \2', text)
print(text)

\title{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}

\begin{abstract}
While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training.
Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with reinforcement learning from human feedback (RLHF).
However, RLHF is a complex and often unstable procedure, first fitting a reward model that reflects the human preferences, and then fine-tuning the large unsupervised LM using reinforcement learning to maximize this estimated reward without drifting too far from the original model.
In this paper we introduce a new parameterization of the reward model in RLHF that enables extraction of the corresponding optimal policy in clo

### Let's clean CoT

In [63]:
import re 

# Read the CoT paper's LaTeX source from the data/arxiv directory.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open('../data/arxiv/CoT.tex', 'r', encoding='utf-8') as f:
    main_tex = f.read()

In [None]:


def fix_newlines_final(text: str) -> str:
    """
    Joins words that were split across lines, leaving everything else alone.

    A candidate is any pair of non-whitespace chunks separated by whitespace
    that contains at least one newline. The pair is merged (separator replaced
    by a single space) only when BOTH chunks consist purely of letters; chunks
    containing a backslash, punctuation, digits, '&', etc. are left exactly as
    they were. Passes are repeated until the text stops changing, because one
    pass cannot join three or more consecutive single-word lines.
    """
    candidate = re.compile(r'(\S+)([ \t]*\n[ \t\n]*)(\S+)')

    def _join_if_plain_words(m: re.Match) -> str:
        """Return the joined pair for two alphabetic chunks, else the match unchanged."""
        left, right = m.group(1), m.group(3)
        if not (left.isalpha() and right.isalpha()):
            # At least one side is not a plain word: keep the original text.
            return m.group(0)
        return left + ' ' + right

    # Re-apply the substitution until a pass produces no change.
    while True:
        updated = candidate.sub(_join_if_plain_words, text)
        if updated == text:
            return updated
        text = updated

def _extract_braced_argument(latex: str, command: str) -> str:
    r"""
    Returns the text inside the first ``\<command>{...}`` group of ``latex``.

    Braces are matched by depth, so nested groups such as
    ``\title{A \textbf{B} C}`` are returned whole instead of being cut at the
    first closing brace. A backslash escapes the following character, so
    ``\{``, ``\}`` and ``\\`` do not affect the depth count.

    Returns an empty string when the command is absent. If the braces never
    balance, falls back to the text up to the first ``}`` (the simple
    non-greedy match), or an empty string if there is no ``}`` at all.
    """
    opening = re.search(r'\\' + re.escape(command) + r'\{', latex)
    if opening is None:
        return ""

    depth = 1
    pos = opening.end()
    while pos < len(latex):
        char = latex[pos]
        if char == '\\':
            # Skip the backslash and the character it escapes.
            pos += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return latex[opening.end():pos]
        pos += 1

    # Unbalanced braces: keep the simple "up to the first }" behaviour.
    fallback = re.search(r'\\' + re.escape(command) + r'\{(.*?)\}', latex, re.DOTALL)
    return fallback.group(1) if fallback else ""


def clean_latex(latex_content: str) -> str:
    r"""
    Reduces a LaTeX document to its title, abstract, and main body, then
    applies light clean-up.

    The result is LaTeX-like text of the form ``\title{...}``,
    ``\begin{abstract}...\end{abstract}``, the body, and a closing
    ``\end{document}``. Clean-up steps, in order: resolve ``\rev{..}{..}`` to
    its second argument, join words split across lines
    (``fix_newlines_final``), drop ``\clearpage``, and collapse runs of three
    or more newlines to a blank line.

    Args:
        latex_content: Full LaTeX source of the paper.

    Returns:
        The cleaned text with surrounding whitespace stripped. Missing parts
        (no title, no abstract, no ``\section`` after ``\begin{document}``)
        are emitted as empty strings rather than raising.
    """
    # 1. Extract title (brace-balanced, so nested groups in the title survive),
    #    then turn LaTeX line breaks (\\) into single spaces.
    title = _extract_braced_argument(latex_content, 'title')
    title = re.sub(r'\\\\\s*', ' ', title).strip()

    # 2. Extract abstract
    abstract_match = re.search(r'\\begin\{abstract\}(.*?)\\end\{abstract\}', latex_content, re.DOTALL)
    abstract = abstract_match.group(1).strip() if abstract_match else ""

    # 3. Find start of main body (first section after document environment)
    body_start_index = -1
    doc_start_match = re.search(r'\\begin\{document\}', latex_content)
    if doc_start_match:
        # Find the first section after \begin{document}
        section_match = re.search(r'\\section', latex_content[doc_start_match.end():])
        if section_match:
            body_start_index = doc_start_match.end() + section_match.start()

    if body_start_index == -1:
        # Fallback if structure is unexpected: keep title/abstract, empty body
        body = ""
    else:
        body = latex_content[body_start_index:]

    # 4. Find end of main body (before references, appendix, etc.):
    #    cut at whichever end marker appears first.
    end_markers = [
        r'\\begin\{thebibliography\}', r'\\bibliography', r'\\appendix',
        r'\\section\*?\{Acknowledgements\}', r'\\section\*?\{Author Contributions\}'
    ]
    end_index = len(body)
    for marker_regex in end_markers:
        end_match = re.search(marker_regex, body)
        if end_match:
            end_index = min(end_index, end_match.start())

    body = body[:end_index]

    # Combine the parts we want to keep
    full_text = f'\\title{{{title}}}\n\n\\begin{{abstract}}\n{abstract}\n\\end{{abstract}}\n\n{body}\n\\end{{document}}'

    # Get rid of \rev{..}{..}: keep the second argument (brace-free arguments only)
    cleaned_text = re.sub(r'\\rev\{[^{}]*\}\{([^{}]*)\}', r'\1', full_text)

    # Fix newlines between words
    cleaned_text = fix_newlines_final(cleaned_text)

    # Remove \clearpage, then collapse the blank-line runs that may leave behind
    cleaned_text = re.sub(r'\\clearpage', '', cleaned_text)
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    return cleaned_text.strip()

In [74]:
cleaned_text = clean_latex(main_tex)

# Save cleaned text to file.
# Encoding is explicit so non-ASCII characters in the paper are written the same
# way on every platform, matching how cleaned_DPO.txt is saved.
with open('../data/arxiv/cleaned_CoT.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_text)

## Let's paraphrase the article 10 times

In [31]:
# Load the cleaned DPO text from disk (decoded as UTF-8).
with open('../data/arxiv/cleaned_DPO.txt', 'r', encoding='utf-8') as f:
    arxiv_paper = f.read()
# `cleaned_paper` is the name the paraphrasing loop below passes to paraphrase_text.
cleaned_paper = arxiv_paper

In [32]:
# Show the full loaded paper text.
print(cleaned_paper)

\title{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}

\begin{abstract}
While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training.
Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with reinforcement learning from human feedback (RLHF).
However, RLHF is a complex and often unstable procedure, first fitting a reward model that reflects the human preferences, and then fine-tuning the large unsupervised LM using reinforcement learning to maximize this estimated reward without drifting too far from the original model.
In this paper we introduce a new parameterization of the reward model in RLHF that enables extraction of the corresponding optimal policy in clo

In [None]:
# Exploration; making sure the function works
import sys
# Add the parent directory to the import path so `utils.utils` can be imported.
sys.path.append('..')

import utils.utils as utils

# Third chunk of the text when split on blank lines.
# NOTE(review): `cleaned_text` is whatever the most recently executed cleaning cell
# produced, not `cleaned_paper` loaded above — confirm this is the intended paper.
part = cleaned_text.split('\n\n')[2]
# Single-string prompt (no separate system prompt); the return value is the cell output.
utils.query_llm(
            prompt=f"Paraphrase the following text, while keeping the latex formatting when it appears. Try to paraphrase the text as much as possible. Text: {part}\n\nParaphrased text: ",
            model="gpt-4.1", 
            temperature=1,
            top_p=0.9,
            system_prompt_included=False,
        )

In [53]:
# add .. path 
import sys
sys.path.append('..')
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import utils.utils as utils

# Shared prompt for the paraphrasing requests. Only the 'system' message is set
# here; the per-paragraph 'user' message is supplied by paraphrase_paragraph.
prompt = {}
prompt['system'] = """You are a helpful assistant that paraphrases text. 
# Instructions
- Paraphrase while keeping latex formatting. 
- Paraphrase the text as much as possible, while keeping the meaning of the text.
- Leave proper nouns, equations, and domain-specific terminology unchanged.
- Only output the paraphrased text, no other text. """

def paraphrase_paragraph(part):
    """
    Paraphrases one paragraph with the LLM, passing short chunks through untouched.

    Args:
        part: Paragraph text.

    Returns:
        `part` itself when it is shorter than 200 characters (headings, short
        LaTeX commands, etc. are not sent to the model); otherwise whatever
        `utils.query_llm` returns for the request.
    """
    if len(part) < 200:
        return part

    # Build a per-call prompt instead of writing into the shared module-level
    # `prompt` dict: this function runs on many worker threads at once, and a
    # shared 'user' entry could be overwritten by another thread before the
    # request is sent, pairing a paragraph with the wrong text.
    request = dict(prompt, user=f"Text: {part}")
    return utils.query_llm(
        prompt=request,
        model="gpt-4.1",
        temperature=1,
        top_p=0.95,
        system_prompt_included=True,
    )

def paraphrase_text(text):
    """
    Paraphrases a whole document paragraph by paragraph.

    The text is split on blank lines, each piece is handed to
    `paraphrase_paragraph` on a pool of 16 worker threads (with a tqdm progress
    bar), and the results are re-joined with blank lines in the original order.
    """
    chunks = text.split('\n\n')
    # executor.map yields results in input order, so paragraph order is preserved.
    with ThreadPoolExecutor(max_workers=16) as pool:
        pending = pool.map(paraphrase_paragraph, chunks)
        rewritten = list(tqdm(pending, total=len(chunks)))
    return '\n\n'.join(rewritten)

In [55]:
# Produce 9 independent paraphrases of the cleaned paper, one file per run.
for i in range(9):
    # Run the slow, failure-prone API calls first, so the output file is only
    # created/truncated once a complete paraphrase is available to write.
    paraphrased_text = paraphrase_text(cleaned_paper)
    with open(f'../data/arxiv/cleaned_DPO_paraphrased_{i}.txt', 'w') as f:
        f.write(paraphrased_text)

100%|██████████| 45/45 [00:16<00:00,  2.66it/s]
100%|██████████| 45/45 [00:12<00:00,  3.49it/s]
100%|██████████| 45/45 [00:18<00:00,  2.42it/s]
100%|██████████| 45/45 [00:28<00:00,  1.59it/s]
100%|██████████| 45/45 [00:21<00:00,  2.11it/s]
100%|██████████| 45/45 [00:15<00:00,  2.96it/s]
100%|██████████| 45/45 [00:18<00:00,  2.43it/s]
100%|██████████| 45/45 [00:33<00:00,  1.33it/s]
100%|██████████| 45/45 [00:30<00:00,  1.48it/s]


In [None]:
# Read the paraphrased texts as a list
# (one entry per file index 0-8, appended in index order)
paraphrased_texts = []
for i in range(9):
    with open(f'../data/arxiv/cleaned_DPO_paraphrased_{i}.txt', 'r') as f:
        paraphrased_texts.append(f.read())


Let's paraphrase as much as we can

In [None]:

# Exploration; making sure the function works
# For instance, if the original paragraph is 'Sally went to the store and bought a new dress because she was sad.', the paraphrased text should be 'Sally was sad. She went to the store and bought a new dress.'.
# Third chunk of `cleaned_text` when split on blank lines; printed for side-by-side comparison.
part = cleaned_text.split('\n\n')[2]
print(part)
# NOTE(review): this rebinds the global `prompt`, which paraphrase_paragraph also
# reads — running this cell changes the system prompt used by later paraphrase runs.
prompt = {}
prompt['system'] = """You are a helpful assistant that paraphrases text. 
# Instructions
- Read the text to understand what it's trying to say.
- Paraphrase the text as much as possible, while keeping the meaning of the text. 
- Paraphrase while keeping latex formatting. 
- Make sure that the syntax, sentence structure, and diction is completely different.
- Ensure the writing is written well.
- Only output the paraphrased text, no other text."""
prompt['user'] = f"Text: {part}\n\n"

# The return value of the call is the cell's displayed output.
utils.query_llm(
            prompt=prompt,
            model="gpt-4.1", 
            temperature=1,
            top_p=0.95,
            system_prompt_included=True,
        )

Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks.
The empirical gains can be striking.
For instance, prompting a PaLM 540B with just eight chain-of-thought exemplars achieves state-of-the-art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.
\end{abstract}


'Testing across three major language models demonstrates that chain-of-thought prompting enhances results on various tasks involving arithmetic, common sense, and symbolic reasoning. These improvements can be quite significant. For example, by supplying PaLM 540B with only eight chain-of-thought examples, the model attains top-tier accuracy on the GSM8K math word problem benchmark, outperforming even a finetuned GPT-3 that uses a verifier.'

In [86]:

# Exploration; making sure the function works
# For instance, if the original paragraph is 'Sally went to the store and bought a new dress because she was sad.', the paraphrased text should be 'Sally was sad. She went to the store and bought a new dress.'.
# Take the second \section of the cleaned text (index 0 is everything before the
# first \section) and put back the marker that split() removed.
# Raw strings: '\s' is not a valid escape sequence in a normal string literal.
part = r'\section' + cleaned_text.split(r'\section')[2]
print(part)
prompt = {}
prompt['system'] = """You are a helpful assistant that paraphrases text. 
# Instructions
- Read the text to understand what it's trying to say.
- Paraphrase the text as much as possible, while keeping the meaning of the text. 
- Don't just paraphrase sentence by sentence, but rather take the freedom to paraphrase the text as a whole. 
- Paraphrase while keeping latex formatting. 
- Leave proper nouns, equations, and domain-specific terminology unchanged.
- Ensure the writing is clear and written well.
- Only output the paraphrased text, no other text."""
prompt['user'] = f"Text: {part}\n\n"

\section{Chain-of-Thought Prompting}
Consider one's own thought process when solving a complicated reasoning task such as a multi-step math word problem. 
It is typical to decompose the problem into intermediate steps and solve each before giving the final answer: \textit{``After Jane gives 2 flowers to her mom she has 10 $\ldots$ then after she gives 3 to her dad she will have 7 $\ldots$ so the answer is 7.''}
The goal of this paper is to endow language models with the ability to generate a similar \textit{chain of thought}---a coherent series of intermediate reasoning steps that lead to the final answer for a problem.
We will show that sufficiently large language models can generate chains of thought if demonstrations of chain-of-thought reasoning are provided in the exemplars for few-shot prompting.

\cref{fig:pull-figure} shows an example of a model producing a chain of thought to solve a math word problem that it would have otherwise gotten incorrect.
The chain of thought in this 

In [87]:
# Query the LLM with the current `prompt` dict (system + user messages) and print the returned text.
print(utils.query_llm(
            prompt=prompt,
            model="gpt-4.1", 
            temperature=1,
            top_p=0.95,
            system_prompt_included=True,
        ))

\section{Chain-of-Thought Prompting}
When approaching complex reasoning tasks—such as multi-step math word problems—people often naturally break the problem down into smaller parts, working through each one before arriving at a final solution. For example: \textit{``After Jane gives 2 flowers to her mom she has 10 $\ldots$ then after she gives 3 to her dad she will have 7 $\ldots$ so the answer is 7.''}
This work aims to equip language models with the capacity to generate a similar \textit{chain of thought}—that is, a logically connected sequence of intermediate reasoning steps that ultimately lead to the correct answer.

We demonstrate that large language models are able to produce chains of thought when they are given examples that explicitly model this reasoning process in few-shot prompts.

As illustrated in \cref{fig:pull-figure}, providing a chain of thought enables a model to solve a math word problem that it might otherwise answer incorrectly. These reasoning steps closely rese