In [118]:
from datasets import load_dataset

math_dataset = load_dataset("hendrycks/competition_math", split="test")

## Phase 1. Simple test with single problem.
Let's do a simple test. We'll:
1. Choose a random math problem from the MATH test dataset.
2. Ask the model to solve it using the standard zero-shot.
3. Print out the model answer.

In [97]:
import ollama
import random

rand_entry = random.choice(math_dataset)
rand_problem = f"{rand_entry['problem']} \n\n As you solve the problem think step-by-step. Explain the solution and enclose the ultimate answer in \\boxed{{}} with no text additional text explanation."

def math_answerer(math_problem):
    model = 'llama3'
    messages = [
        {
            'role': 'user',
            'content': math_problem
        }
    ]

    # Make the request to the Ollama API
    response = ollama.chat(model=model, messages=messages, stream=False)
    return response["message"]["content"]

model_full_solution = math_answerer(rand_problem)

print(f"The random entry chosen from MATH is: {rand_entry}")
print("----")
print(f"The random problem is: {rand_problem}")
print("----")
print(f"The model full solution is: {model_full_solution}.")

The random entry chosen from MATH is: {'problem': 'If $*$ represents an operation defined by $a*b=a^b+b$, find $(1*2)*3$.', 'level': 'Level 2', 'type': 'Algebra', 'solution': 'We have: $(1*2)*3=(1^2+2)*3=3*3=3^3+3=27+3=\\boxed{30}$.'}
----
The random problem is: If $*$ represents an operation defined by $a*b=a^b+b$, find $(1*2)*3$. 

 As you solve the problem think step-by-step. Explain the solution and enclose the ultimate answer in \boxed{} with no text additional text explanation.
----
The model full solution is: Let's break it down step by step:

$(1*2)*3$ means we need to evaluate $1^2+2$, and then multiply the result by 3.

To find $(1*2)$, we substitute 1 and 2 into the operation definition: $1*2=1^2+2=1+2=3$

So now we have $(3)*3$. Again, we substitute 3 into the operation definition: $3*3=3^3+3=27+3=30$

Therefore:

\boxed{30}.


## Phase 2: Now extract the answer and compare to the official anwser
We'll build on Phase 1 adding in new functions to extract the final \boxed{} answer from the model and compare it to the offical answer from MATH.

In [116]:
import re

# Function to extract the **last** \boxed{}.
# In the simplest case (e.g. Phase 1) the model is likely to output only one \boxed{} solution.
# However, the analogical reasoning prompt requests the model think about problems and solutions
# to related problems, enclosing those in \boxed{} before solving the problem at hand, which
# should be the last \boxed{} statement.
def find_closing_boxed(text):
    matches = re.finditer(r'\\boxed\{', text) # Find all answers
    start_index = [match.end() for match in matches][-1] # Get the last \boxed{ answer
    substring = text[start_index:]
    brace_count = 1

    for end_index, char in enumerate(substring, start = start_index):
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
        else:
            continue

        if brace_count == 0:
            return [start_index, end_index]
        
    # Return None if no matching brace is found (though this should not happen in well-formed strings)
    return [start_index, -1]

def extract_final_answer(math_solution):
    start_index, end_index = find_closing_boxed(math_solution)
    return math_solution[start_index:end_index].strip()

model_final_answer = extract_final_answer(model_full_solution)
offical_final_answer = extract_final_answer(rand_entry["solution"])

print(f"The full model solution is: {model_full_solution}")
print("----")
print(f"The final model answer is: {model_final_answer}")
print("----")
print(f"The official answer is: {offical_final_answer}")
print("----")
print(f"Do the answers match: {model_final_answer  == offical_final_answer}")

The full model solution is: Let's break it down step by step:

$(1*2)*3$ means we need to evaluate $1^2+2$, and then multiply the result by 3.

To find $(1*2)$, we substitute 1 and 2 into the operation definition: $1*2=1^2+2=1+2=3$

So now we have $(3)*3$. Again, we substitute 3 into the operation definition: $3*3=3^3+3=27+3=30$

Therefore:

\boxed{30}
----
The final model answer is: 30
----
The official answer is: 30
----
Do the answers match: True


In [121]:
rand_problem = random.choice(math_dataset)

analogical_prompt = f"""Your task is to tackle mathematical problems. When presented with a math problem, recall relevant problems as examples. Afterward, proceed to solve the initial problem.

# Problem: {rand_problem['problem']}

# Instructions:
## Relevant Problems:
Recall three examples of math problems that are relevant to the initial problem. Your problems should be distinct from each other and from the initial problem (e.g., involving different numbers and names). For each problem:
- After "Q: ", describe the problem
- After "A: ", explain the solution and enclose the ultimate answer in \\boxed{{}}.

## Solve the Initial Problem: Q: Copy and paste the initial problem here. A: Explain the solution and enclose the ultimate answer in \\boxed{{}} here."""

model_full_solution = math_answerer(analogical_prompt)

model_final_answer = extract_final_answer(model_full_solution)
offical_final_answer = extract_final_answer(rand_entry["solution"])

print(f"The full model solution is: {model_full_solution}")
print("----")
print(f"The final model answer is: {model_final_answer}")
print("----")
print(f"The official answer is: {offical_final_answer}")
print("----")
print(f"Do the answers match: {model_final_answer  == offical_final_answer}")

The full model solution is: I'm ready to tackle this mathematical challenge!

**Relevant Problems:**

1. **Q:** Suppose $a$ and $b$ are inversely proportional. If $a=12$ when $b=4$, find the value of $b$ when $a=8$.
**A:** $\boxed{6}$
To solve this problem, we can set up a proportionality equation: $\frac{a}{b} = k$, where $k$ is the constant of proportionality. Substituting given values, we get $\frac{12}{4}=k$. Then, substituting $a=8$, we find that $b=6$.

2. **Q:** Suppose $x$ and $y$ are inversely proportional. If $x=16$ when $y=2$, find the value of $y$ when $x=10$.
**A:** $\boxed{4}$
Using a proportionality equation, we get $\frac{x}{y}=k$. Substituting given values, we find that $k=\frac{16}{2}=8$. Then, substituting $x=10$, we find that $y=\frac{10}{8}=\boxed{\frac{5}{4}}$.

3. **Q:** Suppose $p$ and $q$ are inversely proportional. If $p=15$ when $q=3$, find the value of $q$ when $p=20$.
**A:** $\boxed{\frac{1}{2}}$
Using a proportionality equation, we get $\frac{p}{q}=k$. Sub

In [189]:
import ollama
import random
import re
import time
import json
from datasets import load_dataset

math_dataset = load_dataset("hendrycks/competition_math", split="test")

def math_answerer(math_problem, retries = 3, delay=5):
    model = 'llama3'
    messages = [
        {
            'role': 'user',
            'content': math_problem
        }
    ]

    attempt = 0
    while attempt < retries:
        try:
            response = ollama.chat(model=model, messages=messages, stream=False)
            return response["message"]["content"]
        
        except Exception as e:
            attempt += 1
            print(f"Encountered error {e}. Waiting {delay} seconds and retrying. Attempt{attempt}...")
            time.sleep(delay)

    print("All retries failed.")
    return None


# Function to extract the **last** \boxed{}.
# In the simplest case (e.g. Phase 1) the model is likely to output only one \boxed{} solution.
# However, the analogical reasoning prompt requests the model think about problems and solutions
# to related problems, enclosing those in \boxed{} before solving the problem at hand, which
# should be the last \boxed{} statement.
def find_closing_boxed(text):
    if text is None:
        return [None, None]
    
    matches = list(re.finditer(r'\\boxed\{', text)) # Find all answers
    if not matches:
        return [None, None]
    
    start_index = matches[-1].end() # Get the last \boxed{ answer
    substring = text[start_index:]
    brace_count = 1

    for end_index, char in enumerate(substring, start = start_index):
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
        else:
            continue

        if brace_count == 0:
            return [start_index, end_index]
        
    # Return None if no matching brace is found (though this should not happen in well-formed strings)
    return [start_index, -1]

def extract_final_answer(math_solution):
    if math_solution is None:
        return None
    start_index, end_index = find_closing_boxed(math_solution)
    return math_solution[start_index:end_index].strip()

# Initialize result dictionary
cot_results = {}
cot_prompt = f"{rand_problem} \n\n As you solve the problem think step-by-step. Explain the solution and enclose the ultimate answer in \\boxed{{}} with no text additional text explanation."
# Randomly sample indices
random.seed(39)  # Seed for reproducibility if needed
rand_indices = random.sample(range(len(math_dataset)), 500)

epochs = 5
for epoch in range(1, epochs + 1):
    cot_results[epoch] = {}

    for idx, rand_problem_index in enumerate(rand_indices, start=1):
        print(f"Epoch: {epoch}, Problem: {idx}.")
        rand_problem = math_dataset[rand_problem_index]["problem"]
        cot_prompt = f"{rand_problem} \n\n Solve the problem by thinking step by step. Explain the solution and enclose the ultimate answer in \\boxed{{}} with no additional text explanation."
        model_full_solution = math_answerer(cot_prompt)
        model_final_answer = extract_final_answer(model_full_solution)
        official_final_answer = extract_final_answer(math_dataset[rand_problem_index]["solution"])

        cot_results[epoch][idx] = {
            "problem_index": rand_problem_index,
            "model_full_solution": model_full_solution,
            "model_final_answer": model_final_answer,
            "official_final_answer": official_final_answer,
            "answer_correct?": model_final_answer == official_final_answer
        }

with open('cot_results.json', 'w') as outfile:
    json.dump(cot_results, outfile, indent=4)

Epoch: 1, Problem: 1.
Epoch: 1, Problem: 2.
Epoch: 1, Problem: 3.
Epoch: 1, Problem: 4.
Epoch: 1, Problem: 5.
Epoch: 1, Problem: 6.
Epoch: 1, Problem: 7.
Epoch: 1, Problem: 8.
Epoch: 1, Problem: 9.
Epoch: 1, Problem: 10.
Epoch: 1, Problem: 11.
Epoch: 1, Problem: 12.
Epoch: 1, Problem: 13.
Epoch: 1, Problem: 14.
Epoch: 1, Problem: 15.
Epoch: 1, Problem: 16.
Epoch: 1, Problem: 17.
Epoch: 1, Problem: 18.
Epoch: 1, Problem: 19.
Epoch: 1, Problem: 20.
Epoch: 1, Problem: 21.
Epoch: 1, Problem: 22.
Epoch: 1, Problem: 23.
Epoch: 1, Problem: 24.
Epoch: 1, Problem: 25.
Epoch: 1, Problem: 26.
Epoch: 1, Problem: 27.
Epoch: 1, Problem: 28.
Epoch: 1, Problem: 29.
Epoch: 1, Problem: 30.
Epoch: 1, Problem: 31.
Epoch: 1, Problem: 32.
Epoch: 1, Problem: 33.
Epoch: 1, Problem: 34.
Epoch: 1, Problem: 35.
Epoch: 1, Problem: 36.
Epoch: 1, Problem: 37.
Epoch: 1, Problem: 38.
Epoch: 1, Problem: 39.
Epoch: 1, Problem: 40.
Epoch: 1, Problem: 41.
Epoch: 1, Problem: 42.
Epoch: 1, Problem: 43.
Epoch: 1, Problem: 4

In [187]:
import re

def clean_answer(answer):
    # Step 1: Remove common prefixes like "n=" and "k="
    answer = re.sub(r'\b[nk]\s*=\s*', '', answer)
    
    # Step 2: Normalize spaces around LaTeX commands
    answer = re.sub(r'\s*\\\s*', '\\', answer)
    
    # Step 3: Remove spaces before and after { and }
    answer = re.sub(r'\s*{\s*', '{', answer)
    answer = re.sub(r'\s*}\s*', '}', answer)
    
    # Step 4: Normalize spaces within text inside \text{}
    answer = re.sub(r'\\text{([^}]*)}', lambda m: f'\\text{{{m.group(1).strip()}}}', answer)
    
    # Step 5: Remove extra spaces
    answer = re.sub(r'\s+', ' ', answer).strip()
    
    return answer

def test_clean_answer():
    test_cases = [
        {"input": "25 \\text{ km}", "expected": "25 \\text{km}"},
        {"input": "25 \\, \\text{km}", "expected": "25 \\text{km}"},
        {"input": "\\16", "expected": "16"},
        {"input": "16", "expected": "16"},
        {"input": "k = 3", "expected": "3"},
        {"input": "n= 42", "expected": "42"},
        {"input": " \\boxed{ 42 }", "expected": "\\boxed{42}"}
    ]

    for idx, case in enumerate(test_cases):
        input_answer = case["input"]
        expected_output = case["expected"]
        cleaned_output = clean_answer(input_answer)
        assert cleaned_output == expected_output, f"Test case {idx + 1} failed: {cleaned_output} != {expected_output}"
        print(f"Test case {idx + 1} passed: {cleaned_output} == {expected_output}")

# Run the tests
test_clean_answer()


error: bad escape (end of pattern) at position 0