# Create AIMO train dataset

So far the AIMO train problems and their solutions lived in a dictionary inside the code. I want to move them to a CSV file so that I can more easily run experiments that replace MathInstruct with my own problems.

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
from IPython.display import display, Markdown
import json
from tqdm.auto import tqdm

from transformers import AutoTokenizer

# Draw an empty figure and immediately close every open figure.
# NOTE(review): presumably this forces the matplotlib backend to initialise so the
# rcParams set below are not overwritten afterwards — TODO confirm it is still needed.
plt.plot()
plt.close('all')
# Notebook-wide plot defaults: figure size, line width and font size.
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Problems

In [None]:
# Hand-curated AIMO train problems.
# Each entry is a dict with:
#   question: the problem statement (LaTeX, backslashes escaped because the literals are not raw)
#   answers:  one or more worked solutions; every solution ends with "The answer is $\boxed{N}$."
problems = [
    dict(
        question='Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin?',
        answers=[
            """```python
from sympy import *

# Define symbols
x, k, l = symbols('x k l', real=True)

# Define the parabola and line equations
parabola = k*x**2 - 2*k*x + l
line = 4

# Solve for the intersection points
intersection_points = solve(Eq(parabola, line), x)

# Extract the x-coordinates of the intersection points
x_a, x_b = intersection_points

# Calculate the distance between the two points
distance = sqrt((x_b - x_a)**2)

# Set up the equation for the distance
distance_eq = Eq(distance, 6)

# Solve the equation for k
k_sol = solve(distance_eq, k)[0]

# Substitute the value of k into the x-coordinates
x_a = x_a.subs(k, k_sol)
x_b = x_b.subs(k, k_sol)

# Calculate the distances from the points to the origin
dist_a = sqrt(x_a**2 + line**2)
dist_b = sqrt(x_b**2 + line**2)

# Calculate the sum of the squares of the distances
sum_of_squares = simplify(dist_a**2 + dist_b**2)

print(sum_of_squares)
``````output
52
```
The answer is $\\boxed{52}$.""",
        """```python
from sympy import *
# Symbolic Calculation
k, l, x1, y1, x2, y2, d, x, y = symbols('k l x1 y1 x2 y2 d x y', real=True)

parabola = y - k*x**2 + 2*k*x - l
line = y - 4
# intersection equations
eq1 = Eq(parabola.subs({x: x1, y: y1}), 0)
eq2 = Eq(parabola.subs({x: x2, y: y2}), 0)
eq3 = Eq(line.subs({y: y1}), 0)
eq4 = Eq(line.subs({y: y2}), 0)
# distance equations
eq5 = Eq((x2 - x1)**2 + (y2 - y1)**2, 36) 
eq6 = Eq(x1**2 + y1**2 + x2**2 + y2**2, d)

# Solve the system of equations
solutions = solve((eq1, eq2, eq3, eq4, eq5, eq6), (l, x1, y1, x2, y2, d), dict=True)

# Print the solution
print(solutions[0][d])
``````output
52
```
The answer is $\\boxed{52}$.
"""
        ]
    ),
    dict(
        question='Each of the three-digits numbers $111$ to $999$ is coloured blue or yellow in such a way that the sum of any two (not necessarily different) yellow numbers is equal to a blue number. What is the maximum possible number of yellow numbers there can be?',
        answers=[
            """```python
# Greedy Algorithm
def is_yellow(x, A, B):
    if 2*x in A or any(x + item in A for item in A):
        return False
    if 2*x in B or any(x + item in B for item in A):
        return True
    return False

A = set()
B = set()

for value in range(999, 110, -1):
    if is_yellow(value, A, B):
        A.add(value)
    else:
        B.add(value)

print(len(A))
``````output
250
```
The answer is $\\boxed{250}$.""",
        ]
    ),
    # NOTE(review): the snippet in this answer uses true division (36*35/2), so running it
    # would print 702.0 rather than the recorded output 702 — confirm whether the solution
    # text should use // instead. The data is left untouched here.
    dict(
        question="Let the `sparkle` operation on positive integer $n$ consist of calculating the sum of the digits of $n$ and taking its factorial, e.g. the sparkle of 13 is $4! = 24$. A robot starts with a positive integer on a blackboard, then after each second for the rest of eternity, replaces the number on the board with its sparkle. For some `special` numbers, if they're the first number, then eventually every number that appears will be less than 6. How many such special numbers are there with at most 36 digits?",
        answers=[
            """1. Sum the digits
2. Compute the factorial, f.e. $4! = 4*3*2*1 = 24$
3. How can the series stay below 6? 

```
4! = 24, sparkle(24) = 6! > 6
3! = 6, sparkle(6) = 6! > 6
2! = 2, sparkle(2) = 2! = 2 < 6
```

Thus the sum of the digits needs to be 2 or 1.
```python
numbers_starting_with_2 = 36
numbers_starting_with_1 = 36 + 36*35/2
numbers = numbers_starting_with_2 + numbers_starting_with_1
print(numbers)
``````output
702
```
The answer is $\\boxed{702}$.""",
        ]
    ),
    dict(
        question='What is the minimum value of $5x^2+5y^2-8xy$ when $x$ and $y$ range over all real numbers such that $\\|x-2y\\| + \\|y-2x\\| = 40$?  ',
        answers=[
            """```python
from sympy import *

# Define symbols
x, y = symbols('x y', real=True)

# Define the objective function
f = 5*x**2 + 5*y**2 - 8*x*y

# Use the constraint to eliminate one variable
eq = Eq(abs(x - 2*y) + abs(y - 2*x), 40)
y_sol = solve(eq, y)

# Substitute the solution into the objective function
f_x = f.subs(y, y_sol[0])

# Find the critical points
df_dx = diff(f_x, x)
critical_points = solve(df_dx)

# Evaluate the objective function at the critical points and the boundary points
min_val = f_x.subs(x, critical_points[0])
for i in range(1, len(critical_points)):
    min_val = min(min_val, f_x.subs(x, critical_points[i]))

# Print the minimum value
print(min_val)
``````output
800
```
The answer is $\\boxed{800}$.""",
        """```python
import numpy as np
from scipy.optimize import minimize

def objective(vars):
    x, y = vars
    return 5 * x**2 + 5 * y**2 - 8 * x * y

def constraint(vars):
    x, y = vars
    return abs(x - 2*y) + abs(y - 2*x) - 40

# Initial guess
initial_guess = [0, 0]

# Constraint dictionary
con = {'type': 'eq', 'fun': constraint}

# Perform the optimization
result = minimize(objective, initial_guess, constraints=con, method='SLSQP', options={'disp': True})

print(int(round(result.fun)))
``````output
800
```
The answer is $\\boxed{800}$.""",
        ]
    ),
    dict(
        question='There exists a unique increasing geometric sequence of five 2-digit positive integers. What is their sum?',
        answers=[
            """```python
def find_solution():
    for a1 in range(10,99+1):
        for a2 in range(a1+1, 99+1):
            for a3 in range(a2+1, 99+1):
                for a4 in range(a3+1, 99+1):
                    for a5 in range(a4+1, 99+1):
                        if a1 / a2 == a2 / a3 == a3 / a4 == a4 / a5:
                            return (a1, a2, a3, a4, a5)
numbers = find_solution()
print(sum(numbers))
``````output
211
```
The answer is $\\boxed{211}$.""",
        ]
    ),
    dict(
        question='Suppose that we roll four 6-sided fair dice with faces numbered 1 to~6. Let $a/b$ be the probability that the highest roll is a 5, where $a$ and $b$ are relatively prime positive integers. Find $a + b$.',
        answers=[
            """```python
from sympy import Rational

# Define the probability parts
prob_no_6 = Rational(5, 6) ** 4
prob_no_5_if_no_6 = (Rational(4, 5) ** 4)
prob_at_least_one_5_if_no_6 = 1 - prob_no_5_if_no_6

# Combined probability
P = prob_no_6 * prob_at_least_one_5_if_no_6

# Convert to a/b form and compute a + b
a = P.numerator
b = P.denominator

print(a + b)
``````output
185
```
The answer is $\\boxed{185}$.""",
        """The number is small enough to brute force all outcomes.
```python
from sympy import *

# Define symbols
total_outcomes = 6**4
successful_outcomes = 0

# Iterate over all possible outcomes
for die1 in range(1, 7):
    for die2 in range(1, 7):
        for die3 in range(1, 7):
            for die4 in range(1, 7):
                # Check if the highest roll is a 5
                if max(die1, die2, die3, die4) == 5:
                    successful_outcomes += 1

# Calculate the probability
probability = Rational(successful_outcomes, total_outcomes)

# Get the numerator (a) and denominator (b) of the probability
a = probability.numerator
b = probability.denominator

# Print the sum of a and b
print(a + b)
``````output
185
```
The answer is $\\boxed{185}$.""",
        ]
    ),
    dict(
        question='The points $\\left(x, y\\right)$ satisfying $((\\vert x + y \\vert - 10)^2 + ( \\vert x - y \\vert - 10)^2)((\\vert x \\vert - 8)^2 + ( \\vert y \\vert - 8)^2) = 0$ enclose a convex polygon. What is the area of this convex polygon?',
        answers=[
            """```python
from sympy import *
x, y = symbols('x y', real=True)
eq1 = Eq((abs(x) - 8)**2, 0)
eq2 = Eq((abs(y) - 8)**2, 0)
eq3 = Eq((abs(x + y) - 10)**2, 0)
eq4 = Eq((abs(x - y) - 10)**2, 0)
points = solve((eq1, eq2), (x, y))
points.extend(solve((eq3, eq4), (x, y)))

# Find the convex hull
hull = convex_hull(*points)
# Create a Polygon object from the convex hull vertices
polygon = Polygon(*hull.vertices)
# Calculate the area of the polygon
area = polygon.area
# Print the area
print(area)
``````output
320
```
The answer is $\\boxed{320}$.""",
        """```python
from sympy import *

points = []

for x in range(-100, 100):
    for y in range(-100, 100):
        if ((abs(x + y)-10)**2 + (abs(x - y)-10)**2) * ((abs(x)-8)**2 + (abs(y)-8)**2) == 0:
            points.append(Point(x, y))

# Find the convex hull
hull = convex_hull(*points)

# Create a Polygon object from the convex hull vertices
polygon = Polygon(*hull.vertices)

# Calculate the area of the polygon
area = polygon.area

# Print the area
print(area)
``````output
320
```
The answer is $\\boxed{320}$.""",
        ]
    ),
    dict(
        question='Let $ABCD$ be a unit square. Let $P$ be the point on $AB$ such that $\\|AP\\| = 1/{20}$ and let $Q$ be the point on $AD$ such that $\\|AQ\\| = 1/{24}$. The lines $DP$ and $BQ$ divide the square into four regions. Find the ratio between the areas of the largest region and the smallest region.',
        answers=[
            """The largest and the smallest region are proportional, thus we can compute the ratio of the areas simply by dividing the know sides.
```python
ratio = 1*1/(1/20*1/24)
print(ratio)
``````output
480
```
The answer is $\\boxed{480}$.""",
        """```python
from sympy import *

# Points
A = Point(0, 0)
B = Point(1, 0)
C = Point(1, 1)
D = Point(0, 1)
P = Point(Rational(1,20), 0)
Q = Point(0, Rational(1,24))

# Lines
line_DP = Line(D, P)
line_BQ = Line(B, Q)

# Intersection
intersection = line_DP.intersection(line_BQ)[0]

# Areas
# Region 1: Quadrilateral A, P, Intersection, Q
area1 = Polygon(A, P, intersection, Q).area

# Region 2: P, B, Intersection
area2 = Polygon(P, B, intersection).area

# Region 3: B, C, D, Intersection
area3 = Polygon(B, C, D, intersection).area

# Region 4: D, Q, Intersection
area4 = Polygon(D, Q, intersection).area

ratio = max([area1, area2, area3, area4]) / min([area1, area2, area3, area4])
print(ratio)
``````output
480
```
The answer is $\\boxed{480}$.""",
        """```python
import sympy as sp

def triangle_area(p1, p2, p3):
    matrix = sp.Matrix([
        [p1.x, p1.y, 1],
        [p2.x, p2.y, 1],
        [p3.x, p3.y, 1]
    ])
    return sp.Abs(matrix.det()) / 2

# Define the coordinates of points
A = sp.Point(0, 0)
B = sp.Point(0, 1)
C = sp.Point(1, 1)
D = sp.Point(1, 0)
P = sp.Point(0, 1/20)
Q = sp.Point(1/24, 0)

# Define lines
DP = sp.Line(D, P)
BQ = sp.Line(B, Q)

intersection = DP.intersection(BQ)[0]


area_small = triangle_area(A, Q, intersection) + triangle_area(A, P, intersection)
area_large = triangle_area(C, B, intersection) + triangle_area(C, D, intersection)
ratio = area_large/area_small
print(ratio)
``````output
480
```
The answer is $\\boxed{480}$.""",
        ]
    ),
    dict(
        question='A function $f: \\mathbb N \\to \\mathbb N$ satisfies the following two conditions for all positive integers $n$:$f(f(f(n)))=8n-7$ and $f(2n)=2f(n)+1$. Calculate $f(100)$.',
        answers=[
            """$f(n) = 2n \\rightarrow f(f(f(n))) = 8n$


$f(n) = 2n - 1 \\rightarrow f(f(f(n))) = 8n - 7$ 
```python
def f(x):
    return 2*x - 1
print(f(100))
``````output
199
```
The answer is $\\boxed{199}$.""",]
    ),
    dict(
        question='For how many positive integers $m$ does the equation $\\vert \\vert x-1 \\vert -2 \\vert=\\frac{m}{100}$ have $4$ distinct solutions?',
        answers=[
            """```python
from sympy import symbols, Eq, solve, Abs

# Define the variable
x = symbols('x', real=True)

# Set up the equation with m/100 on the right-hand side
m = symbols('m', real=True, positive=True)  # m is a positive real number
equation = Eq(Abs(Abs(x - 1) - 2), m / 100)

# Solve the equation for x in terms of m
solutions = solve(equation, x)

# Now check for which m we get 4 distinct solutions
# This is done by evaluating when the solutions are distinct
# For each integer m from 1 to some upper limit (since m is m/100, the practical range of m should be checked)
distinct_solutions = []
for mi in range(1, 1000):  # Check for m from 1 to 999
    sols = set([sol.subs(m, mi) for sol in solutions])
    valid_sols = set(sol for sol in sols if sol != nan)
    if len(valid_sols) == 4:
        distinct_solutions.append(mi)

print(len(distinct_solutions))
``````output
199
```
The answer is $\\boxed{199}$.""",
            """```python
from sympy import symbols, Eq, solve, Abs

# Define the variable
x = symbols('x', real=True)

# Set up the equation with m/100 on the right-hand side
m = symbols('m', real=True, positive=True)  # m is a positive real number
equation = Eq(Abs(Abs(x - 1) - 2), m / 100)

# Solve the equation for x in terms of m
solutions = solve(equation, x)
print(solutions)
``````output
[-m/100 - 1, m/100 + 3, Piecewise((3 - m/100, m <= 200), (nan, True)), Piecewise((m/100 - 1, m < 200), (nan, True))]
```

For the equation to have 4 distinct solutions, $ m $ must be smaller than $200$, and since the problem says that it is a positive number it also needs to be bigger than $0$. 

Let's compute the number of possible values using python.

```python
# Calculate the number of integers m where 0 < m < 200
m_values = range(1, 200)
print(len(m_values))
``````output
199
```
The answer is $\\boxed{199}$.""",]
    ),
]
# Sanity check: number of curated problems.
print(len(problems))

## Create dataset

### Code

In [None]:
def get_tokenizer(model_path):
    """Load a pretrained tokenizer and make it pad with its end-of-sequence token.

    Parameters
    ----------
    model_path : str
        Location passed straight to ``AutoTokenizer.from_pretrained``.

    Returns
    -------
    The loaded tokenizer, with ``pad_token_id`` set to ``eos_token_id``.
    """
    loaded = AutoTokenizer.from_pretrained(model_path)
    # Reuse the EOS id as the padding id.
    loaded.pad_token_id = loaded.eos_token_id
    return loaded

# Instantiate the tokenizer that is later used to count tokens per problem/solution.
# NOTE(review): absolute, machine-specific checkpoint path — it must exist locally for this cell to run.
tokenizer = get_tokenizer('/home/gbarbadillo/data/deepseekmath')

In [None]:
def parse_boxed_answer(text):
    """Extract the last ``\\boxed{<digits>}`` value from ``text`` as an integer.

    Parameters
    ----------
    text : str
        Solution text that may contain one or more boxed numeric answers.

    Returns
    -------
    int or None
        The last boxed number converted with ``text_to_int_answer``, or
        ``None`` when the text contains no boxed run of digits.
    """
    boxed_numbers = re.compile(r'\\boxed\{(\d+)\}').findall(text)
    if not boxed_numbers:
        return None
    # When several boxed values are present, the final one is used.
    return text_to_int_answer(boxed_numbers[-1])

def text_to_int_answer(text):
    """Convert an answer given as text into a non-negative integer.

    Parameters
    ----------
    text : str
        Candidate answer, e.g. ``'5'`` or ``'5.0'``.

    Returns
    -------
    int or None
        The integer value, or ``None`` when the text is not numeric, is
        negative, or is not integer-valued.
    """
    try:
        answer = float(text)
        if answer < 0 or not answer.is_integer():
            return None
        try:
            # Plain integer literals are parsed exactly; going through float
            # would silently round values above 2**53.
            return int(text)
        except (TypeError, ValueError):
            # Inputs such as '5.0' or '1e3' are not valid int literals.
            return int(answer)
    except (ValueError, OverflowError):
        return None

assert 5 == text_to_int_answer('5')
assert 5 == text_to_int_answer('5.0')
assert text_to_int_answer('-1') is None
assert text_to_int_answer('0.5') is None
assert text_to_int_answer('pi') is None
# Large integers must not lose precision through the float conversion.
assert 9007199254740993 == text_to_int_answer('9007199254740993')

### Create

In [None]:
# One question per problem, paired with its shortest solution (by character count;
# the first one wins on ties).
questions = [entry['question'] for entry in problems]
answers = [min(entry['answers'], key=len) for entry in problems]

In [None]:
# Assemble the dataset: problem text, chosen solution, parsed boxed answer,
# token counts for both texts, and a running integer id.
df = pd.DataFrame({'problem': questions, 'solution': answers}).assign(
    answer=lambda frame: frame['solution'].apply(parse_boxed_answer),
    input_tokens=lambda frame: frame['problem'].apply(lambda text: len(tokenizer.tokenize(text))),
    output_tokens=lambda frame: frame['solution'].apply(lambda text: len(tokenizer.tokenize(text))),
    total_tokens=lambda frame: frame['input_tokens'] + frame['output_tokens'],
    id=lambda frame: range(len(frame)),
)

# Persist without the pandas index column.
df.to_csv('/mnt/hdd0/Kaggle/aimo/data/AIMO_train_with_solutions.csv', index=False)

In [None]:
# Display the first rows of MATHCodeInstruct_curated.csv.
# NOTE(review): presumably shown to compare its column layout with the CSV written above — confirm.
pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATHCodeInstruct_curated.csv').head()

In [None]:
# Display the first rows of train_with_solutions.csv.
# NOTE(review): this is not the AIMO_train_with_solutions.csv file written earlier in the
# notebook — confirm which file is meant to be inspected here.
pd.read_csv('/mnt/hdd0/Kaggle/aimo/data/train_with_solutions.csv').head()