# Data exploration

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
from IPython.display import display, Markdown

from transformers import AutoTokenizer

# Draw and immediately discard an empty figure before touching rcParams.
# NOTE(review): presumably this forces the plotting backend to initialise so
# the settings below are not overwritten by the first real plot — confirm.
plt.plot()
plt.close('all')

# Notebook-wide plot defaults: wide figures, thick lines, large fonts.
for option, value in (
    ("figure.figsize", (20, 5)),
    ("lines.linewidth", 3),
    ("font.size", 16),
):
    mpl.rcParams[option] = value

## MATH dataset

https://www.kaggle.com/datasets/alejopaullier/aimo-external-dataset

### Code

In [None]:
def render_problem(idx):
    """Display one row of the global ``df`` as Markdown.

    Three Markdown cells are shown in order: a title line with the problem
    number and its level/type/source/stage, the problem statement, and the
    solution.

    Args:
        idx: Index label looked up with ``df.loc``; the title shows ``idx + 1``.
    """
    record = df.loc[idx]
    header = (
        f'## Problem {idx+1}: {record["level"]} - {record["type"]} '
        f'{record["source"]}/{record["stage"]}'
    )
    display(Markdown(header))
    for heading, column in (("Problem", "problem"), ("Solution", "solution")):
        display(Markdown(f"### {heading}\n{record[column]}"))

In [None]:
def text_to_int_answer(text):
    """Convert the text of an answer into a non-negative integer.

    Plain integer literals (e.g. ``"42"``) are parsed directly with ``int`` so
    that arbitrarily large values keep full precision; going through ``float``
    first would silently corrupt integers above 2**53. Any other numeric text
    (e.g. ``"3.0"`` or ``"1e3"``) is parsed as a float and truncated towards
    zero.

    Args:
        text: Numeric text to convert.

    Returns:
        The parsed integer, or ``None`` when it is negative.

    Raises:
        ValueError: If ``text`` is not numeric (or is NaN).
        OverflowError: If ``text`` represents an infinite value.
    """
    try:
        answer = int(text)
    except ValueError:
        # Not a plain integer literal: fall back to float parsing.
        answer = int(float(text))
    if answer < 0:
        return None
    return answer

def parse_boxed_answer(text):
    r"""Return the integer held by the last ``\boxed{<digits>}`` in ``text``.

    Only boxes whose content is made of digits are considered.

    Args:
        text: Text to scan, typically a worked solution.

    Returns:
        The value produced by ``text_to_int_answer`` for the last matching
        box, or ``None`` when no box matches.
    """
    last_digits = None
    for boxed in re.finditer(r'\\boxed\{(\d+)\}', text):
        last_digits = boxed.group(1)
    if last_digits is None:
        return None
    return text_to_int_answer(last_digits)

def safe_parse_boxed_answer(text):
    r"""Return the boxed integer of ``text`` only when it is unambiguous.

    The text must contain exactly one ``\boxed`` expression and that
    expression must hold only digits. Texts with several boxes are rejected
    even when only one of them is an integer, because it is then unclear
    which box is the final answer.

    Args:
        text: Text to scan, typically a worked solution.

    Returns:
        The value produced by ``text_to_int_answer`` for the single box, or
        ``None`` when there is no box, more than one box, or the box does not
        contain only digits.
    """
    # Count every \boxed, not just the integer-valued ones.
    if text.count('\\boxed') != 1:
        return None
    boxed = re.search(r'\\boxed\{(\d+)\}', text)
    if boxed is None:
        return None
    return text_to_int_answer(boxed.group(1))

In [None]:
def get_tokenizer(model_path):
    """Load a tokenizer and make it pad with its end-of-sequence token.

    Args:
        model_path: Location handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer with ``pad_token_id`` set to ``eos_token_id``.
    """
    loaded = AutoTokenizer.from_pretrained(model_path)
    # The end-of-sequence id doubles as the padding id.
    loaded.pad_token_id = loaded.eos_token_id
    return loaded

# Tokenizer shared by the token-count cells further down.
# NOTE(review): the path looks like a local DeepSeekMath checkpoint — confirm.
tokenizer = get_tokenizer('/home/gbarbadillo/data/deepseekmath')

### Exploration

In [None]:
# Load the combined MATH + GSM8k file, then keep only the MATH rows.
# The row count is printed before and after the filter.
df = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/MATH_and_GSM8k.csv')
print(len(df))
df = df.loc[df['source'] == 'MATH']
print(len(df))
df.head()

In [None]:
# Show the row with index label 150 as an example problem.
render_problem(150)

In [None]:
# Number of problems per difficulty level.
df['level'].value_counts()

In [None]:
# Number of problems per problem type.
df['type'].value_counts()

### Problems with integer answers

In [None]:
# Count the \boxed occurrences in every solution and show the distribution.
boxed_counts = df['solution'].str.count(r'\\boxed')
df['n_boxed'] = boxed_counts
df['n_boxed'].value_counts()

In [None]:
# Rows whose solution contains more than one \boxed occurrence.
df.loc[df['n_boxed'] > 1]

In [None]:
# Show the row with index label 65.
render_problem(65)

We should avoid problems whose solution contains more than one `\boxed` expression, because it is then ambiguous which one is the final answer.

In [None]:
# Extract an answer from every solution with safe_parse_boxed_answer, then
# show the total number of rows next to the number that got an answer.
df['parsed_answer'] = df['solution'].apply(safe_parse_boxed_answer)
len(df), int(df['parsed_answer'].notnull().sum())

In [None]:
# Keep only rows that have a parsed answer and exactly one \boxed occurrence,
# then renumber the index from 0.
keep = df['parsed_answer'].notnull() & (df['n_boxed'] == 1)
df = df[keep].reset_index(drop=True)
print(len(df))
df.head()

In [None]:
# Number of remaining problems per difficulty level.
df['level'].value_counts()

In [None]:
# Number of remaining problems per stage.
df['stage'].value_counts()

In [None]:
# Number of remaining problems per problem type.
df['type'].value_counts()

In [None]:
# Show the row with index label 200 of the filtered, re-indexed frame.
render_problem(200)

### Token distribution

In [None]:
# Preview the first rows before adding the token-count columns.
df.head()

In [None]:
# Token counts of the problem statement (input) and of the solution (output),
# measured with the notebook-level tokenizer.
for text_column, count_column in (
    ('problem', 'input_tokens'),
    ('solution', 'output_tokens'),
):
    df[count_column] = df[text_column].apply(lambda text: len(tokenizer.tokenize(text)))

In [None]:
# Cumulative, normalised histograms of input and output token counts on
# shared axes.
bins = np.linspace(0, 2000, 500)
hist_options = dict(bins=bins, alpha=0.5, density=True, cumulative=1)
for key in ('input_tokens', 'output_tokens'):
    plt.hist(df[key], label=key, **hist_options)
plt.xlim(0, 1000)
plt.ylim(0, 1)
plt.grid()
plt.legend(loc='best');

Around 90% of the solutions have fewer than 400 tokens.

Let's see if there is any relation between the difficulty and token length.

In [None]:
# One figure per token column, with one cumulative histogram per level.
bins = np.arange(0, 2000)
# Upper x-axis limit for each column.
x_limits = {'input_tokens': 200, 'output_tokens': 1000}
for key, x_max in x_limits.items():
    # NOTE(review): [:-1] drops the last level in sorted order — presumably a
    # placeholder level; confirm, otherwise the hardest level is left out.
    for level in sorted(df['level'].unique())[:-1]:
        level_values = df.loc[df['level'] == level, key]
        plt.hist(level_values, bins=bins, alpha=0.5, label=level, density=True, cumulative=1)
    plt.xlim(0, x_max)
    plt.ylim(0, 1)
    plt.grid()
    plt.legend(loc='best')
    plt.title(f'{key} distribution')
    plt.xlabel('Tokens')
    plt.ylabel('Cumulative probability')
    plt.show()

As expected, the more difficult problems have longer problem statements and longer solutions. Very beautiful graph.

### Train and test distribution

In [None]:
# Number of problems in each stage.
df['stage'].value_counts()

In [None]:
# Difficulty-level distribution within the test stage.
df.loc[df['stage'] == 'test', 'level'].value_counts()

### Save for later use

I'm going to save the file so that I can use it later to create few-shot prompts.

In [None]:
# Combined length of problem statement and solution, in tokens.
df['total_tokens'] = df.input_tokens + df.output_tokens
# Strip the 'parsed_' prefix from column names (parsed_answer -> answer).
df.columns = [column.replace('parsed_', '') for column in df.columns]
# Order by level, with stage as the tie-breaker. One multi-key sort is used
# because two consecutive single-key sorts only give this ordering when the
# second sort is stable, and sort_values defaults to quicksort.
df.sort_values(['level', 'stage'], inplace=True)
# Sequential identifier that follows the sorted order.
df['id'] = np.arange(len(df))
df['answer'] = df['answer'].astype(int)
df.head()

In [None]:
# Ten most frequent answers once reduced modulo 1000.
df['answer'].mod(1000).value_counts().head(10)

In [None]:
# Write the full filtered frame to disk, without the index column.
df.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH.csv', index=False)

In [None]:
# Write one CSV per stage and report how many rows each one holds.
for stage in ('train', 'test'):
    stage_rows = df.loc[df['stage'] == stage]
    stage_rows.to_csv(f'/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_{stage}.csv', index=False)
    print(stage, len(stage_rows))

## TODO

- [x] Focus on problems that have integer and non-negative answers
- [x] What is the distribution of output and input tokens?
- [ ] What if I create smaller versions of the test set?