# Data exploration

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
from IPython.display import display, Markdown
import json
from tqdm.auto import tqdm

from transformers import AutoTokenizer

plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

In [None]:
def text_to_int_answer(text):
    """Convert ``text`` to a non-negative integer answer, or return None.

    Args:
        text: Usually a string such as ``'5'`` or ``'5.0'``; numbers are
            accepted as well.

    Returns:
        The value as an ``int`` when it is a non-negative whole number.
        ``None`` when the input cannot be parsed as a number, is negative,
        has a fractional part, or is nan/inf.
    """
    try:
        answer = float(text)
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-numeric objects such as None; ValueError: strings that
        # are not numbers; OverflowError: Python ints too large for a float.
        return None
    if answer < 0 or not answer.is_integer():
        # nan and inf also end up here because is_integer() is False for them.
        return None
    if isinstance(text, str):
        try:
            # Parse plain digit strings directly: going through float would
            # silently round integers above 2**53.
            return int(text)
        except ValueError:
            pass  # e.g. '5.0' or '1e3': whole number, but not int() syntax
    return int(answer)

assert 5 == text_to_int_answer('5')
assert 5 == text_to_int_answer('5.0')
assert text_to_int_answer('-1') is None
assert text_to_int_answer('0.5') is None
assert text_to_int_answer('pi') is None
assert text_to_int_answer(None) is None
assert 9007199254740993 == text_to_int_answer('9007199254740993')

In [None]:
def get_tokenizer(model_path):
    """Load the tokenizer found at ``model_path`` and pad with the EOS token.

    The pad token id is always overwritten with the EOS token id, whether or
    not the loaded tokenizer already defines one.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    tok.pad_token_id = tok.eos_token_id
    return tok

tokenizer = get_tokenizer('/home/gbarbadillo/data/deepseekmath')

## MATH dataset

https://www.kaggle.com/datasets/alejopaullier/aimo-external-dataset

### Code

In [None]:
def render_problem(idx, data=None):
    """Display one problem and its solution as Markdown in the cell output.

    Args:
        idx: Index label of the row to show (looked up with ``.loc``).
        data: DataFrame to read the row from. When omitted, the notebook-level
            ``df`` bound at call time is used, which is what the function did
            before this parameter existed.

    The row must provide ``level``, ``type``, ``source``, ``stage``,
    ``problem`` and ``solution``. The heading shows ``idx + 1``.
    """
    if data is None:
        # Fall back to the global frame so existing render_problem(idx) calls
        # keep working unchanged.
        data = df
    row = data.loc[idx]
    title = f'## Problem {idx+1}: {row["level"]} - {row["type"]} {row["source"]}/{row["stage"]}'
    display(Markdown(title))
    display(Markdown(f"### Problem\n{row['problem']}"))
    display(Markdown(f"### Solution\n{row['solution']}"))

In [None]:


def parse_boxed_answer(text):
    r"""Return the integer inside the last ``\boxed{<digits>}`` found in ``text``.

    Only boxes whose content is made of digits are considered. Returns None
    when there is no such box.
    """
    boxed_values = re.findall(r'\\boxed\{(\d+)\}', text)
    if not boxed_values:
        return None
    last_value = boxed_values[-1]
    return text_to_int_answer(last_value)

def safe_parse_boxed_answer(text):
    r"""Return the integer of the only ``\boxed{<digits>}`` in ``text``.

    Stricter than taking the last box: the result is None unless exactly one
    digits-only box is present.
    """
    boxed_values = re.findall(r'\\boxed\{(\d+)\}', text)
    if len(boxed_values) != 1:
        return None
    (only_value,) = boxed_values
    return text_to_int_answer(only_value)

### Exploration

In [None]:
# Load the combined MATH + GSM8k export and keep only the MATH rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/MATH_and_GSM8k.csv')
print(len(df))
# .copy() makes the filtered frame independent of the full one, so the columns
# added to it in later cells are not written onto a slice
# (avoids SettingWithCopyWarning).
df = df[df.source == 'MATH'].copy()
print(len(df))
df.head()

In [None]:
render_problem(150)

In [None]:
df.level.value_counts()

In [None]:
df.type.value_counts()

### Problems with integer answers

In [None]:
df['n_boxed'] = df.solution.str.count(r'\\boxed')
df['n_boxed'].value_counts()

In [None]:
df[df.n_boxed > 1]

In [None]:
render_problem(65)

We should avoid problems with more than one solution.

In [None]:
df['parsed_answer'] = df.solution.apply(safe_parse_boxed_answer)
len(df), len(df[df.parsed_answer.notnull()])

In [None]:
# Keep only the rows whose solution has exactly one \boxed and whose answer
# parsed as an integer.
has_single_int_answer = df.parsed_answer.notnull() & (df.n_boxed == 1)
# reset_index (not in place) returns a new frame, so the columns added in later
# cells are not assigned onto a filtered slice.
df = df[has_single_int_answer].reset_index(drop=True)
print(len(df))
df.head()

In [None]:
df.level.value_counts()

In [None]:
df.stage.value_counts()

In [None]:
df.type.value_counts()

In [None]:
render_problem(200)

### Token distribution

In [None]:
df.head()

In [None]:
df['input_tokens'] = df.problem.apply(lambda x: len(tokenizer.tokenize(x)))
df['output_tokens'] = df.solution.apply(lambda x: len(tokenizer.tokenize(x)))

In [None]:
bins = np.linspace(0, 2000, 500)
for key in ['input_tokens', 'output_tokens']:
    plt.hist(df[key], bins=bins, alpha=0.5, label=key, density=True, cumulative=1)
plt.xlim(0, 1000)
plt.ylim(0, 1)
plt.grid()
plt.legend(loc=0);

Around 90% of the solutions have fewer than 400 tokens.

Let's see if there is any relation between the difficulty and token length.

In [None]:
# Cumulative token-length distributions: one figure per token column, one curve
# per difficulty level.
bins = np.arange(0, 2000)
# NOTE(review): the last level in sort order is left out of the plots; the
# reason is not visible in this cell. Confirm it is intended.
plotted_levels = sorted(df.level.unique())[:-1]
x_upper_limit = {'input_tokens': 200, 'output_tokens': 1000}
for key in x_upper_limit:
    for level in plotted_levels:
        level_rows = df[df.level == level]
        plt.hist(level_rows[key], bins=bins, alpha=0.5, label=level, density=True, cumulative=1)
    plt.xlim(0, x_upper_limit[key])
    plt.ylim(0, 1)
    plt.grid()
    plt.legend(loc=0)
    plt.title(f'{key} distribution')
    plt.xlabel('Tokens')
    plt.ylabel('Cumulative probability')
    plt.show()

As expected, the more difficult problems have longer answers and descriptions. Very beautiful graph.

### Train and test distribution

In [None]:
df.stage.value_counts()

In [None]:
df[df.stage == 'test']['level'].value_counts()

### Save for later use

I'm going to save the file so that I can use it later to create few-shot prompts.

In [None]:
df['total_tokens'] = df.input_tokens + df.output_tokens
# Drop the 'parsed_' prefix so that 'parsed_answer' is saved as 'answer'.
df.columns = [column.replace('parsed_', '') for column in df.columns]
# Sort by level, then by stage within each level. This used to be two
# consecutive single-column sorts, but the default sort algorithm is not stable,
# so the stage ordering was not guaranteed to survive the second sort.
df = df.sort_values(['level', 'stage'])
# ids follow the sorted order (the index itself is left as it was).
df['id'] = np.arange(len(df))
df['answer'] = df['answer'].astype(int)
df.head()

In [None]:
(df.answer % 1000).value_counts().head(10)

In [None]:
df.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH.csv', index=False)

In [None]:
for stage in ['train', 'test']:
    df[df.stage == stage].to_csv(f'/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_{stage}.csv', index=False)
    print(stage, len(df[df.stage == stage]))

More versions of the test set.

In [None]:
# Smaller test set: levels 4-5 only, no Asymptote figures, and fewer than 1000
# total tokens.
test = df[(df.stage == 'test') & df.level.isin(['Level 4', 'Level 5'])]
test = test[test.problem.apply(lambda x: '[asy]' not in x)]
# .copy() so the in-place sort applied to `test` afterwards works on an
# independent frame rather than on a slice of df.
test = test[test.total_tokens < 1000].copy()
len(test)

1211 samples after filtering, previous test set was 2828 samples

In [None]:
test.sort_values(['level', 'type'], inplace=True)
test.head()

In [None]:
test.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_test_45.csv', index=False)

In [None]:
test = test[test.level.isin(['Level 5'])]
test.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_test_5.csv', index=False)

In [None]:
len(test)

## MathCodeInstruct

https://huggingface.co/datasets/MathLLM/MathCodeInstruct

### Code

In [None]:
def read_jsonl(filepath):
    """Read a JSON Lines file and return the ``messages`` field of each record.

    Args:
        filepath: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A list with the ``messages`` value of every record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``messages`` key.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (for example a trailing empty line at the end of
                # the file) are not records; json.loads('') would raise.
                continue
            data.append(json.loads(line)['messages'])
    return data

In [None]:
def format_answer(sample):
    """Build the solution text from the content items of the third message.

    Every item of ``sample[2]['content']`` except the last one (the response)
    is rendered according to its ``type``: ``text`` goes through
    ``uniform_latex_format``, ``code`` opens a python block followed by an
    output block, and ``execution`` fills that output block and closes it.

    Raises:
        ValueError: If an item has a type other than text, code or execution.
    """
    parts = []
    for step in sample[2]['content'][:-1]:  # the final item is the response
        step_type = step['type']
        if step_type == 'text':
            parts.append(uniform_latex_format(step['content']))
        elif step_type == 'code':
            parts.append(f'\n\n```python\n{step["content"]}\n``````output\n')
        elif step_type == 'execution':
            parts.append(f'{step["content"]}\n```\n\n')
        else:
            raise ValueError(f'Unknown content type: {step_type}')
    return ''.join(parts)

def get_problem(sample):
    """Return the ``content`` of the first content item of the second message."""
    second_message = sample[1]
    first_item = second_message['content'][0]
    return first_item['content']

def get_result(sample):
    """Return the ``content`` of the last content item of the third message."""
    third_message = sample[2]
    final_item = third_message['content'][-1]
    return final_item['content']

def uniform_latex_format(text):
    r"""Rewrite ``\[ \]`` as ``$$`` and ``\( \)`` as ``$`` in ``text``."""
    delimiter_map = (
        ('\\[', '$$'),
        ('\\]', '$$'),
        ('\\(', '$'),
        ('\\)', '$'),
    )
    for latex_delimiter, dollar_delimiter in delimiter_map:
        text = text.replace(latex_delimiter, dollar_delimiter)
    return text

### Preprocess

In [None]:
data = read_jsonl('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/train_80k.jsonl')
len(data)

In [None]:
rows = []
for sample in tqdm(data):
    rows.append(dict(problem=get_problem(sample), solution=format_answer(sample), answer=get_result(sample)))
df = pd.DataFrame(rows)

In [None]:
df.head()

In [None]:
df.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/train_80k.csv', index=False)

In [None]:
# Keep only the samples whose final response is a non-negative integer.
df = df.assign(answer=df.answer.apply(text_to_int_answer))
# .copy() so the dtype conversion below is not assigned onto a filtered slice
# (avoids SettingWithCopyWarning).
df = df[df.answer.notna()].copy()
df['answer'] = df.answer.astype(int)
print(len(df))
df.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/train_80k_int_answers.csv', index=False)

Let's try to associate this with the MATH dataset.

In [None]:
math_train = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_train.csv')
math_train.drop_duplicates('problem', inplace=True)
math_test = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_test.csv')
math_train.head()

In [None]:
df_subset = df[df.problem.isin(set(math_test.problem.values))]
len(df_subset), len(df_subset.problem.unique()), len(math_test)

One problem from the test set is present in the dataset, which is weird.

In [None]:
df_subset = df[df.problem.isin(set(math_train.problem.values))]
len(df_subset), len(df_subset.problem.unique()), len(math_train)

We have solutions for 3659/4354 train math problems. On average we have around 3 solutions for each problem.

In [None]:
# Samples whose problem text also appears in the filtered MATH train split.
# .copy() because new columns are assigned to df_math in the following cells;
# without it they would be written onto a slice of df (SettingWithCopyWarning).
df_math = df[df.problem.isin(set(math_train.problem.values))].copy()

In [None]:
# Index math_train by problem text so rows can be looked up with .loc[problem].
# The guard keeps the cell re-runnable: once 'problem' is the index it is no
# longer a column, and a second set_index('problem') would raise KeyError.
if math_train.index.name != 'problem':
    math_train = math_train.set_index('problem')
math_train.head()

In [None]:
# Copy the MATH metadata onto every sample, matching rows by problem text
# (math_train is indexed by problem at this point).
# The lookup does not depend on the column, so do it once instead of per key.
math_rows = math_train.loc[df_math.problem.values]
for key in ['level', 'type', 'source', 'stage']:
    df_math[key] = math_rows[key].values
df_math.head()

In [None]:
df_math['input_tokens'] = df_math.problem.apply(lambda x: len(tokenizer.tokenize(x)))
df_math['output_tokens'] = df_math.solution.apply(lambda x: len(tokenizer.tokenize(x)))
df_math['total_tokens'] = df_math['input_tokens'] + df_math['output_tokens']

In [None]:
df_math['id'] = df_math.index
df_math.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATH.csv')

Let's remove the duplicates, keeping the one with the shortest answer.

In [None]:
df_math.sort_values('total_tokens', inplace=True)

In [None]:
plt.hist(df_math.total_tokens.values, bins=np.linspace(0, 5000, 100));

In [None]:
df_math.drop_duplicates('problem', inplace=True)
plt.hist(df_math.total_tokens.values, bins=np.linspace(0, 5000, 100));

In [None]:
df_math.to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATH_no_duplicates.csv')

In [None]:
df_math[df_math.total_tokens < 1000].to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATH_no_duplicates_less1000tokens.csv')

In [None]:
df_math[df_math.total_tokens < 1000].to_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATHCodeInstruct_curated.csv')

### Exploration

In [None]:
df = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATH_no_duplicates_less1000tokens.csv')
df.sort_values(['level', 'type'], inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.shape)

In [None]:
df.head()

In [None]:
df.level.value_counts()

In [None]:
df.type.value_counts()

In [None]:
# Cumulative token-length distributions: one figure per token column, one curve
# per difficulty level.
bins = np.arange(0, 2000)
# NOTE(review): the last level in sort order is left out of the plots; the
# reason is not visible in this cell. Confirm it is intended.
plotted_levels = sorted(df.level.unique())[:-1]
x_upper_limit = {'input_tokens': 200, 'output_tokens': 1000}
for key in x_upper_limit:
    for level in plotted_levels:
        level_rows = df[df.level == level]
        plt.hist(level_rows[key], bins=bins, alpha=0.5, label=level, density=True, cumulative=1)
    plt.xlim(0, x_upper_limit[key])
    plt.ylim(0, 1)
    plt.grid()
    plt.legend(loc=0)
    plt.title(f'{key} distribution')
    plt.xlabel('Tokens')
    plt.ylabel('Cumulative probability')
    plt.show()

In [None]:
render_problem(3220)

- There are many responses for each problem
- Some problems do not have an answer (maybe the difficult ones?)

- [ ] Are there problems from the test set?
- [ ] Is my method missing some problems?

## TODO

- [x] Focus on problems that have integer and non-negative answers
- [x] What is the distribution of output and input tokens?
- [x] What if I create smaller versions of the test set?
- [x] I might also filter long problems?