In [None]:
import pandas as pd


# Location of the raw reasoning-trace CSV; `path` is reused later to derive the Hub repo name.
path = "reasoning_traces/aime_1983_2023_qwq-32b_traces_32768.csv"

# Read the traces and order them by year, part and problem number.
# The original row labels are kept (no index reset).
df = pd.read_csv(path).sort_values(by=["Year", "Part", "Problem Number"])
df

In [None]:
from transformers import AutoTokenizer

# Load both tokenizers in one pass, in the same order as before:
#   qwq_tokenizer  -> used by apply_r1_template
#   qwen_tokenizer -> used by count_tokens and apply_s1_template
qwq_tokenizer, qwen_tokenizer = (
    AutoTokenizer.from_pretrained(model_id)
    for model_id in ("Qwen/QwQ-32b", "Qwen/Qwen2.5-1.5B-Instruct")
)


# Compute the number of tokens per row
def count_tokens(text):
    """Return how many token ids ``qwen_tokenizer.encode`` produces for ``text``.

    Args:
        text: The string to encode.

    Returns:
        The length of the encoded sequence, exactly as returned by ``encode``.
    """
    token_ids = qwen_tokenizer.encode(text)
    return len(token_ids)


def apply_s1_template(question, reasoning, attempt):
    """Render one example as chat text using the s1 think/answer markers.

    The user turn is the stripped question followed by a fixed "reason step by
    step / \\boxed{}" instruction. The assistant turn places the stripped
    reasoning after ``<|im_start|>think`` and the stripped attempt after
    ``<|im_start|>answer``. Rendering goes through ``qwen_tokenizer``'s chat
    template.

    Args:
        question: Problem statement.
        reasoning: Reasoning trace for the assistant turn.
        attempt: Final solution text for the assistant turn.

    Returns:
        Whatever ``apply_chat_template`` returns with ``tokenize=False``.
    """
    instruction = (
        "\n\nPlease reason step by step, and put your final answer within \\boxed{}."
    )
    # The outer strip matters when the question is empty: it removes the
    # instruction's leading blank lines.
    user_content = (question.strip() + instruction).strip()
    assistant_content = (
        f"<|im_start|>think\n{reasoning.strip()}"
        f"\n<|im_start|>answer\n{attempt.strip()}"
    )

    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content},
    ]
    return qwen_tokenizer.apply_chat_template(messages, tokenize=False)


def apply_r1_template(question, reasoning, attempt):
    """Render one example as chat text using ``<think>...</think>`` tags.

    The user turn is the stripped question followed by a fixed "reason step by
    step / \\boxed{}" instruction. The assistant turn wraps the stripped
    reasoning in ``<think>`` / ``</think>`` and appends the stripped attempt.
    Rendering goes through ``qwq_tokenizer``'s chat template.

    Args:
        question: Problem statement.
        reasoning: Reasoning trace placed inside the think tags.
        attempt: Final solution text placed after the closing tag.

    Returns:
        Whatever ``apply_chat_template`` returns with ``tokenize=False``.
    """
    prompt = question.strip()
    prompt = prompt + (
        "\n\nPlease reason step by step, and put your final answer within \\boxed{}."
    )
    # Second strip is only observable for an empty question.
    user_turn = {"role": "user", "content": prompt.strip()}

    pieces = ["<think>\n", reasoning.strip(), "\n</think>\n", attempt.strip()]
    assistant_turn = {"role": "assistant", "content": "".join(pieces)}

    return qwq_tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )


# Render every row through the s1 template. The three input columns are
# selected up front so each row unpacks directly into
# (question, reasoning, attempt).
df["templated_response"] = df[["Question", "Reasoning", "Solution Attempt"]].apply(
    lambda fields: apply_s1_template(*fields),
    axis=1,
)

# Token length of each rendered example, measured by count_tokens.
df["Token Count"] = df["templated_response"].map(count_tokens)

df

In [None]:
# Show the rendered text of the first row (by position) as a sanity check.
print(df["templated_response"].iloc[0])

In [None]:
# Summarise and plot the distribution of per-row token counts, then report how
# many rows are long, incorrect, or missing a boxed answer.
import matplotlib.pyplot as plt
import seaborn as sns

token_counts = df["Token Count"]
total_rows = len(df)
mean_tokens = token_counts.mean()
median_tokens = token_counts.median()


def _percent_of_rows(row_count):
    """Return row_count as a percentage of all rows (0.0 for an empty frame)."""
    return (row_count / total_rows) * 100 if total_rows else 0.0


# Display some statistics about token counts
print("\nToken count statistics:")
print(f"Min: {token_counts.min()}")
print(f"Max: {token_counts.max()}")
print(f"Mean: {mean_tokens:.2f}")
print(f"Median: {median_tokens:.2f}")


fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(token_counts, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Token Count")
ax.set_xlabel("Token Count")
ax.set_ylabel("Frequency")
ax.axvline(
    mean_tokens,
    color="r",
    linestyle="--",
    label=f"Mean: {mean_tokens:.3f}",
)
ax.axvline(
    median_tokens,
    color="g",
    linestyle="--",
    label=f"Median: {median_tokens:.3f}",
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


# Rows at or above half of 32768 tokens. The printed label now states the
# threshold that is actually applied (it previously claimed "> 32768").
# NOTE(review): the filtering cell uses the full 32768 limit; confirm that the
# half-limit here is intentional.
token_limit = 32768 // 2
count_exceeding_limit = int((token_counts >= token_limit).sum())
print(f"Number of rows with Token Count >= {token_limit}: {count_exceeding_limit}")

# Calculate percentage of rows exceeding the token limit
percentage_exceeding = _percent_of_rows(count_exceeding_limit)
print(f"Percentage of rows exceeding token limit: {percentage_exceeding:.2f}%")


count_incorrect = int((df["Correct"] == False).sum())
print(f"Number of incorrect rows: {count_incorrect}")

percentage_incorrect = _percent_of_rows(count_incorrect)
print(f"Percentage of incorrect rows: {percentage_incorrect:.2f}%")


# na=False: a missing "Solution Attempt" counts as "no boxed answer" instead of
# producing NaN, which the `~` negation cannot handle.
has_boxed_answer = df["Solution Attempt"].str.contains("boxed", na=False)
count_missing_box = int((~has_boxed_answer).sum())
print(f"Number of missing box rows: {count_missing_box}")

percentage_missing_box = _percent_of_rows(count_missing_box)
print(f"Percentage of missing box rows: {percentage_missing_box:.2f}%")


In [None]:
# Keep only rows that are under the token limit, marked correct, and contain a
# boxed answer in the solution attempt.
# na=False treats a missing "Solution Attempt" as "no boxed answer"; without it
# the mask would contain NaN and boolean indexing would raise.
keep_mask = (
    (df["Token Count"] < 32768)
    & (df["Correct"] == True)
    & df["Solution Attempt"].str.contains("boxed", na=False)
)
df = df[keep_mask]
df

In [None]:
# Push the dataframe to the Hugging Face Hub in the train split
from huggingface_hub import HfApi
import os
from tempfile import NamedTemporaryFile  # no longer used in this cell; import kept in case other cells rely on it
from datasets import Dataset, DatasetDict

# Convert the pandas DataFrame to a Hugging Face Dataset (row index dropped)
# and wrap it in a DatasetDict with a single "train" split.
hf_dataset = Dataset.from_pandas(df, preserve_index=False)
dataset_dict = DatasetDict({"train": hf_dataset})

# The previous version also wrote `df` to a temporary CSV and deleted it after
# the push. Nothing read that file, it leaked whenever the push raised, and it
# was re-opened by name while still open. That round-trip has been removed.

# Hugging Face API client. It is not used by the push below; it is retained in
# case later cells rely on `api`.
api = HfApi()

# API token from the environment; this is None when the variable is unset.
# NOTE(review): presumably push_to_hub then falls back to locally stored
# credentials - confirm.
token = os.environ.get("HUGGINGFACE_CLI")

# Repository name is the CSV file name without its ".csv" suffix.
repo_id = "jonathanyin/" + path.split("/")[-1].split(".csv")[0]
repo_type = "dataset"  # not passed to push_to_hub; retained for later cells

# Push the dataset to the Hub
dataset_dict.push_to_hub(repo_id=repo_id, token=token)

print(f"Successfully pushed dataframe to {repo_id} in the train split")
