# Math Question Answer Verification Competition

## Starter Code

Borrowed from the [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR).

In [None]:
# %%capture
# This cell will take time
!pip install unsloth



In [None]:
from unsloth import FastLanguageModel
import torch
# Loading settings reused by the model-loading cells and the SFT trainer below.
max_seq_length = 2048 # Maximum sequence length; choose any value
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [None]:
# Load the "unsloth/llama-3-8b" checkpoint and its tokenizer with the settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Load model and wrap with LoRA adapters

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Projection layers that receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # Enable rank stabilized LoRA
    loftq_config = None, # LoftQ is not used
)

## Competition dataset

In [None]:
# Download and load the competition dataset.

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# Display the available splits and their columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [None]:
# Prompt template shared by training and inference. It has three positional
# placeholders, filled in this order: question, candidate answer, expected
# output (passed as "" at inference time so the model generates the verdict).
# NOTE(review): the "### Explanation" section has no placeholder, so the
# dataset's `solution` column is never shown to the model even though the
# instructions mention an explanation. Adding a placeholder would require
# updating every `prompt.format(...)` call in the notebook.
prompt = """You are a skilled mathematician responsible for determining the correctness of answers to math questions. Carefully analyze the question, answer, and explanation provided. Respond with 'True' if the answer is correct, otherwise 'False'.
Here are a few examples:

Question: What is the sum of 2 and 3?
Answer: 5
Output: True

Question: What is the product of 4 and 5?
Answer: 25
Output: False

Question: What is the difference between 10 and 3?
Answer: 6
Output: False


### Question:
{}

### Answer:
{}

### Explanation

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render every example of a batch into a complete training prompt.

    Args:
        examples: batch mapping that provides the "question", "answer" and
            "is_correct" columns.

    Returns:
        A dict with a single "text" key holding one rendered prompt per
        example, each terminated by EOS_TOKEN.
    """
    rows = zip(examples["question"], examples["answer"], examples["is_correct"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        prompt.format(question, answer, label) + EOS_TOKEN
        for question, answer, label in rows
    ]
    return { "text" : rendered, }




In [None]:
# Build a class-balanced training subset and render a prompt for every row.
from datasets import concatenate_datasets, Dataset
import random

train_dataset = dataset['train']
test_dataset = dataset['test']

# Number of rows kept for each label.
target_size_per_class = 20000

# Split the training rows by label.
true_samples = train_dataset.filter(lambda row: row["is_correct"] == True)
false_samples = train_dataset.filter(lambda row: row["is_correct"] == False)

# Shuffle each class with a fixed seed and keep the first `target_size_per_class` rows.
keep_indices = range(target_size_per_class)
true_samples_subset = true_samples.shuffle(seed=42).select(keep_indices)
false_samples_subset = false_samples.shuffle(seed=42).select(keep_indices)

# Merge both classes and shuffle again so the labels are interleaved.
balanced_train_dataset = concatenate_datasets(
    [true_samples_subset, false_samples_subset]
).shuffle(seed=42)

# Report the class balance.
total_count = len(balanced_train_dataset)
true_count = sum(balanced_train_dataset['is_correct'])
print(f"Total samples: {total_count}")
print(f"True samples: {true_count}")
print(f"False samples: {total_count - true_count}")

# Add the rendered "text" column used for fine-tuning.
balanced_train_dataset = balanced_train_dataset.map(formatting_prompts_func, batched=True)

print(balanced_train_dataset['text'][:5])


Total samples: 40000
True samples: 20000
False samples: 20000
["You are a skilled mathematician responsible for determining the correctness of answers to math questions. Carefully analyze the question, answer, and explanation provided. Respond with 'True' if the answer is correct, otherwise 'False'.\nHere are a few examples:\n\nQuestion: What is the sum of 2 and 3?\nAnswer: 5\nOutput: True\n\nQuestion: What is the product of 4 and 5?\nAnswer: 25\nOutput: False\n\nQuestion: What is the difference between 10 and 3?\nAnswer: 6\nOutput: False\n\n\n### Question:\nFind $\\left(\\frac{1}{2}\\right)^{8} \\cdot \\left(\\frac{3}{4}\\right)^{-3}$.\n\n### Answer:\n0.009259\n\n### Explainaition\n\n### Output:\nFalse<|end_of_text|>", "You are a skilled mathematician responsible for determining the correctness of answers to math questions. Carefully analyze the question, answer, and explanation provided. Respond with 'True' if the answer is correct, otherwise 'False'.\nHere are a few examples:\n\nQue

In [None]:
# From here on, `train_dataset` refers to the balanced, prompt-formatted subset.
train_dataset=balanced_train_dataset

In [None]:
# Show one formatted training example.
train_dataset['text'][0]

"You are a skilled mathematician responsible for determining the correctness of answers to math questions. Carefully analyze the question, answer, and explanation provided. Respond with 'True' if the answer is correct, otherwise 'False'.\nHere are a few examples:\n\nQuestion: What is the sum of 2 and 3?\nAnswer: 5\nOutput: True\n\nQuestion: What is the product of 4 and 5?\nAnswer: 25\nOutput: False\n\nQuestion: What is the difference between 10 and 3?\nAnswer: 6\nOutput: False\n\n\n### Question:\nFind $\\left(\\frac{1}{2}\\right)^{8} \\cdot \\left(\\frac{3}{4}\\right)^{-3}$.\n\n### Answer:\n0.009259\n\n### Explainaition\n\n### Output:\nFalse<|end_of_text|>"

In [None]:

# Display the training dataset's columns and row count.
train_dataset

Dataset({
    features: ['question', 'is_correct', 'answer', 'solution', 'text'],
    num_rows: 40000
})

In [None]:
# Print the training dataset's summary as plain text.
print(train_dataset)

Dataset({
    features: ['question', 'is_correct', 'answer', 'solution', 'text'],
    num_rows: 40000
})


## SFT

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import EarlyStoppingCallback, TrainingArguments
from unsloth import is_bfloat16_supported


# Hyperparameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,     # 8 x 8 = 64 sequences per optimizer step on one GPU
    warmup_steps=100,                  # warm-up steps for the learning-rate schedule
    max_steps=1000,                    # takes precedence over num_train_epochs
    learning_rate=2e-5,
    fp16=not is_bfloat16_supported(),  # fall back to fp16 only when bf16 is unavailable
    bf16=is_bfloat16_supported(),
    logging_steps=10,                  # log the training loss every 10 steps
    optim="adamw_8bit",
    weight_decay=0.005,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",                  # no external experiment tracker
)


# Train on the pre-rendered "text" column, with packing disabled.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    packing=False,
    args=training_args,
)


Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning and keep the value returned by `trainer.train()`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 8
\        /    Total batch size = 64 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss
10,1.6988
20,1.6188
30,1.3493
40,0.8763
50,0.5967
60,0.5264
70,0.5219
80,0.506
90,0.4785
100,0.4844


In [None]:
from sklearn.metrics import accuracy_score
import pandas as pd
import random


sample_size = 500

# Estimate accuracy on rows the adapter has NOT been trained on. Sampling from
# `train_dataset` (the fine-tuning subset) would score the model on examples it
# has already seen and overstate how well it will do on the real test split.
trained_pairs = set(zip(train_dataset['question'], train_dataset['answer']))
full_train_split = dataset['train']
# Oversample candidates so that enough remain after dropping trained rows.
candidate_count = min(len(full_train_split), 4 * sample_size)
candidates = full_train_split.shuffle(seed=42).select(range(candidate_count))
held_out = candidates.filter(
    lambda row: (row['question'], row['answer']) not in trained_pairs
)
# Note: this sample keeps the natural label distribution of the training split;
# it is not re-balanced like the fine-tuning subset.
pre_test_dataset = held_out.select(range(min(sample_size, len(held_out))))


FastLanguageModel.for_inference(model)


# One record per evaluated row: position, gold label, predicted label.
pre_test_results = []

pre_test_rows = zip(
    pre_test_dataset['question'],
    pre_test_dataset['answer'],
    pre_test_dataset['is_correct'],
)
for idx, (question, answer, true_label) in enumerate(pre_test_rows):
    # Leave the output slot empty so the model generates the verdict.
    input_prompt = prompt.format(question, answer, "")
    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")

    # Decode only the newly generated tokens, not the echoed prompt.
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)[0].strip()

    # Anything other than an exact (case-insensitive) "true" counts as False.
    predicted_label = response.lower() == 'true'

    pre_test_results.append({
        "ID": idx,
        "true_label": true_label,
        "predicted_label": predicted_label
    })


pre_test_results_df = pd.DataFrame(pre_test_results)


accuracy = accuracy_score(pre_test_results_df["true_label"], pre_test_results_df["predicted_label"])
print(f"Pre-test accuracy: {accuracy * 100:.2f}%")


Pre-test accuracy: 85.40%


## Inference

In [None]:
import pandas as pd
import os
from tqdm import tqdm

# Enable inference mode
FastLanguageModel.for_inference(model)

test_questions = test_dataset['question']
test_answers = test_dataset['answer']

# One {"ID", "is_correct"} record per test row, in dataset order.
results = []

progress = tqdm(
    enumerate(zip(test_questions, test_answers)),
    total=len(test_questions),
    desc="Processing Inference",
)
for idx, (question, answer) in progress:
    # Leave the output slot empty so the model generates the verdict.
    input_prompt = prompt.format(question, answer, "")
    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")

    # Generate, then keep only the tokens produced after the prompt.
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    generated_ids = outputs[0][input_token_len:]
    response = tokenizer.batch_decode([generated_ids], skip_special_tokens=True)[0].strip()

    # Anything other than an exact (case-insensitive) "true" counts as False.
    is_correct = (response.lower() == 'true')

    results.append({"ID": idx, "is_correct": is_correct})


# Write the predictions to disk and report where they were saved.
results_df = pd.DataFrame(results)
file_path = "submission.csv"
results_df.to_csv(file_path, index=False)

absolute_path = os.path.abspath(file_path)
print(f"Results saved to {absolute_path}")


Processing Inference: 100%|██████████| 10000/10000 [33:23<00:00,  4.99it/s]


Results saved to /content/submission.csv


## Saving the model

In [None]:
# Save the model and the tokenizer files locally under the "lora_model" directory.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
# Reload the model and tokenizer saved under "lora_model" and switch to inference mode.
# NOTE(review): the recorded run of this cell (on a Tesla T4) failed with a
# ValueError about modules being dispatched to the CPU or disk; presumably there
# was not enough free GPU memory. Confirm before relying on this cell.
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
import pandas as pd


# Read back the submission written by the inference cell. The relative path is
# the same one that cell saves to, so no absolute /content path is needed.
file_path = "submission.csv"


# Use a dedicated name: rebinding `test_dataset` here would replace the Hugging
# Face test split that the inference cell iterates over, and re-running that
# cell would then fail.
submission_df = pd.read_csv(file_path)


# Share of True / False predictions in the submission.
print("Distribution：")
print(submission_df['is_correct'].value_counts(normalize=True))
# Show only the first rows instead of the whole frame.
submission_df.head()


测试集类别分布：
is_correct
False    0.6072
True     0.3928
Name: proportion, dtype: float64
