# Math Question Answer Verification Competition

## Built off provided Starter Code

Completed by: **The Dalai Llamas**

- Jayanth Rao - jr6594
- Adel del Valle - ad7082
- Joshua Alfred Jayapal - jj3811

Borrowed from [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR)

In [None]:
# %%capture
# This cell will take time
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [22]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [40]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Load model and wrap with LoRA adapters

In [41]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

## Competition dataset

In [None]:
# download and load competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

Shuffle the training data and create a validation set of 10000 rows -- equivalent to test set

In [None]:
from datasets import DatasetDict, concatenate_datasets

shuffled = dataset['train'].shuffle(seed=3407)
train_data = shuffled.select(range(990000))
validation_data = shuffled.select(range(990000, 1000000))

reformatted_data = DatasetDict({
    'train': train_data,
    'validation': validation_data,
    'test': dataset['test']
})
# validation_set = dataset['test']

In [None]:
reformatted_data['validation']

Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 10000
})

In [36]:
reshuffled_train_data = reformatted_data['train'].shuffle(seed=3407).shuffle(seed=3407)

We changed the prompt to use the provided explanation for solving the problem in the dataset.

In [37]:
simplified_prompt = """You are an extremely intelligent mathematician. Find the answer to the question provided and evaluate the answer based on the given explanation. If the the answer is correct based on the explanation, respond with 'True', otherwise respond with 'False'.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    question = examples["question"]
    ans       = examples["answer"]
    explanation = examples["solution"]
    output      = examples["is_correct"]
    texts = []
    for instruction, input, exp, output in zip(question, ans, explanation, output):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = simplified_prompt.format(instruction, input, exp, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }




In [38]:
# Process the training dataset and generate prompt for each datapoint

# train_dataset = dataset['train'].map(formatting_prompts_func, batched = True,)

train_dataset = reshuffled_train_data.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/990000 [00:00<?, ? examples/s]

## SFT

In [44]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 32,
        warmup_steps = 25,
        max_steps = 250,
        learning_rate = 1.2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    )

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)

Map (num_proc=4):   0%|          | 0/990000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [45]:
torch.cuda.empty_cache()

In [None]:
trainer_stats = trainer.train()

## inference

In [None]:
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")

In [None]:
# Running inference on single test
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
input_prompt = simplified_prompt.format(
        sample_ques, # ques
        sample_ans, # given answer,
        sample_exp, # explanation
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )

# print("Input Promt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # 1 because of batch
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode([outputs, skip_special_tokens=True)

response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

['False\n']

In [47]:
import csv
from tqdm import tqdm
import torch

# Set model to inference mode for efficiency
FastLanguageModel.for_inference(model)

correct_predictions = 0
total_examples = len(reformatted_data['validation'])
results = []

batch_size = 32

for start in tqdm(range(0, total_examples, batch_size)):
    end = min(start + batch_size, total_examples)
    batch_questions = reformatted_data['validation']['question'][start:end]
    batch_answers = reformatted_data['validation']['answer'][start:end]
    batch_solutions = reformatted_data['validation']['solution'][start:end]
    batch_labels = ["True" if label else "False" for label in reformatted_data['validation']['is_correct'][start:end]]

    batch_prompts = [
        simplified_prompt.format(q, a, s, "")
        for q, a, s in zip(batch_questions, batch_answers, batch_solutions)
    ]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")

    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    responses = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)

    for i, response in enumerate(responses):
        true_label = batch_labels[i]
        predicted_label = "True" if "True" in response else "False"
        is_correct = predicted_label == true_label

        if is_correct:
            correct_predictions += 1

        results.append((start + i, predicted_label))

    torch.cuda.empty_cache()

# Calculate accuracy
accuracy = correct_predictions / total_examples
print(f"Model Accuracy: {accuracy:.2%}")


100%|██████████| 313/313 [12:39<00:00,  2.43s/it]

Model Accuracy: 88.11%





In [48]:
import csv
from tqdm import tqdm
import torch

# Set model to inference mode for efficiency
FastLanguageModel.for_inference(model)

total_examples = len(reformatted_data['test'])
results = []

batch_size = 32

for start in tqdm(range(0, total_examples, batch_size)):
    end = min(start + batch_size, total_examples)
    batch_questions = reformatted_data['test']['question'][start:end]
    batch_answers = reformatted_data['test']['answer'][start:end]
    batch_solutions = reformatted_data['test']['solution'][start:end]
    batch_labels = ["True" if label else "False" for label in reformatted_data['test']['is_correct'][start:end]]

    batch_prompts = [
        simplified_prompt.format(q, a, s, "")
        for q, a, s in zip(batch_questions, batch_answers, batch_solutions)
    ]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")

    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    responses = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)

    for i, response in enumerate(responses):
        true_label = batch_labels[i]
        predicted_label = "True" if "True" in response else "False"
        is_correct = predicted_label == true_label

        results.append((start + i, predicted_label))

    torch.cuda.empty_cache()

# Write results to CSV once at the end
with open("results.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "is_correct"])
    writer.writerows(results)

100%|██████████| 313/313 [10:12<00:00,  1.96s/it]


## saving model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [49]:
import shutil

# Move the results.csv file to your Google Drive
destination_path = "/content/drive/My Drive/Colab Notebooks/results.csv"
shutil.move("results.csv", destination_path)

'/content/drive/My Drive/Colab Notebooks/results.csv'

In [34]:
model.save_pretrained("lora_model_0.84510_sub9") # Local saving
tokenizer.save_pretrained("lora_model_0.84510_sub9")

('lora_model_0.84510_sub9/tokenizer_config.json',
 'lora_model_0.84510_sub9/special_tokens_map.json',
 'lora_model_0.84510_sub9/tokenizer.json')

In [35]:
!zip -r /content/lora_model_0.84510_sub9.zip /content/lora_model_0.84510_sub9

  adding: content/lora_model_0.84510_sub9/ (stored 0%)
  adding: content/lora_model_0.84510_sub9/tokenizer.json (deflated 85%)
  adding: content/lora_model_0.84510_sub9/special_tokens_map.json (deflated 71%)
  adding: content/lora_model_0.84510_sub9/adapter_model.safetensors (deflated 8%)
  adding: content/lora_model_0.84510_sub9/adapter_config.json (deflated 54%)
  adding: content/lora_model_0.84510_sub9/tokenizer_config.json (deflated 96%)
  adding: content/lora_model_0.84510_sub9/README.md (deflated 66%)


In [None]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
