In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
import re
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import random

# Step 1: Load Llama-3, 8B Model
max_seq_length = 2048  
load_in_4bit = True 
dtype = None  

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Adjust LoRA Parameters and Quantization strategies for better model efficiency and performance
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Adjusted for LoRA configuration, allowing for richer fine-tuning
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,  # Adjust lora_alpha for optimal performance
    lora_dropout=0.1,  # Introduce dropout to reduce overfitting
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,  # Enable rank-stabilized LoRA
    loftq_config=None
)

# Save the model after adjustment to avoid re-running the above process in case of errors
model.save_pretrained('outputs_adjusted')
tokenizer.save_pretrained('outputs_adjusted')

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.7 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


('outputs_adjusted/tokenizer_config.json',
 'outputs_adjusted/special_tokens_map.json',
 'outputs_adjusted/tokenizer.json')

In [4]:
# Step 2: Load and Preprocess Dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")

# Randomly sample 30K examples from the training dataset
train_dataset = dataset['train'].shuffle(seed=42).select(range(30000))  # Select 30K samples for training

# Use a prompt to Improve Model Understanding
prompt = """You are a great mathematician and your task is to determine if the answer to a given math question is correct or not. First, analyze the solution carefully, then decide if the answer is 'True' if correct, otherwise 'False'. Below is the Question, Answer, and Solution.

### Question:
{}

### Answer:
{}

### Solution:
{}

### Analysis:
Based on the solution provided above, your output should be:

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Add EOS token to ensure clear end of generated sequences

def formatting_prompts_func(examples):
    questions = examples["question"]
    answers = examples["answer"]
    solutions = examples["solution"]
    outputs = examples["is_correct"]
    texts = []
    for question, answer, solution, output in zip(questions, answers, solutions, outputs):
        text = prompt.format(question, answer, solution, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply prompt formatting to the training data to make use of the 'Solution' column
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

# Save the dataset after preprocessing to avoid repeating if errors occur
train_dataset.save_to_disk('train_dataset_preprocessed')

Saving the dataset (0/1 shards):   0%|          | 0/30000 [00:00<?, ? examples/s]

In [10]:
# Step 3: Set Training Arguments with Hyperparameter Tuning
training_args = TrainingArguments(
    per_device_train_batch_size=4,  
    gradient_accumulation_steps=2,  
    warmup_steps=50,  
    max_steps=1000,  
    learning_rate=2e-5,  
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=20, 
    optim="adamw_8bit",
    weight_decay=0.05, 
    lr_scheduler_type="cosine",  # Use a cosine scheduler for smoother learning rate decay
    seed=3407,
    output_dir="outputs",
    report_to="none"  # Remove additional evaluation and checkpoint parameters
)

In [11]:
# Step 4: Model Fine-Tuning
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    packing=False,  
    args=training_args
)

trainer.train()

# Save model weights after training to avoid loss of progress
model.save_pretrained('outputs_trained')
tokenizer.save_pretrained('outputs_trained')


Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
20,1.4048
40,0.8819
60,0.7342
80,0.6569
100,0.664
120,0.6729
140,0.6574
160,0.6302
180,0.629
200,0.631


('outputs_trained/tokenizer_config.json',
 'outputs_trained/special_tokens_map.json',
 'outputs_trained/tokenizer.json')

In [None]:
# Step 5: Inference
FastLanguageModel.for_inference(model)  # Enable fast inference before generating
is_correct_predictions = []
for i in tqdm(range(len(dataset['test']['question'])), desc="Inference Progress"):
    sample_ques = dataset['test']['question'][i]
    sample_ans = dataset['test']['answer'][i]
    sample_solution = dataset['test']['solution'][i]

    # Construct input prompt
    input_prompt = prompt.format(
        sample_ques,  # question
        sample_ans,  # answer
        sample_solution,  # solution
        ""  # leave output blank for model to generate
    )

    inputs = tokenizer([input_prompt], return_tensors="pt").to("cuda")
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1]

    # Generate model output
    outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)[0]

    # Extract True/False from response using regex
    match = re.search(r'\b(True|False)\b', response, re.IGNORECASE)
    if match:
        is_correct = match.group(0).lower() == 'true'
    else:
        is_correct = False  # Default if no valid output is found

    is_correct_predictions.append(is_correct)

# Save intermediate inference results to avoid loss in case of interruptions
pd.DataFrame({
    'ID': list(range(len(is_correct_predictions))),
    'is_correct': is_correct_predictions
}).to_csv('inference_results_partial.csv', index=False)


In [None]:
# Step 6: Generate CSV for Submission
submission_df = pd.DataFrame({
    'ID': list(range(len(is_correct_predictions))),
    'is_correct': is_correct_predictions
})

submission_df.to_csv('submission.csv', index=False)
print("CSV file 'submission.csv' has been created successfully.")