In [None]:
%%capture
# %%capture must stay on the first line of the cell; it hides the pip output below.

# Step 1: install the released Unsloth package (pip also installs its dependencies).
!pip install unsloth

# Step 2: remove that copy and reinstall Unsloth directly from the GitHub repository
# with the "colab-new" extra; --no-cache-dir makes pip download it afresh.
# NOTE(review): nothing here is version-pinned, so the installed code can differ between runs.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel       # For loading and using language models.
import torch                                # PyTorch library for ML tasks.
from sklearn.metrics import accuracy_score  # To calculate accuracy.

# Sequence-length cap handed to the model loader and to the trainer.
max_seq_length = 2048
# None lets the loader pick the dtype automatically; set it explicitly for specific GPUs if needed.
dtype = None
# Load the weights 4-bit quantized to reduce memory use.
load_in_4bit = True

Importing the model

In [None]:
# Load the pre-trained Llama-3.1-8B base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",   # Hub id of the base checkpoint.
    max_seq_length=max_seq_length,            # Maximum sequence length.
    dtype=dtype,                              # None = auto-detect the compute dtype.
    load_in_4bit=load_in_4bit,                # Load 4-bit quantized weights when True.
)

# Pad on the left side of each sequence.
tokenizer.padding_side = "left"

# No explicit model.to("cuda") here. The model is loaded 4-bit quantized
# (load_in_4bit=True), and transformers rejects or restricts .to() on
# bitsandbytes-quantized models depending on the installed versions.
# NOTE(review): the loader is expected to have placed the weights on the GPU
# already — confirm with `model.device` if in doubt.

In [None]:
# Wrap the base model with LoRA adapters (parameter-efficient fine-tuning).
model = FastLanguageModel.get_peft_model(
    model,
    # Projection layers that receive LoRA adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Adapter rank and scaling factor (both 128 here).
    r=128,
    lora_alpha=128,
    # No dropout on the LoRA layers and no bias terms trained.
    lora_dropout=0,
    bias="none",
    # Unsloth's gradient-checkpointing mode, used to save memory.
    use_gradient_checkpointing="unsloth",
    # Fixed seed for reproducibility.
    random_state=3407,
    # Rank-stabilized LoRA and LoftQ are both left off.
    use_rslora=False,
    loftq_config=None,
)


Importing the dataset

In [None]:
# Load the competition dataset by its Hugging Face repository id.
from datasets import load_dataset

dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")  # Later cells read its 'train' and 'test' splits.

# Bare last expression: the notebook displays the dataset's structure.
dataset

In [None]:
# Prompt template: one fixed worked example followed by the problem to evaluate.
# The four `{}` placeholders are filled positionally, in this order:
#   1. problem   2. solution   3. answer   4. output (the True/False verdict).
# Every caller of prompt.format(...) must pass its arguments in that order.
prompt = """### Example:

Problem:
Find the value of y if 4y - 7 = 9.

Solution:
Adding 7 to both sides, we get 4y = 16. Then, dividing by 4, we find y = 4.

Answer:
4

Output:
True

Now, evaluate the following problem:

### Problem:
{}

### Solution:
{}

### Answer:
{}

### Output:
{}"""

# End-of-sequence token string taken from the tokenizer; it is appended to every training text.
EOS_TOKEN = tokenizer.eos_token

# Batched formatter used with Dataset.map(..., batched=True).
def formatting_prompts_func(examples):
    """Render a batch of examples into training texts.

    Args:
        examples: Mapping of column name to a list of values; the columns
            "question", "solution", "answer" and "is_correct" are read.

    Returns:
        A dict with a single key "text" holding one formatted string per
        example, each ending with EOS_TOKEN.
    """
    columns = zip(
        examples["question"],
        examples["solution"],
        examples["answer"],
        examples["is_correct"],
    )
    # Placeholder order is problem, solution, answer, output; the trailing EOS
    # token marks where generation should stop.
    return {
        "text": [
            prompt.format(problem, worked_solution, given_answer, verdict) + EOS_TOKEN
            for problem, worked_solution, given_answer, verdict in columns
        ]
    }

Train:Valid Split

In [None]:
from sklearn.metrics import accuracy_score  # To evaluate model performance.

# Shuffle the training split with a fixed seed and keep the first 110,000 rows.
sampled_data = dataset["train"].shuffle(seed=3407).select(range(110000))

# Hold out 10,000 of those rows for validation; the other 100,000 are used for training.
split_data = sampled_data.train_test_split(test_size=10000, seed=3407)
train_data, val_dataset = split_data["train"], split_data["test"]

# Add the formatted "text" column to the training rows.
train_dataset = train_data.map(formatting_prompts_func, batched=True)


In [None]:
# Show one formatted training example. Index the row first: train_dataset["text"][0]
# reads the whole "text" column just to return a single string.
train_dataset[0]["text"]

Supervised Fine-Tuning

In [None]:
from trl import SFTTrainer                  # Importing the trainer class for Supervised Fine-Tuning.
from transformers import TrainingArguments  # Importing the class to define training configurations.
from unsloth import is_bfloat16_supported   # Check if bfloat16 is supported for training.

# Training hyper-parameters.
training_args = TrainingArguments(
    # Where checkpoints go, and how often metrics are logged (no external reporting).
    output_dir="outputs",
    report_to="none",
    logging_steps=1,
    # Schedule: 5 warm-up steps, then linear decay, stopping after 5000 steps.
    max_steps=5000,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # 2 sequences per device x 4 accumulation steps = 8 sequences per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # 8-bit AdamW with a small weight decay.
    optim="adamw_8bit",
    learning_rate=2e-5,
    weight_decay=0.001,
    # Mixed precision: bf16 when the hardware supports it, otherwise fp16.
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    # Fixed seed for reproducibility.
    seed=3407,
)

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="text",        # Column that holds the formatted prompts.
    dataset_num_proc=4,               # Worker processes for dataset preparation.
    max_seq_length=max_seq_length,
    packing=False,                    # Sequence packing is turned off.
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()


Saving the model

In [None]:
# Write the fine-tuned model and its tokenizer to the local "lora_model" directory.
# NOTE(review): `model` is the PEFT-wrapped model from get_peft_model, so this presumably
# stores only the LoRA adapter weights rather than a merged checkpoint — confirm.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

In [None]:
# Recursively compress the "lora_model" directory into lora_model.zip in the current working directory.
!zip -r lora_model.zip lora_model


In [None]:
# Colab-only helper: google.colab is imported here, so this cell needs a Colab runtime.
from google.colab import files
# Start a browser download of the archive created by the zip cell.
files.download("lora_model.zip")

# Inference

In [None]:
from unsloth import FastLanguageModel   # Import the FastLanguageModel class from the unsloth library.
import torch                            # Import PyTorch for tensor operations and model management.

# Sequence-length cap passed to the model loader (same value as used for training).
max_seq_length = 2048
# None lets the loader pick the dtype automatically.
dtype = None
# Load the weights 4-bit quantized to reduce memory use; set to False if not required.
load_in_4bit = True

In [None]:
import zipfile  # Import the zipfile module to handle ZIP file extraction.

# Archive to unpack and the directory that receives its contents.
zip_file_path = "/content/lora_model.zip"
extracted_folder = "/content"

# Unpack every member of the archive; the with-block closes the file afterwards.
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extracted_folder)

# Confirm where the files were written.
print(f"Model extracted to: {extracted_folder}")


In [None]:
# Reload the fine-tuned model and tokenizer from the local "lora_model" directory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# The inference loop pads whole batches and slices the generated tokens off the
# end of each sequence, which only works with left padding. Set it explicitly
# on the reloaded tokenizer, as is done for the training-time tokenizer, instead
# of relying on whatever the saved tokenizer config restores.
tokenizer.padding_side = "left"


In [None]:
# Inference settings.
batch_size = 32                               # Lower this if the GPU runs out of memory.
test_dataset = dataset['test']                # Rows to predict on.
num_samples = len(test_dataset['question'])   # Number of test rows.
num_batches = (num_samples + batch_size - 1) // batch_size   # Ceiling division, for the progress message.

# Build every prompt up front. The template's placeholders are, in order,
# problem, solution, answer, output — the same order the training texts use,
# so the solution must come before the answer here. The output slot is left
# empty so that the model generates the True/False verdict.
input_prompts = [
    prompt.format(ques, sol, ans, "")
    for ques, ans, sol in zip(test_dataset['question'], test_dataset['answer'], test_dataset['solution'])
]

# Generated strings, one per test row, collected in dataset order.
responses = []
# NOTE(review): this assumes the test split's 'is_correct' column holds real boolean
# labels. If it is only a placeholder, score the model on val_dataset (the held-out
# rows from the train/validation split) instead — confirm.
true_labels = test_dataset['is_correct']

for i in range(0, num_samples, batch_size):
    batch_prompts = input_prompts[i:i + batch_size]

    # Tokenize the batch with padding and move the tensors to the GPU.
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")
    # Padded prompt length; everything after this index is newly generated.
    input_shape = inputs['input_ids'].shape[1]

    # The verdict is a single short word, so a few new tokens are enough.
    outputs = model.generate(**inputs, max_new_tokens=5, use_cache=True)

    # Decode only the generated tail of each sequence, dropping special tokens.
    batch_responses = tokenizer.batch_decode(
        [output[input_shape:] for output in outputs], skip_special_tokens=True
    )
    responses.extend(batch_responses)

    print(f"Processed batch {i // batch_size + 1}/{num_batches}")

print("Inference completed for all samples.")

# At this point `responses` holds one generated string per test row and
# `true_labels` holds the corresponding 'is_correct' values.

In [None]:
import pandas as pd  # Import pandas for DataFrame manipulation

# Map each generated string to a boolean verdict. `responses` is a flat list of
# decoded strings, one per test row. A response counts as True when it starts
# with "True" after trimming whitespace, so trailing text generated after the
# verdict does not flip the prediction to False.
predictions = [resp.strip().startswith('True') for resp in responses]

# 1) Accuracy: fraction of predictions equal to the corresponding entry of true_labels.
#    An empty prediction list is reported as 0.0 instead of dividing by zero.
accuracy = (
    sum(pred == label for pred, label in zip(predictions, true_labels)) / len(predictions)
    if predictions
    else 0.0
)
print(f"Accuracy: {accuracy:.2f}")      # Printed to 2 decimal places.

# 2) Submission table: a running ID per row plus the predicted verdict.
df_output = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Write the table to CSV without the DataFrame index.
csv_path = 'predictions2.csv'
df_output.to_csv(csv_path, index=False)

print(f"CSV file saved as '{csv_path}' in the current directory.")
