# Compare Base vs Baseline vs Optimized on FineTome

Evaluate 3 models on FineTome test set (response-only, matching training setup):
1. Base Llama 3.2 1B (no fine-tuning, random LoRA)
2. Baseline fine-tuned (r=16, lr=2e-4, no hyperparameter tuning)
3. Optimized fine-tuned (best from grid search)

In [3]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl peft accelerate bitsandbytes

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load and Prepare FineTome Test Dataset

In [4]:
from torch.utils.data import DataLoader
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer
import torch
import gc

max_seq_length = 2048
dtype = None
load_in_4bit = True

print("Loading FineTome-100k dataset...")
dataset = load_dataset("mlabonne/FineTome-100k", split="train")
print(f"Full dataset: {len(dataset)} examples")

# Standardize ShareGPT format
dataset = standardize_sharegpt(dataset)

# Same split as training (80/10/10 with seed=42)
train_test = dataset.train_test_split(test_size=0.2, seed=42)
temp = train_test['test']
val_test = temp.train_test_split(test_size=0.5, seed=42)
test_dataset_raw = val_test['test']

print(f"Test set: {len(test_dataset_raw)} examples")

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Loading FineTome-100k dataset...


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Full dataset: 100000 examples


Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

Test set: 10000 examples


## Evaluation Function

In [5]:
def evaluate_model(model, tokenizer, test_data, model_name):
    """
    Evaluate model on test set.
    Masks instruction tokens, only computes loss on assistant (gpt) responses.
    """
    print(f"\n{'='*60}")
    print(f"EVALUATING: {model_name}")
    print(f"{'='*60}")

    model.eval()

    # Format with chat template
    def formatting_func(examples):
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
            for convo in convos
        ]
        return {"text": texts}

    test_formatted = test_data.map(formatting_func, batched=True)

    # Tokenize
    def tokenize_func(examples):
        result = tokenizer(examples["text"], truncation=True, max_length=max_seq_length)
        result["labels"] = result["input_ids"].copy()
        return result

    test_tokenized = test_formatted.map(tokenize_func, batched=True, remove_columns=test_formatted.column_names)

    # Create DataLoader
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
    dataloader = DataLoader(test_tokenized, batch_size=4, collate_fn=data_collator)

    # Find response start token for masking
    response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_token_ids = tokenizer.encode(response_template, add_special_tokens=False)

    total_loss = 0
    total_response_tokens = 0
    num_batches = 0

    print(f"Evaluating on {len(test_tokenized)} samples...")

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to('cuda')
            labels = batch['labels'].clone()

            # Mask instruction tokens (set to -100)
            for i in range(labels.size(0)):
                seq = input_ids[i].tolist()

                # Find where assistant response starts
                response_start = -1
                for j in range(len(seq) - len(response_token_ids)):
                    if seq[j:j+len(response_token_ids)] == response_token_ids:
                        response_start = j + len(response_token_ids)
                        break

                # Mask everything before response
                if response_start > 0:
                    labels[i, :response_start] = -100
                    total_response_tokens += (labels[i] != -100).sum().item()
                else:
                    labels[i, :] = -100

            labels = labels.to('cuda')
            outputs = model(input_ids=input_ids, labels=labels)

            if not torch.isnan(outputs.loss):
                total_loss += outputs.loss.item()
                num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(avg_loss)).item()

    print(f"\n>>> {model_name}:")
    print(f"    Loss: {avg_loss:.4f}")
    print(f"    Perplexity: {perplexity:.2f}")

    return {'loss': avg_loss, 'perplexity': perplexity}

## 1. Evaluate Base Model (Random LoRA)

In [None]:
print("Loading Base Llama 3.2 1B...")
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Add random LoRA (required for quantized model with trainer)
base_model = FastLanguageModel.get_peft_model(
    base_model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

base_tokenizer = get_chat_template(base_tokenizer, chat_template="llama-3.1")

base_results = evaluate_model(base_model, base_tokenizer, test_dataset_raw, "Base Llama 3.2 1B (random LoRA)")

# Cleanup
del base_model, base_tokenizer
gc.collect()
torch.cuda.empty_cache()
print("\nâœ“ Memory cleared")

Loading Base Llama 3.2 1B...
==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.11.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.



EVALUATING: Base Llama 3.2 1B (random LoRA)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Evaluating on 10000 samples...

>>> Base Llama 3.2 1B (random LoRA):
    Loss: 1.2697
    Perplexity: 3.56

âœ“ Memory cleared


## 2. Evaluate Baseline Fine-tuned Model

In [None]:
print("Loading Baseline Fine-tuned Model...")
baseline_model, baseline_tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/lab2_models_v2/baseline_lora",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

baseline_tokenizer = get_chat_template(baseline_tokenizer, chat_template="llama-3.1")

baseline_results = evaluate_model(baseline_model, baseline_tokenizer, test_dataset_raw, "Baseline Fine-tuned (r=16, lr=2e-4)")

# Cleanup
del baseline_model, baseline_tokenizer
gc.collect()
torch.cuda.empty_cache()
print("\nâœ“ Memory cleared")

Loading Baseline Fine-tuned Model...
==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

EVALUATING: Baseline Fine-tuned (r=16, lr=2e-4)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Evaluating on 10000 samples...

>>> Baseline Fine-tuned (r=16, lr=2e-4):
    Loss: 0.9945
    Perplexity: 2.70

âœ“ Memory cleared


## 3. Evaluate Optimized Fine-tuned Model

In [6]:
print("Loading Optimized Fine-tuned Model...")
optimized_model, optimized_tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/lab2_models_v2/optimized_lora",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

optimized_tokenizer = get_chat_template(optimized_tokenizer, chat_template="llama-3.1")

optimized_results = evaluate_model(optimized_model, optimized_tokenizer, test_dataset_raw, "Optimized Fine-tuned (grid search best)")

# Cleanup
del optimized_model, optimized_tokenizer
gc.collect()
torch.cuda.empty_cache()
print("\nâœ“ Memory cleared")

Loading Optimized Fine-tuned Model...
==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Unsloth 2025.11.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.



EVALUATING: Optimized Fine-tuned (grid search best)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Evaluating on 10000 samples...

>>> Optimized Fine-tuned (grid search best):
    Loss: 0.9837
    Perplexity: 2.67

âœ“ Memory cleared


## Final Comparison

In [10]:
base_perp = 3.56
baseline_perp = 2.70
base_loss = 1.2697
baseline_loss = 0.9945

print("\n" + "="*70)
print("FINAL RESULTS - FineTome Test Set (Response-Only Evaluation)")
print("="*70)
print(f"\n{'Model':<40} {'Loss':<12} {'Perplexity':<12}")
print("-"*64)
print(f"{'Base Llama 3.2 1B (random LoRA)':<40} {base_loss:<12.4f} {base_perp:<12.2f}")
print(f"{'Baseline Fine-tuned':<40} {baseline_loss:<12.4f} {baseline_perp:<12.2f}")
print(f"{'Optimized Fine-tuned':<40} {optimized_results['loss']:<12.4f} {optimized_results['perplexity']:<12.2f}")
print("-"*64)

# Improvements
base_to_baseline = ((base_perp - baseline_perp) / base_perp) * 100
base_to_optimized = ((base_perp - optimized_results['perplexity']) / base_perp) * 100
baseline_to_optimized = ((baseline_perp - optimized_results['perplexity']) / baseline_perp) * 100

print(f"\nImprovements (perplexity reduction):")
print(f"  Base â†’ Baseline:  {base_to_baseline:.1f}%")
print(f"  Base â†’ Optimized: {base_to_optimized:.1f}%")
print(f"  Baseline â†’ Optimized: {baseline_to_optimized:.1f}%")

print("\n" + "="*70)



FINAL RESULTS - FineTome Test Set (Response-Only Evaluation)

Model                                    Loss         Perplexity  
----------------------------------------------------------------
Base Llama 3.2 1B (random LoRA)          1.2697       3.56        
Baseline Fine-tuned                      0.9945       2.70        
Optimized Fine-tuned                     0.9837       2.67        
----------------------------------------------------------------

Improvements (perplexity reduction):
  Base â†’ Baseline:  24.2%
  Base â†’ Optimized: 24.9%
  Baseline â†’ Optimized: 1.0%

