# Compare Qwen FineTome vs Code Documentation Model

Evaluate both fine-tuned Qwen models on the held-out test split of the Python code-instruction dataset, comparing cross-entropy loss and perplexity.

In [1]:
%%capture
# %%capture (above) silences the installer output for this cell.
# Install the released unsloth package first, then swap it for the build from the GitHub repository.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install these packages without letting pip resolve/replace their dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes

In [2]:
# Mount Google Drive at /content/drive; the model directories loaded later live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Qwen Code Documentation Model

In [14]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch
import gc

# Loader settings. They are module-level because the second model load further
# down the notebook passes the same three values to from_pretrained.
max_seq_length, dtype, load_in_4bit = 2048, None, True

CODE_DOC_MODEL_PATH = "/content/drive/MyDrive/lab2_models/qwen_code_doc_model"

print("Loading Qwen Code Documentation Model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CODE_DOC_MODEL_PATH,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Re-wrap the tokenizer with the "chatml" chat template; every later
# apply_chat_template call in this notebook goes through this tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="chatml")
print("✓ Qwen Code Doc Model loaded")

Loading Qwen Code Documentation Model...
==((====))==  Unsloth 2025.11.6: Fast Qwen2 patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✓ Qwen Code Doc Model loaded


## Load and Prepare Test Dataset

In [15]:
print("Loading dataset...")
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
print(f"Full dataset: {len(dataset)} examples")

# Rebuild the 80/10/10 split (seed=42) that the training run is stated to have used:
# hold out 20% of the data, then split that hold-out in half and keep the
# second half as the test set.
train_test = dataset.train_test_split(seed=42, test_size=0.2)
temp = train_test['test']
val_test = temp.train_test_split(seed=42, test_size=0.5)
test_dataset_raw = val_test['test']

num_test_examples = len(test_dataset_raw)
print(f"Test set: {num_test_examples} examples")

# Cap the evaluation set at 2000 examples for speed (a no-op when the test set is smaller).
eval_example_count = min(2000, num_test_examples)
test_sample = test_dataset_raw.select(range(eval_example_count))
print(f"Using {len(test_sample)} examples for evaluation")

Loading dataset...
Full dataset: 18612 examples
Test set: 1862 examples
Using 1862 examples for evaluation


## Format Test Dataset

In [16]:
def format_example(example):
    """Render one dataset row as a chat-templated user/assistant exchange.

    The user turn is the row's ``instruction``; when the row carries a non-empty
    ``input`` field it is appended to the instruction after a blank line. The
    assistant turn is the row's ``output``.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        ``{"text": <templated conversation string>}``, produced by the
        module-level ``tokenizer`` without tokenizing and without a trailing
        generation prompt.
    """
    has_extra_input = 'input' in example and example['input']
    user_message = (
        f"{example['instruction']}\n\n{example['input']}"
        if has_extra_input
        else example['instruction']
    )

    messages = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": example['output']},
    ]

    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Replace every original column with the single templated "text" column.
test_dataset = test_sample.map(
    format_example,
    remove_columns=test_sample.column_names,
)
print("✓ Test set formatted")

# Show the first 500 characters of the first formatted example as a sanity check.
example_preview = test_dataset[0]['text'][:500]
print(f"\nExample:\n{example_preview}...")

Map:   0%|          | 0/1862 [00:00<?, ? examples/s]

✓ Test set formatted

Example:
<|im_start|>user
Use a genetic algorithm to optimize a simple OOP program written in Python with two classes, Student and Teacher.

import random

class Teacher:
 def __init__(self):
 self.rating = random.random()

class Student:
 def __init__(self):
 self.gpa = random.random()

teacher = Teacher()
student = Student()<|im_end|>
<|im_start|>assistant
import random
import numpy as np

from deap import base
from deap import creator
from deap import tools

# Create a new type with a fitness attribut...


## Tokenize and Create DataLoader

In [17]:
def tokenize_for_eval(examples):
    """Tokenize a batch of templated texts and attach labels for loss computation.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to encode.

    Returns:
        The tokenizer's encoding (truncated to ``max_seq_length``) with an added
        ``"labels"`` entry that is a copy of ``"input_ids"``, so the loss is
        computed over the whole sequence, prompt tokens included.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_seq_length,
        truncation=True,
    )
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize in batches and drop the raw "text" column so only model inputs remain.
test_tokenized = test_dataset.map(
    tokenize_for_eval,
    batched=True,
    remove_columns=["text"],
)

# The collator pads each batch of variable-length examples to a common length.
eval_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
test_dataloader = DataLoader(test_tokenized, batch_size=4, collate_fn=eval_collator)

print(f"✓ DataLoader created with {len(test_dataloader)} batches")

Map:   0%|          | 0/1862 [00:00<?, ? examples/s]

✓ DataLoader created with 466 batches


## Evaluate Qwen Code Documentation Model

In [18]:
print("="*80)
print("EVALUATING QWEN CODE DOCUMENTATION MODEL")
print("="*80)

FastLanguageModel.for_inference(model)
model.eval()

total_loss = 0
num_batches = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to('cuda')
        labels = batch['labels'].to('cuda')

        outputs = model(input_ids=input_ids, labels=labels)
        total_loss += outputs.loss.item()
        num_batches += 1

code_doc_loss = total_loss / num_batches
code_doc_perplexity = torch.exp(torch.tensor(code_doc_loss))

print(f"\n✓ Qwen Code Doc: Loss = {code_doc_loss:.4f}, Perplexity = {code_doc_perplexity:.2f}")

# Cleanup
del model
gc.collect()
torch.cuda.empty_cache()
print("✓ Memory cleared")

EVALUATING QWEN CODE DOCUMENTATION MODEL

✓ Qwen Code Doc: Loss = 0.8141, Perplexity = 2.26
✓ Memory cleared


## Load and Evaluate Qwen FineTome Model

In [19]:
print("\n" + "="*80)
print("LOADING AND EVALUATING QWEN FINETOME MODEL")
print("="*80)

print("\nLoading Qwen FineTome Model...")
finetome_model, finetome_tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/lab2_models/qwen_finetome_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

finetome_tokenizer = get_chat_template(finetome_tokenizer, chat_template="chatml")
FastLanguageModel.for_inference(finetome_model)
finetome_model.eval()

print("Evaluating Qwen FineTome Model...")
total_loss = 0
num_batches = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to('cuda')
        labels = batch['labels'].to('cuda')

        outputs = finetome_model(input_ids=input_ids, labels=labels)
        total_loss += outputs.loss.item()
        num_batches += 1

finetome_loss = total_loss / num_batches
finetome_perplexity = torch.exp(torch.tensor(finetome_loss))

print(f"\n✓ Qwen FineTome: Loss = {finetome_loss:.4f}, Perplexity = {finetome_perplexity:.2f}")


LOADING AND EVALUATING QWEN FINETOME MODEL

Loading Qwen FineTome Model...
==((====))==  Unsloth 2025.11.6: Fast Qwen2 patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Evaluating Qwen FineTome Model...

✓ Qwen FineTome: Loss = 0.9378, Perplexity = 2.55


## Final Results

In [20]:
# Summary table: one row per model, followed by the relative gap in loss.
banner = "=" * 80
rule = "-" * 49

print("\n" + banner)
print("RESULTS")
print(banner)
print(f"\n{'Model':<25} {'Loss':<12} {'Perplexity':<12}")
print(rule)
for model_label, model_loss, model_ppl in (
    ("Qwen Code Documentation", code_doc_loss, code_doc_perplexity),
    ("Qwen FineTome", finetome_loss, finetome_perplexity),
):
    print(f"{model_label:<25} {model_loss:<12.4f} {model_ppl:<12.2f}")
print(rule)

# The gap is expressed as a percentage of the worse (larger) of the two losses.
if code_doc_loss < finetome_loss:
    improvement = ((finetome_loss - code_doc_loss) / finetome_loss) * 100
    print(f"\n✅ Code Doc is {improvement:.1f}% better on code tasks!")
else:
    difference = ((code_doc_loss - finetome_loss) / code_doc_loss) * 100
    print(f"\n⚠️ FineTome is {difference:.1f}% better (unexpected)")


RESULTS

Model                     Loss         Perplexity  
-------------------------------------------------
Qwen Code Documentation   0.8141       2.26        
Qwen FineTome             0.9378       2.55        
-------------------------------------------------

✅ Code Doc is 13.2% better on code tasks!
