In [None]:
%%capture
# Install Unsloth and its companion packages. %%capture suppresses the pip output of this cell.
import os, re
# Colab is detected by looking for the substring "COLAB_" in the concatenated
# names of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the leading run of digits and dots in the torch version string
    # (e.g. "2.8.0" out of "2.8.0+cu126").
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # The xformers pin depends on the torch version: 0.0.32.post2 for torch 2.8.0,
    # 0.0.29.post3 for anything else.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # {xformers} is interpolated by IPython into the shell command.
    # --no-deps installs the listed packages without their own dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Version pins applied in both branches, after the installs above.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [None]:
# Generation settings shared by the comparison code below; "temperature" is
# read with args.get("temperature", 1.5) when sampling.
args = dict(temperature=1.5)

In [None]:
from unsloth import FastVisionModel
from datasets import load_dataset
import torch
import gc
import numpy as np
import json
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
from collections import defaultdict
from PIL import Image
import textwrap

def _generate_response(model, tokenizer, image, messages, temperature):
    """Sample one completion from `model` for a single image and chat prompt.

    Args:
        model: Object exposing `generate(**inputs, ...)`.
        tokenizer: Object paired with `model`. It is used to apply the chat
            template, to encode `(image, text)` into tensors, and to decode
            the generated ids.
        image: The image passed as first positional argument to `tokenizer`.
        messages: Chat messages given to `apply_chat_template`.
        temperature: Sampling temperature forwarded to `generate`.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens skipped and surrounding whitespace stripped.
    """
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(image, text, add_special_tokens=False, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=temperature,
            min_p=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the newly generated part.
    prompt_length = inputs.input_ids.shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def qualitative_comparison(base_model, base_tokenizer, finetuned_model, finetuned_tokenizer,
                          test_samples, instruction, args, num_examples=5):
    """
    Generate side-by-side comparison of model outputs

    For the first `num_examples` entries of `test_samples`, one answer is
    sampled from the base model and one from the fine-tuned model. The
    collected results are then passed to `visualize_comparisons` and
    `save_qualitative_results`, which are called with their default output
    directory.

    Args:
        base_model, base_tokenizer: Base model and its paired tokenizer.
        finetuned_model, finetuned_tokenizer: Fine-tuned model and its paired tokenizer.
        test_samples: Indexable collection of samples. Each sample is read as
            `sample["messages"][0]["content"]` (user parts with "type" of
            "image" or "text") and `sample["messages"][1]["content"][0]["text"]`
            (the reference answer).
        instruction: Fallback prompt, used only when a sample's user message
            contains no text part.
        args: Dict of generation settings; only "temperature" is read
            (default 1.5). `None` is treated as an empty dict.
        num_examples: Maximum number of samples to process.

    Returns:
        A list of dicts with keys "image", "reference", "base_output" and
        "finetuned_output". Samples that raise during processing are reported
        on stdout and left out of the list.
    """

    print(f"\n{'='*80}")
    print("QUALITATIVE COMPARISON: Base vs Fine-tuned Model")
    print(f"{'='*80}\n")

    # Prepare models for inference
    FastVisionModel.for_inference(base_model)
    FastVisionModel.for_inference(finetuned_model)

    temperature = (args or {}).get("temperature", 1.5)
    # Use the real number of processed samples so the progress counter is
    # correct when fewer than num_examples samples are available.
    total = min(num_examples, len(test_samples))

    results = []

    for idx in range(total):
        sample = test_samples[idx]
        print(f"\nProcessing example {idx+1}/{total}...")

        try:
            # Extract image, prompt and reference
            user_content = sample["messages"][0]["content"]
            images = [c["image"] for c in user_content if c["type"] == "image"]
            if not images:
                raise ValueError("sample has no image content")

            texts = [c["text"] for c in user_content if c["type"] == "text"]
            instruction_text = texts[0] if texts else instruction
            if not instruction_text:
                raise ValueError("sample has no text prompt and no fallback instruction was given")

            reference = sample["messages"][1]["content"][0]["text"].strip()

            # The image itself is passed separately to the tokenizer; the chat
            # template only needs a placeholder for it.
            messages = [
                {"role": "user", "content": [
                    {"type": "image"},
                    {"type": "text", "text": instruction_text}
                ]}
            ]

            base_output = _generate_response(
                base_model, base_tokenizer, images[0], messages, temperature
            )
            finetuned_output = _generate_response(
                finetuned_model, finetuned_tokenizer, images[0], messages, temperature
            )

            # Store results
            results.append({
                "image": images[0],
                "reference": reference,
                "base_output": base_output,
                "finetuned_output": finetuned_output
            })

            print(f"✓ Example {idx+1} processed")

        except Exception as e:
            # Best effort: report the failing sample and carry on with the rest.
            print(f"⚠ Error on example {idx+1}: {e}")
            continue

    # Visualize results
    print("\nGenerating visualization...")
    visualize_comparisons(results)

    # Save text results
    save_qualitative_results(results)

    return results


def _draw_text_panel(fig, cell, body, title, wrap_width, fontsize, facecolor, alpha,
                     title_color=None):
    """Draw one boxed, centred text panel with a left-aligned bold title.

    `body` is wrapped to `wrap_width` characters before drawing. `title_color`
    is only forwarded to `set_title` when it is given.
    """
    ax = fig.add_subplot(cell)
    ax.axis('off')
    ax.text(0.5, 0.5, textwrap.fill(body, width=wrap_width),
            ha='center', va='center', fontsize=fontsize,
            bbox=dict(boxstyle='round', facecolor=facecolor, alpha=alpha),
            wrap=True)
    title_kwargs = dict(fontsize=11, fontweight='bold', loc='left')
    if title_color is not None:
        title_kwargs['color'] = title_color
    ax.set_title(title, **title_kwargs)
    return ax


def visualize_comparisons(results, output_dir="evaluation_results"):
    """Create visual comparison of results

    For every entry of `results` one figure is built (input image on top,
    reference text below it, base and fine-tuned outputs side by side at the
    bottom), saved as `qualitative_comparison_example_<n>.png` inside
    `output_dir`, shown, and closed.

    Args:
        results: Iterable of dicts with keys "image", "reference",
            "base_output" and "finetuned_output". "image" is either a
            `PIL.Image.Image` or something `Image.open` accepts.
        output_dir: Directory for the PNG files; created if missing.
    """

    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    for idx, result in enumerate(results):
        fig = plt.figure(figsize=(16, 10))
        try:
            gs = fig.add_gridspec(3, 2, height_ratios=[2, 1, 1], hspace=0.4, wspace=0.3)

            # Title
            fig.suptitle(f'Qualitative Comparison - Example {idx+1}',
                        fontsize=16, fontweight='bold')

            # Display image (spans both columns)
            ax_img = fig.add_subplot(gs[0, :])
            image = result["image"]
            if isinstance(image, Image.Image):
                ax_img.imshow(image)
            else:
                # Copy the pixels into memory and close the file right away so
                # the handle opened by Image.open is not leaked.
                with Image.open(image) as handle:
                    ax_img.imshow(handle.copy())
            ax_img.axis('off')
            ax_img.set_title('Input Medical Image', fontsize=12, fontweight='bold', pad=10)

            # Reference text (spans both columns)
            _draw_text_panel(fig, gs[1, :], result["reference"], ' Reference (Ground Truth)',
                             wrap_width=120, fontsize=10, facecolor='lightgreen', alpha=0.3)

            # Base model output
            _draw_text_panel(fig, gs[2, 0], result["base_output"], ' Base Model Output',
                             wrap_width=60, fontsize=9, facecolor='#FFE5E5', alpha=0.5,
                             title_color='#FF6B6B')

            # Fine-tuned model output
            _draw_text_panel(fig, gs[2, 1], result["finetuned_output"], ' Fine-tuned Model Output',
                             wrap_width=60, fontsize=9, facecolor='#E5F5F5', alpha=0.5,
                             title_color='#4ECDC4')

            # Save individual comparison
            save_path = output_path / f"qualitative_comparison_example_{idx+1}.png"
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"✓ Saved: {save_path}")
            plt.show()
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)


def save_qualitative_results(results, output_dir="evaluation_results"):
    """Save text results to JSON and readable text file

    Writes two files into `output_dir` (created if it does not exist):
    `qualitative_results.json` and `qualitative_results.txt`. Both contain
    only the "reference", "base_output" and "finetuned_output" fields of each
    result; the "image" entry is not written.

    Args:
        results: Iterable of dicts providing the keys "reference",
            "base_output" and "finetuned_output".
        output_dir: Target directory for both files.
    """

    output_path = Path(output_dir)
    # Create the directory here as well, so the function also works when it is
    # called without visualize_comparisons having run first.
    output_path.mkdir(exist_ok=True, parents=True)

    # Save as JSON
    json_path = output_path / "qualitative_results.json"
    # Keep only the text fields: images are not JSON-serializable and are dropped.
    text_fields = ("reference", "base_output", "finetuned_output")
    json_results = [{field: r[field] for field in text_fields} for r in results]

    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_results, f, indent=2)
    print(f"\n✓ JSON results saved: {json_path}")

    # Save as readable text file
    txt_path = output_path / "qualitative_results.txt"
    sections = (
        (" REFERENCE (Ground Truth):\n", "reference"),
        (" BASE MODEL OUTPUT:\n", "base_output"),
        (" FINE-TUNED MODEL OUTPUT:\n", "finetuned_output"),
    )
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("QUALITATIVE COMPARISON RESULTS\n")
        f.write("="*80 + "\n\n")

        for idx, result in enumerate(results):
            f.write(f"\n{'='*80}\n")
            f.write(f"EXAMPLE {idx+1}\n")
            f.write(f"{'='*80}\n\n")

            for heading, key in sections:
                f.write(heading)
                f.write("-" * 80 + "\n")
                f.write(textwrap.fill(result[key], width=80) + "\n\n")

    print(f"✓ Text results saved: {txt_path}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:

print("\n" + "="*80)
print("STEP 4: Qualitative Comparison on Sample Images")
print("="*80)

# Loading both 4-bit 11B vision models at the same time does not fit on a
# Tesla T4 (about 14.7 GB): the second from_pretrained call fails with
# "ValueError: Some modules are dispatched on the CPU or the disk".
# Each model is therefore loaded, used for generation and released in turn, so
# only one set of weights is on the GPU at any moment.
# NOTE: as a consequence, no model/tokenizer object is kept as a global after
# this cell; reload a model explicitly if a later cell needs one.
BASE_MODEL_NAME = "unsloth/Llama-3.2-11B-Vision-Instruct"
FINETUNED_MODEL_NAME = "Laya-hmkh/llama-vision-radiology-checkpoint-300"
NUM_EXAMPLES = 5  # Show 5 examples


def _generate_for_model(model_name, prepared_samples, args):
    """Load `model_name` in 4-bit, sample one answer per prepared sample, free the GPU.

    Returns a list aligned with `prepared_samples`; an entry is None when
    generation failed for that sample.
    """
    # Drop anything left over from earlier cells before allocating new weights.
    gc.collect()
    torch.cuda.empty_cache()

    model, tokenizer = FastVisionModel.from_pretrained(model_name, load_in_4bit=True)
    FastVisionModel.for_inference(model)

    answers = []
    try:
        for position, item in enumerate(prepared_samples, start=1):
            try:
                messages = [
                    {"role": "user", "content": [
                        {"type": "image"},
                        {"type": "text", "text": item["prompt"]}
                    ]}
                ]
                text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
                inputs = tokenizer(item["image"], text, add_special_tokens=False,
                                   return_tensors="pt").to("cuda")
                with torch.no_grad():
                    generated = model.generate(
                        **inputs,
                        max_new_tokens=256,
                        temperature=args.get("temperature", 1.5),
                        min_p=0.1,
                        do_sample=True,
                        pad_token_id=tokenizer.eos_token_id
                    )
                # Keep only the continuation, not the echoed prompt tokens.
                answers.append(tokenizer.decode(
                    generated[0][inputs.input_ids.shape[1]:],
                    skip_special_tokens=True
                ).strip())
                print(f"✓ Example {position} processed")
            except Exception as e:
                print(f"⚠ Error on example {position}: {e}")
                answers.append(None)
    finally:
        # Release the weights so the next model has the whole GPU available.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    return answers


# Extract image, prompt and reference once, so both models see identical inputs.
prepared_samples = []
for idx in range(min(NUM_EXAMPLES, len(test_data))):
    sample = test_data[idx]
    try:
        user_content = sample["messages"][0]["content"]
        prepared_samples.append({
            "image": [c["image"] for c in user_content if c["type"] == "image"][0],
            "prompt": [c["text"] for c in user_content if c["type"] == "text"][0],
            "reference": sample["messages"][1]["content"][0]["text"].strip(),
        })
    except Exception as e:
        print(f"⚠ Skipping example {idx+1}: {e}")

print(f"\nGenerating with base model: {BASE_MODEL_NAME}")
base_outputs = _generate_for_model(BASE_MODEL_NAME, prepared_samples, args)

print(f"\nGenerating with fine-tuned model: {FINETUNED_MODEL_NAME}")
finetuned_outputs = _generate_for_model(FINETUNED_MODEL_NAME, prepared_samples, args)

# A sample is kept only when both models produced an answer for it.
qualitative_results = [
    {
        "image": item["image"],
        "reference": item["reference"],
        "base_output": base_output,
        "finetuned_output": finetuned_output,
    }
    for item, base_output, finetuned_output in zip(prepared_samples, base_outputs, finetuned_outputs)
    if base_output is not None and finetuned_output is not None
]

# Visualize and save, as qualitative_comparison does after generation.
print("\nGenerating visualization...")
visualize_comparisons(qualitative_results)
save_qualitative_results(qualitative_results)

print("\n Qualitative comparison complete!")
print("Check 'evaluation_results/' folder for:")
print("  - qualitative_comparison_example_*.png (visual comparisons)")
print("  - qualitative_results.json (structured data)")
print("  - qualitative_results.txt (readable text format)")


STEP 4: Qualitative Comparison on Sample Images
==((====))==  Unsloth 2025.10.1: Fast Mllama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

==((====))==  Unsloth 2025.10.1: Fast Mllama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 