In [1]:
# Install required libraries
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes datasets pillow tqdm torch torchvision torchaudio huggingface_hub

# Clear GPU cache after installs
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n✅ Dependencies installed!")


✅ Dependencies installed!


In [2]:
import os
import sys

import torch

# Report the accelerator that training will run on.
if torch.cuda.is_available():
    print(f"✅ GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️  No GPU detected - training will be very slow!")

# Report interpreter / framework versions.
# `sys.version` is read from `sys` directly; `os.sys` only works because `os`
# happens to import `sys` internally and is not a documented attribute.
print(f"\n🐍 Python: {sys.version}")
print(f"🔥 PyTorch: {torch.__version__}")
try:
    print(f"💾 CUDA: {torch.version.cuda}")
    print(f"⚡ cuDNN: {torch.backends.cudnn.version()}")
except Exception as e:
    # Best effort only: version probing must never stop the notebook.
    print(f"Could not retrieve CUDA/cuDNN version: {e}")

✅ GPU detected: NVIDIA L4
   VRAM: 23.67 GB

🐍 Python: 3.12.7 | packaged by conda-forge | (main, Oct  4 2024, 16:05:46) [GCC 13.3.0]
🔥 PyTorch: 2.9.0+cu128
💾 CUDA: 12.8
⚡ cuDNN: 91002


In [None]:
from pathlib import Path

# --- Configuration ---
# Every value a reader may want to tune lives in this cell; later cells only
# read these names.

# Dataset (Using local folder)
DATA_FOLDER = Path("./training_data_auto") # <-- Your local folder name

# Model
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct" # <-- 2B model

# Training Output
OUTPUT_DIR = Path("./ui_model_2B_finetuned_qlora")
MERGED_OUTPUT_DIR = Path("./ui_model_2B_merged")

# L4 24GB Optimized Settings - FASTER training with smart text limiting
BATCH_SIZE = 2                  # Can use 2 now with shorter text
GRADIENT_ACCUMULATION_STEPS = 8 # Back to 8 for effective batch of 16
NUM_EPOCHS = 2
LEARNING_RATE = 3e-4
# Limit response text length but keep full image
MAX_ELEMENT_TEXT_CHARS = 500    # Limit text description to ~500 chars (keeps images intact)
MAX_SEQ_LENGTH = 8192           # Reasonable max with limited text

# Other Settings
MAX_SAMPLES = None              # Use None for the full dataset
SAVE_STEPS = 500
LOGGING_STEPS = 50

# --- End Configuration ---

# Create directories. `parents=True` lets the output paths above be changed to
# nested locations (e.g. "runs/exp1/adapter") without the cell failing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
MERGED_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


print("⚙️  Configuration (L4 24GB Optimized for 2B Model - SPEED MODE):")
print(f"   Dataset: Local folder '{DATA_FOLDER}'")
print(f"   Model: {MODEL_NAME}")
print(f"   Output (Adapter): {OUTPUT_DIR}")
print(f"   Batch size: {BATCH_SIZE} (Effective: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
print(f"   Max text chars: {MAX_ELEMENT_TEXT_CHARS} (keeps image, limits description)")
print(f"   Max seq length: {MAX_SEQ_LENGTH} tokens")
# Plain string: this line has no placeholders, so it needs no f-prefix.
print("   ⚡ SPEED MODE: ~2-3x faster with smarter text limiting!")

⚙️  Configuration (L4 24GB Optimized for 2B Model):
   Dataset: Local folder 'training_data_auto'
   Model: Qwen/Qwen2-VL-2B-Instruct
   Output (Adapter): ui_model_2B_finetuned_qlora
   Batch size: 4 (Effective: 16)
   Max Length: 8192 tokens


In [None]:
from pathlib import Path
from transformers import AutoProcessor
import torch
import json
from PIL import Image
from datasets import Dataset
import os

# --- 1. DEFINE PATHS ---
# dataset.json holds the sample metadata; screenshot paths inside it are
# resolved relative to the same folder.
json_path = DATA_FOLDER / "dataset.json"
IMAGE_BASE_PATH = DATA_FOLDER

if not json_path.exists():
    print(f"❌ ERROR: Cannot find 'dataset.json' at {json_path}")
    raise FileNotFoundError(f"Missing {json_path}")

print(f"✅ Found 'dataset.json' at: {json_path}")
print(f"✅ Image base directory set to: {IMAGE_BASE_PATH}")

# --- 2. Load Local JSON ---
print(f"🔄 Loading data from '{json_path}'...")
# Explicit UTF-8: without it the platform default encoding is used, which can
# corrupt or reject non-ASCII element names (e.g. cp1252 on Windows).
with open(json_path, 'r', encoding='utf-8') as f:
    local_data = json.load(f)

# The top-level value must be an object with a 'samples' list.
if not isinstance(local_data, dict) or 'samples' not in local_data:
     print(f"❌ ERROR: 'samples' key not found in {json_path}.")
     raise KeyError("'samples' key not found in dataset.json")

data_list = local_data['samples']
print(f"   Found {len(data_list):,} samples in JSON.")

# --- 3. Convert to Hugging Face Dataset object (metadata only) ---
# Keep only lightweight fields and transform on-the-fly to avoid RAM blowup
raw_dataset = Dataset.from_list([
    {"screenshot": s.get("screenshot"), "elements": s.get("elements", [])}
    for s in data_list
])

if MAX_SAMPLES is not None:
    raw_dataset = raw_dataset.select(range(min(MAX_SAMPLES, len(raw_dataset))))
    print(f"\n✂️ Using a subset of {len(raw_dataset):,} samples for training.")

# Fixed seed so the 90/10 train/validation split is reproducible across runs.
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

print(f"\n✅ Train samples: {len(train_dataset):,}")
print(f"✅ Validation samples: {len(val_dataset):,}")

# --- 4. Load Processor ---
print("\n🔄 Loading processor...")
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
print("✅ Processor loaded.")

# --- 5. On-the-fly transform to prevent materializing tensors ---
# Shared counter used by the transform helpers to rate-limit repeated warnings.
_WARN_STATE = {"miss": 0}

def _find_subsequence(seq, subseq):
    if not subseq:
        return -1
    n, m = len(seq), len(subseq)
    if m > n:
        return -1
    for i in range(0, n - m + 1):
        if seq[i:i + m] == subseq:
            return i
    return -1


def _normalize_elements(element_data):
    """Flatten and keep only dict-like UI element entries."""
    out = []
    if isinstance(element_data, dict):
        out.append(element_data)
    elif isinstance(element_data, list):
        for item in element_data:
            if isinstance(item, dict):
                out.append(item)
            elif isinstance(item, list):
                for sub in item:
                    if isinstance(sub, dict):
                        out.append(sub)
            # ignore other types
    return out


def _process_one_sample(image_filename, raw_elements):
    """Process a single sample (not batched)."""
    image_path = IMAGE_BASE_PATH / image_filename

    try:
        image = Image.open(image_path).convert('RGB')
    except Exception as e:
        if _WARN_STATE["miss"] < 5:
            print(f"Warning: Skipping sample. Could not load image {image_path}: {e}")
        return None
    
    elements = _normalize_elements(raw_elements)

    if elements:
        instruction = "Describe the UI elements visible in this Windows interface."
        # Build element description with character limit for speed
        element_count = len(elements)
        preview = []
        char_count = 0
        for e in elements[:20]:  # Check up to 20 elements
            etype = e.get('type', 'Control') if isinstance(e, dict) else 'Control'
            ename = e.get('name', '') if isinstance(e, dict) else ''
            elem_str = f"{etype} '{ename}'"
            if char_count + len(elem_str) + 2 > MAX_ELEMENT_TEXT_CHARS:
                break  # Stop if we'd exceed limit
            preview.append(elem_str)
            char_count += len(elem_str) + 2  # +2 for "; "
        
        element_list_str = "; ".join(preview)
        if len(preview) < element_count:
            response = f"This interface contains {element_count} UI elements including: {element_list_str}..."
        else:
            response = f"This interface contains {element_count} UI elements. Key elements: {element_list_str}."
    else:
        instruction = "What do you see in this Windows interface?"
        response = "This is a Windows interface screenshot showing UI controls."

    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]},
        {"role": "assistant", "content": [{"type": "text", "text": response}]}
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    # Process with smart truncation - images preserved, text may be cut
    inputs = processor(
        text=[text],
        images=[image],
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQ_LENGTH
    )
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}

    labels = inputs["input_ids"].clone()
    resp_ids = processor.tokenizer(response, add_special_tokens=False)["input_ids"]
    seq_ids = labels.tolist()
    start_idx = _find_subsequence(seq_ids, resp_ids) if resp_ids else -1
    if start_idx != -1:
        labels[:start_idx] = -100
    else:
        cut = int(labels.shape[0] * 0.7)
        labels[:cut] = -100
        _WARN_STATE["miss"] += 1
        if _WARN_STATE["miss"] <= 5 or _WARN_STATE["miss"] % 500 == 0:
            print("Warning: Could not locate assistant response tokens reliably. Applying fallback mask.")

    inputs["labels"] = labels
    return inputs


def _transform(examples):
    """Transform function that handles batched data from set_transform."""
    # Check if batched (dict with lists) or single example (dict with single values)
    is_batched = isinstance(examples.get('screenshot'), list)
    
    if is_batched:
        # Process each sample in the batch
        results = []
        for i in range(len(examples['screenshot'])):
            result = _process_one_sample(
                examples['screenshot'][i],
                examples['elements'][i]
            )
            if result is not None:
                results.append(result)
            else:
                # Return empty tensor for failed samples
                results.append({"input_ids": torch.tensor([], dtype=torch.long)})
        
        # Return batched results
        if not results:
            return {"input_ids": [torch.tensor([], dtype=torch.long)]}
        
        # Collate the results into batch format
        batch = {key: [r[key] for r in results] for key in results[0].keys()}
        return batch
    else:
        # Single example
        result = _process_one_sample(examples['screenshot'], examples['elements'])
        if result is None:
            return {"input_ids": torch.tensor([], dtype=torch.long)}
        return result

# Register the transform on both splits so samples are processed when rows
# are accessed rather than ahead of time.
for _split in (train_dataset, val_dataset):
    _split.set_transform(_transform)

print(
    "✅ Using on-the-fly transforms to minimize RAM usage (no precomputed tensors).",
    f"⚡ SPEED MODE: Text limited to ~{MAX_ELEMENT_TEXT_CHARS} chars, sequences to {MAX_SEQ_LENGTH} tokens.",
    sep="\n",
)

✅ Found 'dataset.json' at: training_data_auto/dataset.json
✅ Image base directory set to: training_data_auto
🔄 Loading data from 'training_data_auto/dataset.json'...
   Found 5,114 samples in JSON.

✅ Train samples: 4,602
✅ Validation samples: 512

🔄 Loading processor...
✅ Processor loaded.

🔄 Applying preprocessing function to datasets...
   Using batched=False (This will take a while but is RAM-safe)...


Map:   0%|          | 0/4602 [00:00<?, ? examples/s]



KeyboardInterrupt: 

In [None]:
from transformers import Qwen2VLForConditionalGeneration, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print(f"📥 Loading base model {MODEL_NAME} with 4-bit quantization...\n")

# 4-bit quantization config (NF4 weights, double quantization, fp16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load base model
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto", # Automatically place layers on GPU
    trust_remote_code=True
)

# Processor is already loaded

# Prepare model for K-bit training
model = prepare_model_for_kbit_training(model)

# LoRA config: adapters on the attention and MLP projection layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print parameter summary.
# PEFT's own counter is used instead of summing p.numel(): 4-bit weights are
# stored packed, so numel() under-reports the base model size and would
# inflate the trainable percentage.
trainable_params, total_params = model.get_nb_trainable_parameters()
print(f"\n✅ Model prepared for QLoRA training!")
print(f"📊 Trainable LoRA parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}% of total)")

# Check memory usage
if torch.cuda.is_available():
    print(f"💾 Initial GPU Memory Used: {torch.cuda.memory_allocated()/1e9:.2f} GB")

In [None]:
from transformers import TrainingArguments

print("⚙️ Setting up Training Arguments...")

# Training Arguments (L4 24GB / 2B Model - SPEED optimized)
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE, # 2
    per_device_eval_batch_size=BATCH_SIZE,  # 2
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # 8
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=LOGGING_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    # Evaluation runs on the same schedule as checkpointing so every saved
    # checkpoint has an eval_loss for best-model selection.
    eval_strategy="steps",
    eval_steps=SAVE_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The model is wrapped by PEFT, and Trainer may not be able to infer the
    # label column through the wrapper. Naming it explicitly ensures eval_loss
    # (needed by metric_for_best_model above) is computed during evaluation.
    label_names=["labels"],
    fp16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,  # Keep enabled for safety
    gradient_checkpointing_kwargs={"use_reentrant": False},
    report_to="none",
    # The collator needs the transform outputs (pixel_values, image_grid_thw),
    # so columns must not be pruned to the model's forward signature.
    remove_unused_columns=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
)

# Collator: handles lists of tensors from batched transform
from typing import List

def custom_collator(features: List[dict]):
    """Collate per-sample processor outputs into one padded training batch.

    Args:
        features: Sample dicts holding "input_ids", "attention_mask",
            "labels", "pixel_values" and optionally "image_grid_thw". A
            feature may also carry lists of tensors (batched transform output).
            Features whose "input_ids" is missing or empty are treated as
            failed samples and dropped.

    Returns:
        A dict with padded "input_ids", "attention_mask" and "labels" (padding
        positions in "labels" set to -100), "pixel_values" concatenated along
        dim 0 across samples, and "image_grid_thw" with one row per sample
        when present. If no usable sample remains, a batch of zero-length
        tensors is returned.
    """
    # Flatten if features contain lists (from batched transform)
    flattened = []
    for feat in features:
        ids = feat.get("input_ids")
        if isinstance(ids, list):
            # Batched transform output: expand the nested batch
            for i in range(len(ids)):
                sample = {k: v[i] if isinstance(v, list) else v for k, v in feat.items()}
                if sample.get("input_ids") is not None and sample["input_ids"].numel() > 0:
                    flattened.append(sample)
        elif ids is not None and ids.numel() > 0:
            # Single sample
            flattened.append(feat)

    if len(flattened) == 0:
        return {
            "input_ids": torch.zeros((0,1), dtype=torch.long),
            "attention_mask": torch.zeros((0,1), dtype=torch.long),
            "pixel_values": torch.zeros((0,3,1,1), dtype=torch.float32),
            "labels": torch.zeros((0,1), dtype=torch.long),
        }

    input_ids = [f["input_ids"] for f in flattened]
    labels = [f["labels"] for f in flattened]
    # Unpadded samples attend to every token, so a missing mask is all ones.
    attention_mask = [
        f["attention_mask"] if f.get("attention_mask") is not None else torch.ones_like(f["input_ids"])
        for f in flattened
    ]

    # Handle image_grid_thw if present (Qwen2-VL specific)
    image_grid_thw = [f["image_grid_thw"] for f in flattened if "image_grid_thw" in f]

    # Pad ids and labels with the tokenizer so both use the same padding side.
    padded_inputs = processor.tokenizer.pad(
        {"input_ids": input_ids, "attention_mask": attention_mask},
        padding="longest",
        return_tensors="pt",
    )

    padded_labels = processor.tokenizer.pad(
        {"input_ids": labels},
        padding="longest",
        return_tensors="pt",
    ).input_ids

    # Mask padding by position (attention_mask == 0), not by token value:
    # comparing against pad_token_id would also hide real label tokens that
    # happen to share the pad id.
    padded_labels[padded_inputs.attention_mask == 0] = -100

    # Concatenate image patches along dim 0. Images of different sizes produce
    # different patch counts, which torch.stack cannot combine; the per-image
    # patch counts are carried by image_grid_thw.
    pixel_values = torch.cat([f["pixel_values"] for f in flattened], dim=0)

    batch = {
        "input_ids": padded_inputs.input_ids,
        "attention_mask": padded_inputs.attention_mask,
        "pixel_values": pixel_values,
        "labels": padded_labels,
    }

    # Add image_grid_thw if present - one (t, h, w) row per image
    if image_grid_thw:
        grids = []
        for grid in image_grid_thw:
            if grid.ndim == 1:
                # Shape (3,) -> add batch dim -> (1, 3)
                grid = grid.unsqueeze(0)
            grids.append(grid)
        # Concatenate to (batch_size, 3)
        batch["image_grid_thw"] = torch.cat(grids, dim=0)

    return batch

# Summarize the training setup in a single call, one item per output line.
print(
    "✅ Training arguments set!",
    f"   Saving checkpoints to: {OUTPUT_DIR}",
    f"   Saving/Evaluating every {SAVE_STEPS} steps",
    f"   Gradient Checkpointing: {training_args.gradient_checkpointing}",
    f"   ⚡ Batch={BATCH_SIZE}, GradAccum={GRADIENT_ACCUMULATION_STEPS} → ~2-3x faster!",
    sep="\n",
)

# Release cached allocator blocks, then report what is still allocated.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"💾 GPU Memory before training: {torch.cuda.memory_allocated()/1e9:.2f} GB")

In [None]:
import gc
import time
from types import SimpleNamespace

from transformers import Trainer

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # use datasets with set_transform
    eval_dataset=val_dataset,
    data_collator=custom_collator,
    processing_class=processor.tokenizer,  # use processing_class instead of deprecated tokenizer
)

print("\n" + "="*70)
print(" "*20 + "🚀 STARTING FINE-TUNING (2B Model)")
print("="*70 + "\n")

start_time = time.time()

try:
    print("🏋️‍♂️ Training...")
    train_result = trainer.train()

    elapsed = time.time() - start_time
    print(f"\n✅ Training complete!")
    print(f"   Total Time: {elapsed/3600:.2f} hours ({elapsed/60:.1f} minutes)")

    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print("   Metrics saved.")

except Exception as e:
    # Report the failure but keep going so the cleanup below still runs.
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Clean up GPU memory.
    # The following cell reads `trainer.state.best_model_checkpoint`, so the
    # name `trainer` must stay defined. Rebinding it to a small stand-in that
    # holds only the training state releases the Trainer object (and its
    # references to the model and optimizer) while keeping that lookup working.
    trainer = SimpleNamespace(state=trainer.state)
    del model
    # Collect unreachable objects first so their memory can be returned to
    # the allocator before the cache is emptied.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"   GPU Memory Cleared. Final Usage: {torch.cuda.memory_allocated()/1e9:.2f} GB")

In [None]:
import json

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel


def _find_best_checkpoint():
    """Return the best checkpoint path recorded during training, or None.

    The live `trainer` object is consulted first. The training cell deletes
    `trainer` during cleanup, so when it is gone the path is read from the
    `trainer_state.json` file in OUTPUT_DIR (written by `trainer.save_state()`
    in the training cell).
    """
    live_trainer = globals().get("trainer")
    best = getattr(getattr(live_trainer, "state", None), "best_model_checkpoint", None)
    if best:
        return best

    state_file = OUTPUT_DIR / "trainer_state.json"
    if state_file.exists():
        try:
            with open(state_file, "r", encoding="utf-8") as fh:
                return json.load(fh).get("best_model_checkpoint")
        except (OSError, ValueError, AttributeError) as err:
            print(f"   Could not read {state_file}: {err}")
    return None


print("\n💾 Saving final model artifacts...")
print(f"   The best model checkpoint (LoRA adapter) was saved by the trainer inside: {OUTPUT_DIR}")

# Resolved once and reused for both the processor save and the optional merge.
best_checkpoint_path = _find_best_checkpoint()

# --- Save the processor ---
try:
    if 'processor' not in locals():
        processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    if best_checkpoint_path:
        print(f"   Saving processor to best checkpoint: {best_checkpoint_path}")
        processor.save_pretrained(best_checkpoint_path)
    else:
        print("   Could not determine best checkpoint. Saving processor to main output dir.")
        processor.save_pretrained(str(OUTPUT_DIR))

except Exception as e:
    print(f"   Could not save processor: {e}")

# --- Optional: Merge LoRA weights with base model ---
merge_model = False # Set to True to try merging

if merge_model:
    print("\n🔄 Merging LoRA weights into the base model...")

    try:
         if torch.cuda.is_available():
              torch.cuda.empty_cache()

         # The base model is loaded in fp16 here, without 4-bit quantization.
         base_model = Qwen2VLForConditionalGeneration.from_pretrained(
              MODEL_NAME,
              device_map="auto",
              torch_dtype=torch.float16,
              trust_remote_code=True
         )

         adapter_path = best_checkpoint_path
         if adapter_path is None:
              raise ValueError("Could not find best model checkpoint to merge.")

         print(f"   Loading adapter from: {adapter_path}")
         merged_model = PeftModel.from_pretrained(base_model, adapter_path)

         print("   Merging...")
         merged_model = merged_model.merge_and_unload()
         print("   Merging complete.")

         MERGED_OUTPUT_DIR.mkdir(exist_ok=True)
         merged_model.save_pretrained(str(MERGED_OUTPUT_DIR))
         processor.save_pretrained(str(MERGED_OUTPUT_DIR))
         print(f"✅ Merged model saved to: {MERGED_OUTPUT_DIR}")

         del base_model
         del merged_model
         if torch.cuda.is_available():
              torch.cuda.empty_cache()

    except Exception as e:
         # Merging is optional; a failure leaves the adapter checkpoint usable.
         print(f"❌ Failed to merge model: {e}")
         print("   Skipping merge. You can still use the LoRA adapter from the checkpoint directory.")

print("\n🎉 FINE-TUNING PROCESS COMPLETE!")