In [None]:
# ================================
# 1. Install dependencies
# ================================
!pip install -q transformers accelerate datasets peft bitsandbytes trl safetensors

import torch
from torch.utils.data import IterableDataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import itertools

# ================================
# 2. Load streaming datasets (RICO + MobileViews)
# ================================
# All three sources are opened with streaming=True and the "train" split.

# RICO: UI screenshots paired with their view hierarchies.
rico_dataset1 = load_dataset(
    "shunk031/Rico",
    name="ui-screenshots-and-view-hierarchies",
    streaming=True,
    split="train",
)

# RICO: the same kind of data, with semantic annotations added.
rico_dataset2 = load_dataset(
    "shunk031/Rico",
    name="ui-screenshots-and-hierarchies-with-semantic-annotations",
    streaming=True,
    split="train",
)

# MobileViews: screenshots and view hierarchies stored as Parquet files.
mobileviews_dataset = load_dataset(
    "mllmTeam/MobileViews",
    data_dir="MobileViews_Screenshots_ViewHierarchies/Parquets",
    streaming=True,
    split="train",
)

# Read the sources back-to-back: RICO, annotated RICO, then MobileViews.
# The result is a single-pass itertools.chain iterator; once it is exhausted
# it cannot be restarted without rebuilding it.
combined_dataset = itertools.chain.from_iterable(
    (rico_dataset1, rico_dataset2, mobileviews_dataset)
)

print("✅ Loaded datasets: RICO (2 splits) + MobileViews")

# ================================
# 3. Load tokenizer
# ================================
model_name = "Qwen/Qwen2.5-14B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# ================================
# 4. Load base model with 4-bit quant + GPU offload
# ================================
from transformers import BitsAndBytesConfig

# 4-bit NF4 weights with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Per-device memory budgets handed to device_map="auto" placement.
# Intended target: a g5 EC2 instance with one 24GB GPU.
# NOTE(review): the original comments name both "32GB RAM" and g5.xlarge;
# AWS lists g5.xlarge with 16 GiB of RAM (32 GiB is g5.2xlarge) — confirm the
# instance size before relying on the 28GB CPU budget below.
max_memory = {
    "cpu": "28GB",  # host RAM budget, sized for a 32GB machine minus 4GB
    0: "22GB",      # GPU 0: 24GB of VRAM minus a 2GB buffer
}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    max_memory=max_memory,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# ================================
# 5. LoRA configuration (optimized for 14B model)
# ================================
from peft import prepare_model_for_kbit_training

# A k-bit quantized base model has to be prepared before adapters are
# attached: this freezes the base weights, upcasts the remaining half-precision
# parameters to fp32, and enables gradient checkpointing together with
# input-requires-grad hooks. Without those hooks, the frozen embedding output
# does not require grad, so checkpointed training (enabled in the training
# arguments) cannot backpropagate into the LoRA weights.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

lora_config = LoraConfig(
    r=16,                                    # adapter rank
    lora_alpha=32,                           # scaling numerator (alpha / r = 2)
    # Adapt all four attention projections.
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",                             # bias terms stay frozen
    task_type=TaskType.CAUSAL_LM
)

# Wrap the prepared base model; the result is the model that gets trained.
model = get_peft_model(model, lora_config)

# ================================
# 6️. Memory-efficient IterableDataset
# ================================
class StreamingRicoDataset(IterableDataset):
    """Tokenize a stream of dict-like examples for causal-LM fine-tuning.

    Each example is flattened into a single ``"key: value | key: value"``
    string (fields whose value is ``None`` are dropped), tokenized, truncated
    and padded to ``max_length``.

    Args:
        dataset_iter: Iterable of mappings (anything exposing ``.items()``).
            It is iterated once per ``__iter__`` call; a one-shot iterator
            therefore only supports a single pass.
        tokenizer: Callable accepting ``(text, truncation=..., padding=...,
            max_length=..., return_tensors="pt")`` and returning a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors that carry
            a leading batch dimension of 1.
        max_length: Fixed token length of every yielded sample.

    Yields:
        dict with 1-D tensors ``input_ids``, ``attention_mask`` and
        ``labels``. ``labels`` is an independent copy of ``input_ids`` in
        which padded positions are set to ``-100`` so they are ignored by the
        loss.
    """

    # Label value conventionally skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, dataset_iter, tokenizer, max_length=512):
        super().__init__()
        self.dataset_iter = dataset_iter
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __iter__(self):
        for example in self.dataset_iter:
            # Flatten every non-None field into one string.
            # NOTE(review): values are rendered with str(); image or binary
            # fields would contribute their repr rather than useful text —
            # confirm which fields the sources actually carry.
            parts = [f"{k}: {v!s}" for k, v in example.items() if v is not None]
            text = " | ".join(parts)
            tokenized = self.tokenizer(
                text, truncation=True, padding="max_length",
                max_length=self.max_length, return_tensors="pt"
            )
            input_ids = tokenized["input_ids"].squeeze(0)
            attention_mask = tokenized["attention_mask"].squeeze(0)

            # Clone so labels never alias input_ids (squeeze returns a view),
            # then mask padding so it does not contribute to the loss even if
            # a collator that keeps these labels is used.
            labels = input_ids.clone()
            labels[attention_mask == 0] = self.IGNORE_INDEX

            yield {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels,
            }

# Wrap the combined stream; max_length keeps the class default of 512.
train_dataset = StreamingRicoDataset(
    dataset_iter=combined_dataset,
    tokenizer=tokenizer,
)

# ================================
# 7. Data collator
# ================================
# mlm=False selects the causal (next-token) objective, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ================================
# 8. Training arguments (optimized for g5 EC2)
# ================================
training_args = TrainingArguments(
    output_dir="./qwen2.5_14b_rico_mobileviews_lora",
    # Batching: 2 samples per step x 8 accumulation steps = effective batch 16.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Schedule: step-bounded run with a 100-step warmup.
    max_steps=1500,
    warmup_steps=100,
    learning_rate=2e-4,
    # Memory savers: fp16 mixed precision, activation checkpointing and a
    # paged 8-bit AdamW optimizer.
    fp16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    # Input pipeline: four DataLoader worker processes.
    dataloader_num_workers=4,
    # Logging and checkpoints: log every 10 steps, checkpoint every 250 steps,
    # keep at most 3 checkpoints on disk.
    logging_steps=10,
    save_steps=250,
    save_total_limit=3,
    report_to="none",
)

# ================================
# 9. Trainer
# ================================
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# ================================
# 10. Empty CUDA cache before training
# ================================
# Return cached, unused GPU blocks to the driver before the run starts.
torch.cuda.empty_cache()

# ================================
# 11. Fine-tune
# ================================
trainer.train()

# ================================
# 12. Save LoRA adapters + tokenizer
# ================================
# Adapters first, then the tokenizer, both into the same folder.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./qwen2.5_14b_rico_mobileviews_lora")
print("Fine-tuning complete! LoRA adapters saved to ./qwen2.5_14b_rico_mobileviews_lora")

In [None]:
# ================================
# Optional: Compress and download adapters (for EC2 to local transfer)
# ================================
# To bundle the output folder into one archive, uncomment:
# !tar -czf qwen2.5_14b_rico_mobileviews_lora.tar.gz qwen2.5_14b_rico_mobileviews_lora

# Ways to move the files off the EC2 instance:
#   S3 upload:
# !aws s3 cp qwen2.5_14b_rico_mobileviews_lora s3://your-bucket/models/ --recursive
#
#   SCP, run from the local machine:
# scp -i your-key.pem -r ubuntu@ec2-instance:/path/to/qwen2.5_14b_rico_mobileviews_lora ./local-folder/

# Both reminders are emitted as two lines by a single print call.
print(
    "Model saved locally at: ./qwen2.5_14b_rico_mobileviews_lora",
    "Use SCP, S3, or download directly from EC2 to transfer the model.",
    sep="\n",
)