In [None]:
!pip install -q num2words transformers accelerate peft datasets pillow scipy


In [None]:
!pip install num2words



In [None]:
!pip install -q num2words transformers accelerate peft datasets pillow scipy

# Force restart runtime
import os
os.kill(os.getpid(), 9)

In [None]:
# Mount Google Drive at /content/drive; the training cell's OUTPUT_DIR lives
# under /content/drive/MyDrive, so this has to run before training starts.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --------------------------------------------------------------
# Run switch: True = tiny smoke test, False = full training run.
# --------------------------------------------------------------
TEST_MODE = False

import os

# Set before transformers is imported, as in the original cell ordering.
os.environ["WANDB_DISABLED"] = "true"

import warnings

import torch
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from PIL import Image

# Silences everything routed through the `warnings` module for this kernel.
warnings.filterwarnings('ignore')

# ---- Settings shared by both modes ----
MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
DATASET_ID = "LouisChen15/ConstructionSite"
OUTPUT_DIR = "/content/drive/MyDrive/smolvlm_construction_finetuned"  # on Google Drive
BATCH_SIZE = 2
LEARNING_RATE = 2e-4

# ---- Settings that depend on the run switch ----
if TEST_MODE:
    MAX_STEPS = 5
    GRADIENT_ACCUMULATION_STEPS = 2
    NUM_TRAIN_EXAMPLES = 8
    NUM_EVAL_EXAMPLES = 4
else:
    MAX_STEPS = 750  # full run length chosen by the author
    GRADIENT_ACCUMULATION_STEPS = 4
    NUM_TRAIN_EXAMPLES = None  # None = use the whole split
    NUM_EVAL_EXAMPLES = None

if TEST_MODE:
    _banner = "=" * 50
    for _line in (
        _banner,
        "TEST MODE ENABLED",
        f"   Using {NUM_TRAIN_EXAMPLES} train examples",
        f"   Using {NUM_EVAL_EXAMPLES} eval examples",
        f"   Training for {MAX_STEPS} steps",
        _banner,
    ):
        print(_line)

# Report which accelerator (if any) is visible to torch.
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU detected! Change runtime type to GPU.")

print("Loading processor and model...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Let transformers pick the model class from the checkpoint's own config.
# The checkpoint is of model type `smolvlm`; loading it through
# Idefics3ForConditionalGeneration makes transformers warn that this "is not
# supported for all configurations of models and can yield errors".
from transformers import AutoModelForImageTextToText

# Load model optimized for CUDA (Colab)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype=torch.float16,  # `torch_dtype` is reported as deprecated in favour of `dtype`
    low_cpu_mem_usage=True,
)

# Prepare model for training with LoRA
model.gradient_checkpointing_enable()

# LoRA config: adapters on the attention projection layers only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("Loading dataset...")
# The "test" split is what the rest of the cell uses for evaluation.
train_dataset_raw, eval_dataset_raw = (
    load_dataset(DATASET_ID, split=_split) for _split in ("train", "test")
)

_n_train, _n_eval = len(train_dataset_raw), len(eval_dataset_raw)
print(f"Full dataset - Train: {_n_train}, Validation: {_n_eval}")


def _first_n(dataset, limit, split_name):
    """Return the first `limit` rows and report the size; pass `dataset` through when `limit` is falsy."""
    if not limit:
        return dataset
    subset = dataset.select(range(limit))
    print(f"Using only {len(subset)} {split_name} examples for testing")
    return subset


# TEST MODE or subset selection
train_dataset_raw = _first_n(train_dataset_raw, NUM_TRAIN_EXAMPLES, "train")
eval_dataset_raw = _first_n(eval_dataset_raw, NUM_EVAL_EXAMPLES, "eval")

def format_example(example):
    """Turn one raw dataset row into an ``{"image", "text"}`` training record.

    The text is a two-turn chat (user instruction, assistant answer) rendered
    with the module-level ``processor``'s chat template. The assistant answer
    combines the row's ``image_caption`` with the ``reason`` of every
    ``rule_1_violation`` .. ``rule_4_violation`` entry that has a non-empty one.
    """
    findings = []
    for idx in (1, 2, 3, 4):
        entry = example.get(f"rule_{idx}_violation")
        if not entry:
            continue
        reason = entry.get("reason")
        if reason:
            findings.append(f"Rule {idx} Violation: {reason}")

    # Every finding is a non-empty string, so an empty join means "none found".
    assessment = " ".join(findings) or "No safety violations detected."
    answer = f"Description: {example['image_caption']}\n\nSafety Assessment: {assessment}"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this construction site image and identify any safety violations."},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": answer}],
    }

    rendered = processor.apply_chat_template(
        [user_turn, assistant_turn], add_generation_prompt=False
    )
    return {"image": example["image"], "text": rendered}

def _format_split(raw_split, progress_label):
    """Run `format_example` over one split, keeping only the columns it returns."""
    return raw_split.map(
        format_example,
        remove_columns=raw_split.column_names,
        num_proc=1 if TEST_MODE else 4,  # single process for the tiny test subsets
        load_from_cache_file=True,
        desc=progress_label,
    )


print("Formatting training dataset...")
train_dataset = _format_split(train_dataset_raw, "Formatting train examples")

print("Formatting validation dataset...")
eval_dataset = _format_split(eval_dataset_raw, "Formatting eval examples")

def collate_fn(examples):
    """Batch formatted examples into model inputs plus language-modelling labels.

    Args:
        examples: Records with an ``"image"`` (object exposing
            ``.convert("RGB")``) and a ``"text"`` string. ``None`` entries and
            records missing either key are dropped.

    Returns:
        The output of the module-level ``processor`` for the batch, with an
        added ``"labels"`` entry: a copy of ``input_ids`` in which padding and
        image-placeholder positions are set to -100.

    Raises:
        ValueError: If no valid example is left after filtering.
    """
    # Filter out any None examples
    valid = [ex for ex in examples if ex is not None and "image" in ex and "text" in ex]

    if not valid:
        raise ValueError("Empty batch received - all examples were None or invalid")

    try:
        # Each example needs images as a list - wrap each image in a list
        images = [[ex["image"].convert("RGB")] for ex in valid]
        texts = [ex["text"] for ex in valid]

        batch = processor(
            images=images,
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        tokenizer = processor.tokenizer

        # Image placeholders are inputs, not text the model should learn to emit,
        # so they are excluded from the loss together with padding.
        image_token_id = getattr(processor, "image_token_id", None)
        if image_token_id is None:
            # NOTE(review): assumes the placeholder token is "<image>" — confirm
            # against the processor in use.
            candidate = tokenizer.convert_tokens_to_ids("<image>")
            if candidate != getattr(tokenizer, "unk_token_id", None):
                image_token_id = candidate

        # Create labels from input_ids
        labels = batch["input_ids"].clone()
        for ignored_id in (tokenizer.pad_token_id, image_token_id):
            # A tokenizer without a pad token reports None; skip rather than
            # compare the tensor against None.
            if ignored_id is not None:
                labels[labels == ignored_id] = -100
        batch["labels"] = labels

        return batch

    except Exception as e:
        print(f"Error in collate_fn: {e}")
        print(f"Batch size: {len(valid)}")
        print(f"First example keys: {valid[0].keys() if valid else 'None'}")
        raise

# Training config with evaluation
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=1 if TEST_MODE else 10,
    save_steps=250,
    save_total_limit=2,
    bf16=False,
    fp16=True,
    gradient_checkpointing=True,
    optim="adamw_torch",
    dataloader_pin_memory=True,
    report_to="none",
    # Keep the raw "image"/"text" columns: collate_fn needs them.
    remove_unused_columns=False,
    # NOTE(review): this flag may also drop a trailing incomplete eval batch — confirm.
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    # Evaluation settings
    eval_strategy="steps",
    eval_steps=2 if TEST_MODE else 250,
    per_device_eval_batch_size=BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)

# Library helper that finds the highest-numbered `checkpoint-<step>` directory
# (or returns None); unlike a hand-rolled scan it ignores entries whose suffix
# is not a number instead of crashing on them.
from transformers.trainer_utils import get_last_checkpoint

print("Starting training...")
try:
    # Check for existing checkpoint to resume from. Skipped in TEST_MODE: a
    # checkpoint left by a real run is already past the test's MAX_STEPS, so
    # resuming from it would make the smoke test train for zero steps.
    resume_from = None
    if not TEST_MODE and os.path.isdir(OUTPUT_DIR):
        resume_from = get_last_checkpoint(OUTPUT_DIR)
        if resume_from:
            print(f"Resuming from {resume_from}")

    trainer.train(resume_from_checkpoint=resume_from)
    print("Training completed successfully!")
except Exception as e:
    # Log a short summary, then re-raise so the cell still fails visibly.
    print(f"Training failed: {e}")
    raise

if not TEST_MODE:
    print("Saving model...")
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)
    print(f"Training complete! Model saved to {OUTPUT_DIR}")
else:
    print("="*50)
    print("TEST MODE - Model not saved")
    print("   Set TEST_MODE = False for real training")
    print("="*50)

Using GPU: NVIDIA A100-SXM4-80GB
Loading processor and model...


`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.03G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

trainable params: 9,277,440 || all params: 2,256,062,320 || trainable%: 0.4112
Loading dataset...


README.md:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

DatasetNotFoundError: Dataset 'LouisChen15/ConstructionSite' is a gated dataset on the Hub. You must be authenticated to access it.

In [None]:
# Authenticate with the Hugging Face Hub. The dataset used by the training cell
# is gated (loading it unauthenticated raised DatasetNotFoundError), so this
# login has to run before that cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# --------------------------------------------------------------
# Run switch: True = tiny smoke test, False = full training run.
# --------------------------------------------------------------
TEST_MODE = False

import os

# Set before transformers is imported, as in the original cell ordering.
os.environ["WANDB_DISABLED"] = "true"

import warnings

import torch
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from PIL import Image

# Silences everything routed through the `warnings` module for this kernel.
warnings.filterwarnings('ignore')

# ---- Settings shared by both modes ----
MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
DATASET_ID = "LouisChen15/ConstructionSite"
OUTPUT_DIR = "/content/drive/MyDrive/smolvlm_construction_finetuned"  # on Google Drive
BATCH_SIZE = 2
LEARNING_RATE = 2e-4

# ---- Settings that depend on the run switch ----
if TEST_MODE:
    MAX_STEPS = 5
    GRADIENT_ACCUMULATION_STEPS = 2
    NUM_TRAIN_EXAMPLES = 8
    NUM_EVAL_EXAMPLES = 4
else:
    MAX_STEPS = 750  # full run length chosen by the author
    GRADIENT_ACCUMULATION_STEPS = 4
    NUM_TRAIN_EXAMPLES = None  # None = use the whole split
    NUM_EVAL_EXAMPLES = None

if TEST_MODE:
    _banner = "=" * 50
    for _line in (
        _banner,
        "TEST MODE ENABLED",
        f"   Using {NUM_TRAIN_EXAMPLES} train examples",
        f"   Using {NUM_EVAL_EXAMPLES} eval examples",
        f"   Training for {MAX_STEPS} steps",
        _banner,
    ):
        print(_line)

# Report which accelerator (if any) is visible to torch.
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU detected! Change runtime type to GPU.")

print("Loading processor and model...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Let transformers pick the model class from the checkpoint's own config.
# The checkpoint is of model type `smolvlm`; loading it through
# Idefics3ForConditionalGeneration makes transformers warn that this "is not
# supported for all configurations of models and can yield errors".
from transformers import AutoModelForImageTextToText

# Load model optimized for CUDA (Colab)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype=torch.float16,  # `torch_dtype` is reported as deprecated in favour of `dtype`
    low_cpu_mem_usage=True,
)

# Prepare model for training with LoRA
model.gradient_checkpointing_enable()

# LoRA config: adapters on the attention projection layers only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("Loading dataset...")
# The "test" split is what the rest of the cell uses for evaluation.
train_dataset_raw, eval_dataset_raw = (
    load_dataset(DATASET_ID, split=_split) for _split in ("train", "test")
)

_n_train, _n_eval = len(train_dataset_raw), len(eval_dataset_raw)
print(f"Full dataset - Train: {_n_train}, Validation: {_n_eval}")


def _first_n(dataset, limit, split_name):
    """Return the first `limit` rows and report the size; pass `dataset` through when `limit` is falsy."""
    if not limit:
        return dataset
    subset = dataset.select(range(limit))
    print(f"Using only {len(subset)} {split_name} examples for testing")
    return subset


# TEST MODE or subset selection
train_dataset_raw = _first_n(train_dataset_raw, NUM_TRAIN_EXAMPLES, "train")
eval_dataset_raw = _first_n(eval_dataset_raw, NUM_EVAL_EXAMPLES, "eval")

def format_example(example):
    """Turn one raw dataset row into an ``{"image", "text"}`` training record.

    The text is a two-turn chat (user instruction, assistant answer) rendered
    with the module-level ``processor``'s chat template. The assistant answer
    combines the row's ``image_caption`` with the ``reason`` of every
    ``rule_1_violation`` .. ``rule_4_violation`` entry that has a non-empty one.
    """
    findings = []
    for idx in (1, 2, 3, 4):
        entry = example.get(f"rule_{idx}_violation")
        if not entry:
            continue
        reason = entry.get("reason")
        if reason:
            findings.append(f"Rule {idx} Violation: {reason}")

    # Every finding is a non-empty string, so an empty join means "none found".
    assessment = " ".join(findings) or "No safety violations detected."
    answer = f"Description: {example['image_caption']}\n\nSafety Assessment: {assessment}"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this construction site image and identify any safety violations."},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": answer}],
    }

    rendered = processor.apply_chat_template(
        [user_turn, assistant_turn], add_generation_prompt=False
    )
    return {"image": example["image"], "text": rendered}

def _format_split(raw_split, progress_label):
    """Run `format_example` over one split, keeping only the columns it returns."""
    return raw_split.map(
        format_example,
        remove_columns=raw_split.column_names,
        num_proc=1 if TEST_MODE else 4,  # single process for the tiny test subsets
        load_from_cache_file=True,
        desc=progress_label,
    )


print("Formatting training dataset...")
train_dataset = _format_split(train_dataset_raw, "Formatting train examples")

print("Formatting validation dataset...")
eval_dataset = _format_split(eval_dataset_raw, "Formatting eval examples")

def collate_fn(examples):
    """Batch formatted examples into model inputs plus language-modelling labels.

    Args:
        examples: Records with an ``"image"`` (object exposing
            ``.convert("RGB")``) and a ``"text"`` string. ``None`` entries and
            records missing either key are dropped.

    Returns:
        The output of the module-level ``processor`` for the batch, with an
        added ``"labels"`` entry: a copy of ``input_ids`` in which padding and
        image-placeholder positions are set to -100.

    Raises:
        ValueError: If no valid example is left after filtering.
    """
    # Filter out any None examples
    valid = [ex for ex in examples if ex is not None and "image" in ex and "text" in ex]

    if not valid:
        raise ValueError("Empty batch received - all examples were None or invalid")

    try:
        # Each example needs images as a list - wrap each image in a list
        images = [[ex["image"].convert("RGB")] for ex in valid]
        texts = [ex["text"] for ex in valid]

        batch = processor(
            images=images,
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        tokenizer = processor.tokenizer

        # Image placeholders are inputs, not text the model should learn to emit,
        # so they are excluded from the loss together with padding.
        image_token_id = getattr(processor, "image_token_id", None)
        if image_token_id is None:
            # NOTE(review): assumes the placeholder token is "<image>" — confirm
            # against the processor in use.
            candidate = tokenizer.convert_tokens_to_ids("<image>")
            if candidate != getattr(tokenizer, "unk_token_id", None):
                image_token_id = candidate

        # Create labels from input_ids
        labels = batch["input_ids"].clone()
        for ignored_id in (tokenizer.pad_token_id, image_token_id):
            # A tokenizer without a pad token reports None; skip rather than
            # compare the tensor against None.
            if ignored_id is not None:
                labels[labels == ignored_id] = -100
        batch["labels"] = labels

        return batch

    except Exception as e:
        print(f"Error in collate_fn: {e}")
        print(f"Batch size: {len(valid)}")
        print(f"First example keys: {valid[0].keys() if valid else 'None'}")
        raise

# Training config with evaluation
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=1 if TEST_MODE else 10,
    save_steps=250,
    save_total_limit=2,
    bf16=False,
    fp16=True,
    gradient_checkpointing=True,
    optim="adamw_torch",
    dataloader_pin_memory=True,
    report_to="none",
    # Keep the raw "image"/"text" columns: collate_fn needs them.
    remove_unused_columns=False,
    # NOTE(review): this flag may also drop a trailing incomplete eval batch — confirm.
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    # Evaluation settings
    eval_strategy="steps",
    eval_steps=2 if TEST_MODE else 250,
    per_device_eval_batch_size=BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)

# Library helper that finds the highest-numbered `checkpoint-<step>` directory
# (or returns None); unlike a hand-rolled scan it ignores entries whose suffix
# is not a number instead of crashing on them.
from transformers.trainer_utils import get_last_checkpoint

print("Starting training...")
try:
    # Check for existing checkpoint to resume from. Skipped in TEST_MODE: a
    # checkpoint left by a real run is already past the test's MAX_STEPS, so
    # resuming from it would make the smoke test train for zero steps.
    resume_from = None
    if not TEST_MODE and os.path.isdir(OUTPUT_DIR):
        resume_from = get_last_checkpoint(OUTPUT_DIR)
        if resume_from:
            print(f"Resuming from {resume_from}")

    trainer.train(resume_from_checkpoint=resume_from)
    print("Training completed successfully!")
except Exception as e:
    # Log a short summary, then re-raise so the cell still fails visibly.
    print(f"Training failed: {e}")
    raise

if not TEST_MODE:
    print("Saving model...")
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)
    print(f"Training complete! Model saved to {OUTPUT_DIR}")
else:
    print("="*50)
    print("TEST MODE - Model not saved")
    print("   Set TEST_MODE = False for real training")
    print("="*50)

Using GPU: NVIDIA A100-SXM4-80GB
Loading processor and model...


You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,277,440 || all params: 2,256,062,320 || trainable%: 0.4112
Loading dataset...


train-00001-of-00002.parquet:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

train-00002-of-00002.parquet:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Full dataset - Train: 7009, Validation: 3004
Formatting training dataset...


Formatting train examples (num_proc=4):   0%|          | 0/7009 [00:00<?, ? examples/s]

Formatting validation dataset...


Formatting eval examples (num_proc=4):   0%|          | 0/3004 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Starting training...


Step,Training Loss,Validation Loss
250,0.0792,0.104684
500,0.0723,0.093255
750,0.0681,0.093012


You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


Training completed successfully!
Saving model...


You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


Training complete! Model saved to /content/drive/MyDrive/smolvlm_construction_finetuned
