In [None]:
!pip install torch transformers einops datasets bitsandbytes accelerate pandas pillow peft

In [None]:
# Full script for fine-tuning Moondream2 on a custom Parquet dataset
# This is adapted from the official Moondream fine-tuning notebook:
# https://github.com/vikhyat/moondream/blob/main/notebooks/Finetuning.ipynb
# Requirements: Run on a machine with GPU, install dependencies via:
# pip install torch transformers einops datasets bitsandbytes accelerate pandas pillow peft

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer,get_scheduler
from torch.optim import AdamW
from datasets import load_dataset  # Not used here, but for reference
from tqdm import tqdm
import pandas as pd
from io import BytesIO
from PIL import Image
from peft import LoraConfig, get_peft_model  # For efficient fine-tuning with LoRA

# Custom Dataset for Parquet file
class ParquetDataset(Dataset):
    """Map-style dataset backed by a single Parquet file loaded into a pandas DataFrame.

    Every sample is built from the ``image`` and ``caption`` columns of one row and
    pairs the decoded picture with a fixed captioning question.
    """

    def __init__(self, parquet_path):
        """Read the whole Parquet file at ``parquet_path`` into ``self.df``."""
        self.df = pd.read_parquet(parquet_path)

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return ``{"image": <RGB PIL image>, "qa": [{"question", "answer"}]}`` for row ``idx``.

        The ``image`` cell is wrapped in ``BytesIO`` before decoding, so it is expected
        to hold encoded image bytes.
        """
        record = self.df.iloc[idx]
        encoded_picture = BytesIO(record['image'])
        picture = Image.open(encoded_picture).convert("RGB")
        # The question is constant; only the caption varies per row.
        qa_turn = {"question": "Describe this image.", "answer": record['caption']}
        return {"image": picture, "qa": [qa_turn]}

# Collate function for batching
def collate_fn(batch):
    """Turn a list of dataset samples into parallel lists of images and prompt strings.

    Only the first entry of each sample's ``qa`` list is used. Returns ``(images, texts)``.
    """
    pictures = []
    prompts = []
    for sample in batch:
        pictures.append(sample['image'])
        first_turn = sample['qa'][0]
        prompts.append(
            f"USER: <image>{first_turn['question']}\nASSISTANT: {first_turn['answer']}</s>"
        )
    return pictures, prompts

# --- Base checkpoint and tokenizer ---
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"  # pinned checkpoint revision, shared by tokenizer and model

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
# Batched tokenization with padding=True needs a pad token; reuse EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model_kwargs = dict(
    trust_remote_code=True,      # the checkpoint is loaded through its own repository code
    revision=revision,
    torch_dtype=torch.float16,   # half precision; bfloat16 is an alternative on GPUs that support it
    device_map="auto",           # let the loader decide device placement
)
model = AutoModelForCausalLM.from_pretrained(model_id, **base_model_kwargs)

# --- LoRA adapters (parameter-efficient fine-tuning) ---
# NOTE(review): these are module-name patterns; confirm they match layers of the loaded model.
lora_target_names = ["qkv", "proj", "out_proj"]
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_names,
)
model = get_peft_model(model, lora_config)

# --- Data ---
dataset = ParquetDataset("/kaggle/input/appron-prompt-dataset/dataset.parquet")
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=4,  # lower this if the GPU runs out of memory
)

# --- Optimisation schedule ---
num_epochs = 1  # raise for longer training
num_training_steps = len(dataloader) * num_epochs  # one scheduler step per batch
optimizer = AdamW(model.parameters(), lr=1e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop: one optimizer + scheduler step per batch.
model.train()
for epoch in range(num_epochs):
    for images, texts in tqdm(dataloader):
        # Tokenize the prompt/answer strings as one padded batch on the model's device.
        encodings = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        # NOTE(review): assumes the model exposes `vision_encoder` and accepts a list of PIL
        # images — confirm against the model's remote code.
        image_embeds = model.vision_encoder(images)

        # Build labels from a copy of the inputs and exclude padding from the loss.
        # The pad token is the EOS token here, so without this mask every padded position
        # would be trained as an EOS prediction. -100 is the default `ignore_index` of
        # PyTorch's cross-entropy loss.
        # NOTE(review): assumes the model's loss uses that default — confirm.
        labels = encodings.input_ids.clone()
        labels[encodings.attention_mask == 0] = -100

        # NOTE(review): `image_embeds=` / `labels=` keyword support depends on the model's
        # forward signature — confirm.
        outputs = model(
            input_ids=encodings.input_ids,
            attention_mask=encodings.attention_mask,
            image_embeds=image_embeds,
            labels=labels,
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} completed.")

# Save the fine-tuned (PEFT-wrapped) model.
# Absolute path on purpose: a relative "kaggle/working/..." would be created as a nested
# folder beneath the current working directory instead of in /kaggle/working itself.
model.save_pretrained("/kaggle/working/finetuned_moondream")


In [None]:
import inspect
# Show which arguments the (wrapped) model's forward() accepts.
forward_signature = inspect.signature(model.forward)
print(forward_signature)

In [None]:

# Testing before and after fine-tuning
# Note: For before, reload the original model without LoRA

# Baseline: caption the test image with a freshly loaded checkpoint that has no adapters.
baseline_load_kwargs = {
    "trust_remote_code": True,
    "revision": revision,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
original_model = AutoModelForCausalLM.from_pretrained(model_id, **baseline_load_kwargs)

# Placeholder path — point this at a real image before running.
test_image_file = Image.open("path/to/your_test_image.jpg")
test_image = test_image_file.convert("RGB")

enc_image = original_model.encode_image(test_image)
print("Before fine-tuning:")
baseline_caption = original_model.answer_question(enc_image, "Describe this image.", tokenizer)
print(baseline_caption)


In [None]:
# After fine-tuning
# save_pretrained() on a PEFT-wrapped model writes the LoRA adapter rather than a full
# checkpoint, so rebuild the fine-tuned model explicitly: load the base checkpoint at the
# same pinned revision the adapter was trained against, then attach the adapter to it.
from peft import PeftModel

# Placeholder: must be the directory that was passed to save_pretrained().
adapter_dir = "path/to/save/finetuned_moondream"

finetuned_base = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=revision,
    torch_dtype=torch.float16,
    device_map="auto"
)
finetuned_model = PeftModel.from_pretrained(finetuned_base, adapter_dir)
finetuned_model.eval()  # inference only: make sure LoRA dropout is disabled

# Attribute access on the PEFT wrapper is forwarded to the underlying model.
enc_image = finetuned_model.encode_image(test_image)
print("After fine-tuning:")
print(finetuned_model.answer_question(enc_image, "Describe this image.", tokenizer))

In [None]:
# pip install torch transformers einops datasets pandas pillow peft accelerate bitsandbytes

import os
from io import BytesIO
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# ------------------
# 1) Dataset (Parquet)
# ------------------
class ParquetDataset(torch.utils.data.Dataset):
    """Image/caption dataset read from one Parquet file, with auto-detected column names.

    The first name from ``image_col_guess`` / ``caption_col_guess`` that exists in the
    file is used as the image / caption column.
    """

    def __init__(self, parquet_path, image_col_guess=("image_bytes","images","image","image_path"), caption_col_guess=("caption","text","answer","labels")):
        """Load the file and resolve both columns.

        Raises:
            ValueError: if no candidate image column, or no candidate caption column, is present.
        """
        self.df = pd.read_parquet(parquet_path)
        available = self.df.columns
        self.img_col = self._first_present(image_col_guess, available)
        self.cap_col = self._first_present(caption_col_guess, available)
        if self.img_col is None:
            raise ValueError(f"Could not find an image column among {image_col_guess}. Have columns: {list(self.df.columns)}")
        if self.cap_col is None:
            raise ValueError(f"Could not find a caption/answer column among {caption_col_guess}. Have columns: {list(self.df.columns)}")

    @staticmethod
    def _first_present(candidates, columns):
        """Return the first candidate name found in ``columns``, or ``None``."""
        for name in candidates:
            if name in columns:
                return name
        return None

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df.index)

    def _load_image(self, row):
        """Decode the image cell of ``row`` into an RGB PIL image.

        Accepts a file path (``str``), encoded bytes (``bytes``/``bytearray``/``memoryview``)
        or a PIL image.

        Raises:
            TypeError: for any other cell type.
        """
        cell = row[self.img_col]
        if isinstance(cell, str):
            source = cell  # treated as a path on disk
        elif isinstance(cell, (bytes, bytearray, memoryview)):
            source = BytesIO(cell)  # encoded image stored inline in the Parquet file
        elif isinstance(cell, Image.Image):
            return cell.convert("RGB")
        else:
            raise TypeError(f"Unsupported image type in column '{self.img_col}': {type(cell)}")
        return Image.open(source).convert("RGB")

    def __getitem__(self, idx):
        """Return ``(image, text)`` where ``text`` is the full USER/ASSISTANT training string."""
        record = self.df.iloc[idx]
        picture = self._load_image(record)
        caption = str(record[self.cap_col])
        prompt = "Describe this image."
        # The string holds prompt and answer together; no prompt masking happens here.
        return picture, f"USER: <image>{prompt}\nASSISTANT: {caption}</s>"

def collate_fn(batch):
    """Transpose a batch of ``(image, text)`` pairs into ``(images, texts)`` lists."""
    columns = tuple(zip(*batch))
    images, texts = columns  # exactly two columns are expected
    return [*images], [*texts]

# ------------------
# 2) Load Moondream2
# ------------------
model_id = "vikhyatk/moondream2"
revision = "2025-01-09"  # pinned checkpoint revision (omit the argument to track the latest)

# Half precision only when a CUDA device is present; otherwise leave the dtype to the loader.
if torch.cuda.is_available():
    model_dtype = torch.float16
else:
    model_dtype = None

# No AutoProcessor is created; image handling is left to the model's own methods.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=model_dtype,
    trust_remote_code=True,
    revision=revision,
)

# Text tokenizer only.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, revision=revision)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    # padding=True needs a pad token; fall back to EOS when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# ------------------
# 3) LoRA adapters
# ------------------
# NOTE(review): target_modules are name patterns; confirm they match layers of the loaded model.
lora_settings = {
    "r": 8,  # adapter rank; tune to available VRAM
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": ["qkv", "proj", "out_proj"],
}
lora_cfg = LoraConfig(**lora_settings)
model = get_peft_model(model, lora_cfg)

# Optional: gradient checkpointing on the text transformer.
# This stays best-effort (training continues without it), but a failure is reported
# instead of being silently swallowed, so unexpectedly high memory use can be traced.
import warnings

if hasattr(model, "text_model") and hasattr(model.text_model, "transformer"):
    try:
        model.text_model.transformer.gradient_checkpointing_enable()
    except Exception as exc:
        warnings.warn(f"Gradient checkpointing could not be enabled: {exc!r}")

# ------------------
# 4) Data
# ------------------
parquet_path = "/kaggle/input/appron-prompt-dataset/dataset.parquet"  # change if needed
dataset = ParquetDataset(parquet_path)
loader_options = {
    "batch_size": 2,
    "shuffle": True,
    "num_workers": 2,
    "collate_fn": collate_fn,
}
loader = DataLoader(dataset, **loader_options)

# ------------------
# 5) Optimizer & training utils
# ------------------
from torch.optim import AdamW

learning_rate = 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

def mask_user_tokens(input_ids, labels, user_prefix="USER:"):
    """Placeholder hook for prompt masking; currently a pass-through.

    ``input_ids`` and ``user_prefix`` are accepted but not used: the labels are handed
    back untouched, so training covers the full sequence. For strict SFT this is where
    the labels of the USER prompt tokens would be set to -100.

    Returns:
        The same ``labels`` object that was passed in.
    """
    unchanged_labels = labels
    return unchanged_labels

model.train()

# ------------------
# 6) Training loop using the model's own image encoding
#    Images are never passed to the tokenizer.
# ------------------
# Device of the first parameter; inputs are moved there.
device = next(iter(model.parameters())).device

for epoch in range(1):  # increase as needed
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}")
    for images, texts in pbar:
        # 1) Encode each image through the model's encode_image method.
        encoded_images = [model.encode_image(img) for img in images]

        # 2) Tokenize the text as one right-padded batch.
        tok = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        )
        input_ids = tok.input_ids.to(device)
        attention_mask = tok.attention_mask.to(device)

        # 3) Build labels. Padding must not contribute to the loss: the pad token may be
        #    the EOS token, so unmasked padding would be trained as EOS predictions.
        #    -100 is the default `ignore_index` of PyTorch's cross-entropy loss.
        #    NOTE(review): assumes the model's loss uses that default — confirm.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        labels = mask_user_tokens(input_ids, labels)

        # 4) Forward pass.
        #    NOTE(review): `images=` / `labels=` keyword support depends on the model's
        #    forward signature — confirm.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            images=encoded_images,
            labels=labels,
        )
        loss = outputs.loss

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        pbar.set_postfix(loss=f"{loss.item():.4f}")

# ------------------
# 7) Save
# ------------------
save_dir = "./finetuned_moondream"
os.makedirs(save_dir, exist_ok=True)
# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Saved to", save_dir)


In [None]:
# Refresh the apt package lists, then install the libvips system library non-interactively.
# NOTE(review): presumably required by an image-processing dependency of the model code — confirm which package needs it.
!apt-get update && apt-get install -y libvips -q


In [None]:
# pip install torch transformers pandas pillow peft accelerate bitsandbytes

import os
from io import BytesIO
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# 1) Dataset
class ParquetDataset(Dataset):
    """Image/caption dataset over one Parquet file with auto-detected column names."""

    def __init__(self, path, img_cols=("image_bytes","images","image","image_path"), cap_cols=("caption","text")):
        """Load ``path`` and pick the first candidate image and caption columns that exist.

        Raises:
            ValueError: if either column could not be resolved.
        """
        frame = pd.read_parquet(path)
        present = frame.columns

        self.img_col = None
        for name in img_cols:
            if name in present:
                self.img_col = name
                break

        self.cap_col = None
        for name in cap_cols:
            if name in present:
                self.cap_col = name
                break

        if not (self.img_col and self.cap_col):
            raise ValueError(f"Missing columns. Found: {frame.columns.tolist()}")
        self.df = frame

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df.index)

    def __getitem__(self, i):
        """Return ``(RGB image, training text)`` for row ``i``."""
        record = self.df.iloc[i]
        cell = record[self.img_col]
        # Raw bytes are wrapped in a buffer; anything else is handed to PIL as-is.
        if isinstance(cell, (bytes, bytearray)):
            source = BytesIO(cell)
        else:
            source = cell
        picture = Image.open(source).convert("RGB")
        caption = str(record[self.cap_col])
        return picture, f"USER: <image>Describe this image.\nASSISTANT: {caption}</s>"

def collate_fn(batch):
    """Split a batch of ``(image, text)`` pairs into an image list and a text list."""
    imgs, txts = map(list, zip(*batch))
    return imgs, txts

# 2) Model + Tokenizer (both loaded from the same pinned revision)
model_id = "vikhyatk/moondream2"
revision = "2025-06-21"
hub_kwargs = {"revision": revision, "trust_remote_code": True}

model = AutoModelForCausalLM.from_pretrained(model_id, **hub_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs)

tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    # padding=True needs a pad token; fall back to EOS when none is defined.
    tokenizer.pad_token = tokenizer.eos_token

# 3) Device selection: prefer CUDA, then Apple MPS, otherwise CPU.
device_name = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device = torch.device(device_name)
print(f"Using device: {device}")

# Move the whole model to the chosen device so its parameters share one device.
model = model.to(device)

# 4) LoRA adapters
# NOTE(review): target_modules are name patterns; confirm they match layers of the loaded model.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["qkv", "proj", "out_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_cfg)

# 5) DataLoader setup
path = "/kaggle/input/appron-prompt-dataset/dataset.parquet"
dataset = ParquetDataset(path)
loader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=2,
)

# 6) Optimizer
from torch.optim import AdamW

learning_rate = 1e-5
opt = AdamW(model.parameters(), lr=learning_rate)

# Training loop: one optimizer step per batch, loss logged through tqdm.
model.train()
for epoch in range(1):
    for images, texts in tqdm(loader, desc=f"Epoch {epoch+1}"):
        # Encode every image through the model's encode_image method.
        enc_imgs = [model.encode_image(img) for img in images]

        tok = tokenizer(texts, padding=True, truncation=True,
                        return_tensors="pt").to(device)

        # Labels are a copy of the inputs with padding excluded from the loss. The pad
        # token may be the EOS token, so unmasked padding would be trained as EOS.
        # -100 is the default `ignore_index` of PyTorch's cross-entropy loss.
        # NOTE(review): assumes the model's loss uses that default — confirm.
        labels = tok.input_ids.clone()
        labels[tok.attention_mask == 0] = -100

        # NOTE(review): `images=` / `labels=` keyword support depends on the model's
        # forward signature — confirm.
        outs = model(input_ids=tok.input_ids,
                     attention_mask=tok.attention_mask,
                     images=enc_imgs,
                     labels=labels)
        loss = outs.loss

        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        tqdm.write(f"Loss: {loss.item():.4f}")

# 7) Saving: model and tokenizer go into the same output directory.
odir = "./finetuned_moondream"
os.makedirs(odir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(odir)
print("Saved to", odir)


In [None]:
import inspect
# Unwrap the PEFT wrapper when there is one, then show the underlying class and the
# arguments its forward() accepts.
if hasattr(model, "get_base_model"):
    real_model = model.get_base_model()
else:
    real_model = model
print(real_model.__class__)
forward_signature = inspect.signature(real_model.forward)
print(forward_signature)


In [2]:
# -------------------------
# 0) Install prerequisites
# -------------------------
# pip install torch transformers peft einops pandas pillow tqdm accelerate bitsandbytes

import os
from io import BytesIO
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# -------------------------
# 1) Dataset
# -------------------------
class ParquetDataset(Dataset):
    """Image/caption dataset read from one Parquet file, with auto-detected column names.

    The first name from ``image_col_guess`` / ``caption_col_guess`` that exists in the
    file is used as the image / caption column.
    """

    def __init__(self, parquet_path, image_col_guess=("image_bytes","images","image","image_path"),
                 caption_col_guess=("caption","text","answer","labels")):
        """Load the file and resolve both columns.

        Raises:
            ValueError: if no candidate image column, or no candidate caption column, is
                present. The message names the candidates tried and the columns found.
        """
        self.df = pd.read_parquet(parquet_path)
        self.img_col = next((c for c in image_col_guess if c in self.df.columns), None)
        self.cap_col = next((c for c in caption_col_guess if c in self.df.columns), None)
        # Report the two failures separately, with the available columns, so a schema
        # mismatch can be diagnosed from the message alone.
        if self.img_col is None:
            raise ValueError(
                f"Could not find an image column among {image_col_guess}. "
                f"Have columns: {list(self.df.columns)}"
            )
        if self.cap_col is None:
            raise ValueError(
                f"Could not find a caption/answer column among {caption_col_guess}. "
                f"Have columns: {list(self.df.columns)}"
            )

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df)

    def _load_image(self, row):
        """Decode the image cell of ``row`` into an RGB PIL image.

        Accepts a file path (``str``), encoded bytes (``bytes``/``bytearray``/``memoryview``)
        or a PIL image.

        Raises:
            TypeError: for any other cell type.
        """
        val = row[self.img_col]
        if isinstance(val, str):
            return Image.open(val).convert("RGB")
        if isinstance(val, (bytes, bytearray, memoryview)):
            return Image.open(BytesIO(val)).convert("RGB")
        if isinstance(val, Image.Image):
            return val.convert("RGB")
        raise TypeError(f"Unsupported image type: {type(val)}")

    def __getitem__(self, idx):
        """Return ``(image, text)`` where ``text`` is the full USER/ASSISTANT training string."""
        row = self.df.iloc[idx]
        image = self._load_image(row)
        answer = str(row[self.cap_col])
        question = "Describe this image."
        text = f"USER: <image>{question}\nASSISTANT: {answer}</s>"
        return image, text

def collate_fn(batch):
    """Regroup ``(image, text)`` samples into one list of images and one list of texts."""
    image_column, text_column = (list(column) for column in zip(*batch))
    return image_column, text_column

# -------------------------
# 2) Load Moondream2 + tokenizer
# -------------------------
model_id = "vikhyatk/moondream2"
# Pin the checkpoint revision. trust_remote_code=True runs Python code from the model
# repository, so an unpinned load can change behaviour whenever the repository is updated.
# This is the same revision an earlier cell of this notebook pins.
revision = "2025-06-21"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    device_map="auto",
    # Half precision only when CUDA is available; otherwise leave the dtype to the loader.
    torch_dtype=torch.float16 if torch.cuda.is_available() else None,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
# padding=True needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Device of the first parameter; inputs are moved there.
# NOTE(review): with device_map="auto" not every parameter is guaranteed to share it.
device = next(model.parameters()).device

# -------------------------
# 3) Apply LoRA
# -------------------------
# NOTE(review): target_modules are name patterns; confirm they match layers of the loaded model.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["qkv", "proj", "out_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
lora_cfg = LoraConfig(**lora_settings)
model = get_peft_model(model, lora_cfg)

# -------------------------
# 4) Dataset / DataLoader
# -------------------------
parquet_path = "/kaggle/input/appron-prompt-dataset/dataset.parquet"
dataset = ParquetDataset(parquet_path)
loader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    num_workers=2,
    shuffle=True,
    batch_size=2,
)

# -------------------------
# 5) Optimizer
# -------------------------
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
model.train()

# -------------------------
# 6) Training Loop
# -------------------------
# NOTE(review): the recorded run of this cell stopped on the first batch with
# "Expected all tensors to be on the same device ... cuda:0 and cpu". The saved output
# does not show which call raised, so the device handling below still needs to be verified.
for epoch in range(1):
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}")
    for images, texts in pbar:
        # 1) Encode images, then move any attached kv_cache to the training device.
        encoded_images = [model.encode_image(img) for img in images]
        for e in encoded_images:
            # NOTE(review): assumes `kv_cache`, when present, supports `.to(device)` — confirm.
            if hasattr(e, "kv_cache") and e.kv_cache is not None:
                e.kv_cache = e.kv_cache.to(device)

        # 2) Tokenize text as one padded batch.
        tok = tokenizer(texts, padding=True, truncation=True, max_length=1024, return_tensors="pt")
        input_ids = tok.input_ids.to(device)
        attention_mask = tok.attention_mask.to(device)

        # Labels are a copy of the inputs with padding excluded from the loss. The pad
        # token may be the EOS token, so unmasked padding would be trained as EOS.
        # -100 is the default `ignore_index` of PyTorch's cross-entropy loss.
        # NOTE(review): assumes the model's loss uses that default — confirm.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        # 3) Forward pass through the PEFT-wrapped model.
        # NOTE(review): `images=` / `labels=` keyword support depends on the model's
        # forward signature — confirm.
        forward_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask,
                          "images": encoded_images, "labels": labels}
        outputs = model(**forward_kwargs)
        loss = outputs.loss

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

# -------------------------
# 7) Save
# -------------------------
save_dir = "./finetuned_moondream"
os.makedirs(save_dir, exist_ok=True)
# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Saved to", save_dir)


Epoch 1:   0%|          | 0/150 [00:03<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!