In [1]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.49.0
    Uninstalling transformers-4.49.0:
      Successfully uninstalled transformers-4.49.0
Successfully installed transformers-4.50.0


In [2]:
!pip install datasets evaluate qwen-vl-utils sacrebleu bert_score rouge_score trl bitsandbytes peft accelerate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.10-py3-none-any.whl.metadata (6.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from da

In [4]:
img_qapairs = [
    ("./vlm_images/img1.jpg", "Where did I keep the coffee cup?", "The Coffee cup is placed on the top of the sofa."),
    ("./vlm_images/img1.jpg", "Did I turn off the lights in the living room?", "No, you fogot to turn off the lights near the sofa."),
    ("./vlm_images/img2.jpeg", "Where did I keep the water bottle?", "The water bottle is kept on the top of the table."),
    ("./vlm_images/img2.jpeg", "Did I turn off the lights in the living room?", "Yes, the lights are already turned off."),
    ("./vlm_images/img3.jpg", "Did I leave the stove on in the kitchen?", "No, you didn't leave the stove on. It's off only."),
    ("./vlm_images/img3.jpg", "Is the fridge door open?", "No, The kitchen door is properly closed."),
    ("./vlm_images/img4.jpg", "Is the lights off in the kitchen by any chance?", "Yes, you forgot to turn off the lights."),
    # ("./vlm_images/img6.jpg", "Where is rice cooker?", "It's near the kitchen sink"),
    # ("./vlm_images/img7.jpg", "Where is water Jug?", "It's in cupboard in near the fridge"),
    # ("./vlm_images/img8.jpg", "Where is my charger?", "It's on table"),
    # ("./vlm_images/img9.jpg", "Where is cycle?", "Its near the ladder")
]

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks')
print(os.getcwd())

/content/drive/MyDrive/Colab Notebooks


In [6]:
SYSTEM_PROMPT = f"""
You are an AI assistant designed to help elderly individuals with daily tasks by analyzing CCTV footage from their homes. Your primary functions include:

1. Object Detection & Localization: Identify and describe the location of household objects (e.g., keys, glasses, remote control) to assist users in finding them.
2. Safety Monitoring: Check for potential hazards, such as determining whether the stove is turned off, doors are left open, or objects are obstructing pathways.
3. Context-Aware Assistance: Provide clear and concise responses tailored to the user’s needs, ensuring easy comprehension.
4. Privacy & Security Compliance: Respond only to authorized users, avoid storing sensitive data, and prioritize user privacy.
5. Error Handling & Transparency: Clearly state uncertainties in detection results and provide alternative suggestions when necessary.

Always prioritize safety, clarity, and accessibility in your responses. Keep explanations simple and actionable for elderly users.
"""

In [7]:
USER_PROMPT = """
You are an AI assistant designed to help elderly individuals by analyzing images from CCTV footage. You will receive an image along with a user’s question about an object or scene. Your task is to:

1. Identify the Query Object: Locate and describe the requested object in the image.
2. Provide Clear Localization: Mention the largest nearby object to help the user find the requested item more easily.
3. Ensure Accuracy & Clarity: If the object is not visible, state it clearly and suggest alternative locations based on common placements.
4. Safety Awareness: If the question pertains to safety (e.g., checking if the stove is off), provide a direct and reliable response with any potential warnings.

Example Format:
User Input: (Image of a kitchen)
'Where are my glasses?'

AI Response:
'Your glasses are on the dining table, near the large fruit bowl.'

Ensure your responses are simple, clear, and useful for elderly users. If unsure, express uncertainty and provide reasonable suggestions.

Question:
{question}
"""

In [8]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image


class ImageDescriptionDataset(Dataset):
    def __init__(self, qa_list, system_prompt, user_prompt, processor):

        self.qa_list = qa_list
        self.processor = processor
        self.user_prompt = user_prompt
        self.system_prompt = system_prompt

    def __len__(self):
        return len(self.qa_list)

    def __getitem__(self, idx):
        img_path, question, response = self.qa_list[idx]

        chat_template = [
            {
                "role": "system",
                "content": {"type": "text", "text": self.system_prompt}
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img_path},
                    {"type": "text", "text": self.user_prompt.format(question=question)}
                ]
            },
            {
                "role": "assistant",
                "content": {"type": "text", "text": response}
            }
        ]

        return chat_template

def collate_fn(batch):
    """Custom collate function to process a batch of examples."""
    input_ids = []
    all_images = []

    for example in batch:
        inputs_text = processor.apply_chat_template(
            example,
            tokenize=False,
            add_generation_prompt=False
        )

        images, _ = process_vision_info(example)

        if images is not None:
            images = [image.resize((224, 224)) for image in images]

        input_ids.append(inputs_text)
        all_images.extend(images)

    model_inputs = processor(
        text=input_ids,
        images=all_images,
        padding=True,
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    ).to("cuda")

    labels = model_inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    model_inputs["labels"] = labels

    return model_inputs

In [9]:
from transformers import BitsAndBytesConfig


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "Qwen/Qwen2-VL-2B-Instruct"
processor = Qwen2VLProcessor.from_pretrained(model_id)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)


dataset = ImageDescriptionDataset(
        qa_list=img_qapairs,
        system_prompt=SYSTEM_PROMPT,
        user_prompt=USER_PROMPT,
        processor=processor
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

In [10]:
from torch.utils.data import random_split

train_dataset, eval_dataset = random_split(dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(42))



In [11]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM"
)

In [12]:
from peft import get_peft_model

# Apply PEFT model adaptation
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 2,210,075,136 || trainable%: 0.0493


In [13]:
from trl import SFTConfig, SFTTrainer
from transformers import Qwen2VLProcessor

args = SFTConfig(
    output_dir="./qwen2.5VL2B-voxel",
    num_train_epochs=2,  # Train for 3 epochs
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",

    # Direct console output settings
    logging_steps=1,
    logging_strategy="steps",
    logging_first_step=True,

    # Enable TensorBoard
    report_to="tensorboard",  # Changed from "none" to "tensorboard"

    # Evaluation and saving strategies
    evaluation_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=1,

    learning_rate=2e-4,
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataset_kwargs={"skip_prepare_dataset": True},

    dataloader_pin_memory=False,
    remove_unused_columns=False
)



In [14]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
)

  trainer = SFTTrainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model(args.output_dir)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,2.7833,2.733627
2,2.5211,2.433869


In [16]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image


class ImageDescriptionDatasetInference(Dataset):
    def __init__(self, qa_list, system_prompt, user_prompt, processor):

        self.qa_list = qa_list

        self.processor = processor
        self.user_prompt = user_prompt
        self.system_prompt = system_prompt

    def __len__(self):
        return len(self.qa_list)

    def __getitem__(self, idx):
        img_path, question, response = self.qa_list[idx]

        chat_template = [
            {
                "role": "system",
                "content": {"type": "text", "text": self.system_prompt}
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img_path},
                    {"type": "text", "text": self.user_prompt.format(question=question)}
                ]
            }
        ]

        return chat_template

def collate_fn(batch):
    """Custom collate function to process a batch of examples."""
    input_ids = []
    all_images = []

    for example in batch:
        inputs_text = processor.apply_chat_template(
            example,
            tokenize=False,
            add_generation_prompt=True
        )

        images, _ = process_vision_info(example)

        if images is not None:
            images = [image.resize((224, 224)) for image in images]

        input_ids.append(inputs_text)
        all_images.extend(images)

    model_inputs = processor(
        text=input_ids,
        images=all_images,
        padding=True,
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    ).to("cuda")

    labels = model_inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    model_inputs["labels"] = labels

    return model_inputs

In [17]:
model_id = "Qwen/Qwen2-VL-2B-Instruct"
processor = Qwen2VLProcessor.from_pretrained(model_id)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

adapter_path = "qwen2.5VL2B-voxel"
model.load_adapter(adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
def create_dataloader(
        processor, qa_list, system_prompt, user_prompt,
        batch_size=1, shuffle=True
):

    dataset = ImageDescriptionDatasetInference(
        qa_list=qa_list,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        processor=processor
    )

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn
    )

    return dataloader

In [19]:
dataloader = create_dataloader(processor, img_qapairs,
                               SYSTEM_PROMPT, USER_PROMPT,
                               batch_size=8, shuffle=True)

In [20]:
from tqdm import tqdm

generated_reports = []

for model_inputs in tqdm(dataloader, desc="Running Inference"):
    generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    generated_reports.append(output_text)

Running Inference: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


In [21]:
new_generated_reports = []

for report in generated_reports:
    new_generated_reports.extend(report)

In [22]:
new_generated_reports

["'Your coffee cup is on the dining table, near the large fruit bowl.'",
 'No, you did not leave the stove on.',
 'No, you did not turn off the lights in the living room. The lights are still on.',
 'No, the lights are still on in the living room.',
 "'Your water bottle is on the dining table, near the large fruit bowl.'"]