In [1]:
import json
import os
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
import pandas as pd

In [2]:
os.environ['HF_HOME'] = '/hpi/fs00/scratch/liudvikas.zekas/.cache'

In [1]:
pwd

'/hpi/fs00/home/liudvikas.zekas'

In [4]:
system_message = """You are a Vision Language Model specialized in detecting distances to objects in images.
Your task is to analyze the provided image and respond to distance-related queries with concise answers, typically a single number or short phrase.
Focus on delivering precise, accurate distances based on the visual information. Avoid any additional explanation unless absolutely necessary."""

In [47]:


def format_sample(sample, system_message):
    """
    Convert a single sample from the original format to the new format.
    """
    
    # Extract the user query and assistant answer from the conversations list
    # (assuming the structure remains consistent)
    query = ""
    answer = ""
    for turn in sample["conversations"]:
        if turn["from"] == "human":
            query = turn["value"]
        elif turn["from"] == "gpt":
            answer = turn["value"]
    
    # Build the new structure
    new_format = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": sample["image"],
                },
                {
                    "type": "text",
                    "text": query,
                },
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": answer}],
        },
    ]
    
    return new_format


def convert_dataset(input_path, output_path, system_message):
    """
    Reads a JSON file (list of samples), converts each sample to the new format,
    and writes the result as a JSON file.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Convert each sample using 'format_sample'
    converted_data = [format_sample(sample, system_message) for sample in data]
    
    # Ensure the output directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Write the converted data to the new file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(converted_data, f, ensure_ascii=False, indent=2)


# Paths for your original datasets and the new output folder
train_input = "/hpi/fs00/home/liudvikas.zekas/dataset/merged_train.json"
test_input  = "/hpi/fs00/home/liudvikas.zekas/dataset/merged_test.json"
val_input  = "/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/val.json"

train_output = "/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/train.json"
test_output = "/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/test.json"
val_output  = "/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/val.json"

# Convert and save
convert_dataset(train_input, train_output, system_message)
convert_dataset(test_input, test_output, system_message)
convert_dataset(val_input, val_output, system_message)

print("Conversion complete! Reformatted files are saved in 'dataset_qwen' folder.")


TypeError: list indices must be integers or slices, not str

In [5]:
import gc
import time


def clear_memory():
    # Delete variables if they exist in the current global scope
    if "inputs" in globals():
        del globals()["inputs"]
    if "model" in globals():
        del globals()["model"]
    if "processor" in globals():
        del globals()["processor"]
    if "trainer" in globals():
        del globals()["trainer"]
    if "peft_model" in globals():
        del globals()["peft_model"]
    if "bnb_config" in globals():
        del globals()["bnb_config"]
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [27]:
model_id = "Qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
)

processor = Qwen2VLProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
train_dataset = pd.read_json("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/train.json")
test_dataset = pd.read_json("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/test.json")
eval_dataset = pd.read_json("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/val.json")

In [7]:
import json

# Read JSON file
with open("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/train.json", "r") as file:
    train_dataset = json.load(file)
with open("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/test.json", "r") as file:
    test_dataset = json.load(file)
with open("/hpi/fs00/scratch/liudvikas.zekas/dataset_qwen/val.json", "r") as file:
    eval_dataset = json.load(file)

In [8]:
train_dataset[1]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a Vision Language Model specialized in detecting distances to objects in images.\nYour task is to analyze the provided image and respond to distance-related queries with concise answers, typically a single number or short phrase.\nFocus on delivering precise, accurate distances based on the visual information. Avoid any additional explanation unless absolutely necessary.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': '/hpi/fs00/scratch/alexandra.kudaeva/street-view-data/output/G340890_50m/K/Z/t/KZtZDxpZWGVvZVm0YP1MYA_back.webp'},
   {'type': 'text',
    'text': 'How far is the bench away from the camera in meters, rounded to the next meter?'}]},
 {'role': 'assistant', 'content': [{'type': 'text', 'text': '22 meters'}]}]

In [9]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [45]:
from qwen_vl_utils import process_vision_info


def generate_text_from_sample(model, processor, sample, max_new_tokens=256, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample[1:2], tokenize=False, add_generation_prompt=True  # Use the sample without the system message
    )

    # Process the visual input from the sample
    image_inputs, _ = process_vision_info(sample)

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(
        device
    )  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text
    
output = generate_text_from_sample(model, processor, train_dataset[0])
output

'To estimate the distance of the bench from the camera in meters, we can use the size of the objects in the image as a reference. The bench appears to be about the same size as the cars in the foreground. Assuming the cars are approximately 4 meters long, the bench is likely around 4 meters away from the camera.\n\nTherefore, the bench is approximately 4 meters away from the camera.'

In [34]:
from peft import LoraConfig, get_peft_model

# Configure LoRA
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply PEFT model adaptation
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 8,293,898,752 || trainable%: 0.0304


In [12]:
from trl import SFTConfig

# Configure training arguments
training_args = SFTConfig(
    output_dir="qwen2-7b-instruct-trl-sft-distancevlm",  # Directory to save the model
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    lr_scheduler_type="constant",  # Type of learning rate scheduler
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_steps=10,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=30,  # Steps interval for saving
    metric_for_best_model="eval_loss",  # Metric to evaluate the best model
    greater_is_better=False,  # Whether higher metric values are better
    load_best_model_at_end=True,  # Load the best model after training
    # Mixed precision and gradient settings
    fp16=True,  # Use bfloat16 precision
    tf32=False,  # Use TensorFloat-32 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    push_to_hub=True,  # Whether to push model to Hugging Face Hub
    report_to="wandb",  # Reporting tool for tracking metrics
    # Gradient checkpointing settings
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    # Dataset configuration
    dataset_text_field="",  # Text field in dataset
    dataset_kwargs={"skip_prepare_dataset": True},  # Additional dataset options
    # max_seq_length=1024  # Maximum sequence length for input
)

training_args.remove_unused_columns = False  # Keep unused columns in dataset

In [13]:
import wandb

wandb.init(
    project="qwen2-7b-instruct-distance-vlm",  # change this
    name="qwen2-7b-instruct-distance-vlm",  # change this
    config=training_args,
)

[34m[1mwandb[0m: Currently logged in as: [33mliudvikas[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [14]:
# Create a data collator to encode text and image pairs
def collate_fn(examples):
    # Get the texts and images, and apply the chat template
    texts = [
        processor.apply_chat_template(example, tokenize=False) for example in examples
    ]  # Prepare texts for processing
    image_inputs = [process_vision_info(example)[0] for example in examples]  # Process the images to extract inputs

    # Tokenize the texts and process the images
    batch = processor(
        text=texts, images=image_inputs, return_tensors="pt", padding=True
    )  # Encode texts and images into tensors

    # The labels are the input_ids, and we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()  # Clone input IDs for labels
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels

    # Ignore the image token index in the loss computation (model specific)
    if isinstance(processor, Qwen2VLProcessor):  # Check if the processor is Qwen2VLProcessor
        image_tokens = [151652, 151653, 151655]  # Specific image token IDs for Qwen2VLProcessor
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]  # Convert image token to ID

    # Mask image token IDs in the labels
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100  # Mask image token IDs in labels

    batch["labels"] = labels  # Add labels to the batch

    return batch  # Return the prepared batch

In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
)

[2025-02-03 17:19:42,434] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  trainer = SFTTrainer(


In [17]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,2.7463,2.1164
20,1.537,0.854241
30,0.4878,0.228506
40,0.198,0.161603
50,0.1255,0.053848
60,0.0417,0.032994
70,0.0352,0.031829
80,0.0338,0.030083
90,0.0337,0.030471
100,0.0322,0.029359


Could not locate the best model at qwen2-7b-instruct-trl-sft-distancevlm/checkpoint-140/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=168, training_loss=0.3262629745794194, metrics={'train_runtime': 10090.6292, 'train_samples_per_second': 0.532, 'train_steps_per_second': 0.017, 'total_flos': 2.0735251096872346e+17, 'train_loss': 0.3262629745794194, 'epoch': 3.0})

In [None]:
trainer.save_model(training_args.output_dir)

In [18]:
trainer.save_model("qwen2-7b-instruct-trl-sft-distancevlm")

In [19]:
model.save_pretrained("fine_tuned_qwen2")

SafetensorError: Error while serializing: IoError(Os { code: 122, kind: FilesystemQuotaExceeded, message: "Disk quota exceeded" })

In [35]:
adapter_path = "zekas/qwen2-7b-instruct-trl-sft-distancevlm"
model.load_adapter(adapter_path)

ValueError: Adapter with name default already exists. Please use a different name.

In [46]:
output = generate_text_from_sample(model, processor, train_dataset[13])
output

'To estimate the distance of the bench from the camera, we can use the size of the cars as a reference. Assuming the cars are standard-sized, we can estimate the distance based on their length.\n\n1. The blue car in the foreground is a convertible, which is typically around 4.5 meters long.\n2. The silver car next to it is a sedan, which is also around 4.5 meters long.\n\nGiven that the bench is positioned between these two cars, we can estimate the distance to the bench by considering the length of the cars. If we assume the bench is roughly halfway between the two cars, the distance to the bench would be approximately half the length of the cars.\n\n\\[ \\text{Distance to the bench} \\approx \\frac{4.5 \\text{ meters}}{2} = 2.25 \\text{ meters} \\]\n\nRounded to the next meter, the distance to the bench is approximately 2 meters.'

In [47]:
train_dataset[13]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a Vision Language Model specialized in detecting distances to objects in images.\nYour task is to analyze the provided image and respond to distance-related queries with concise answers, typically a single number or short phrase.\nFocus on delivering precise, accurate distances based on the visual information. Avoid any additional explanation unless absolutely necessary.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': '/hpi/fs00/scratch/alexandra.kudaeva/street-view-data/output/G340890_50m/T/r/O/TrO0dyGjypX9cW4V01Ux3g_left.webp'},
   {'type': 'text',
    'text': 'How far is the bench away from the camera in meters, rounded to the next meter?'}]},
 {'role': 'assistant', 'content': [{'type': 'text', 'text': '14 meters'}]}]