In [2]:
%env HF_HOME=/hpi/fs00/scratch/liudvikas.zekas/.cache

env: HF_HOME=/hpi/fs00/scratch/liudvikas.zekas/.cache


In [3]:
import os
import re
import json
from PIL import Image
import torch
import pandas as pd
import torch.nn as nn
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, LlavaForConditionalGeneration

def extract_meter(text):
    """
    Extracts the first occurrence of a number (optionally followed by 'meters', 'meter', or 'm')
    from the given text and returns it as a float.
    """
    # This regex matches a number (integer or decimal) and ignores an optional "meters", "meter" or "m"
    match = re.search(r"(\d+(?:\.\d+)?)\s*(?:meters?|m)?", text, re.IGNORECASE)
    if match:
        try:
            value = float(match.group(1))
            return value
        except Exception as e:
            print("Error converting extracted value:", e)
            return None
    return None

def inference_image(model, processor, image_path, human_text):
    """
    Run inference on a single image.
    
    Args:
        model: The Llava model.
        processor: The processor for formatting the prompt and image.
        image_path (str): Path to the image file.
        human_text (str): The human prompt text from the test JSON.
    
    Returns:
        output (str): The generated answer from the model.
    """
    # Build the conversation prompt including text and an image placeholder.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": human_text},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    
    # Load the image in RGB mode.
    image = Image.open(image_path).convert("RGB")
    
    # Process the text prompt and image (wrap image in a list).
    inputs = processor(text=prompt, images=image, padding=True, return_tensors="pt").to(model.device)
    
    # Generate the model's output.
    output = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    return processor.decode(output[0], skip_special_tokens=True)

def run_inference_on_dataset(model, processor, dataset_path, image_root, save_name):
    """
    Iterate over all items in the test JSON file, perform inference, extract the predicted meter,
    compute the error compared to the ground truth, and save the results in a DataFrame and CSV.
    
    Args:
        model: The Llava model.
        processor: The Llava processor.
        dataset_path (str): Path to the JSON file (e.g., "./dataset/test.json").
        image_root (str): Directory where the image files are stored.
        save_name (str): Name for the output CSV file (without extension).
    
    Returns:
        df: A pandas DataFrame with columns: id, predicted_meter, ground_truth_meter, and error.
    """
    results = []
    
    with open(dataset_path, "r") as f:
        data = json.load(f)
    
    for item in data:
        item_id = item.get("id")
        image_file = item["image"]
        image_path = os.path.join(image_root, image_file)
        
        # Extract the human prompt (first "human" message).
        human_text = ""
        for conv in item.get("conversations", []):
            if conv.get("from", "").lower() == "human":
                human_text = conv.get("value", "")
                break
        
        # Remove the "<image>" part from the beginning, if present.
        if human_text.startswith("<image>"):
            human_text = human_text[len("<image>"):].strip()
        
        # Extract the ground truth answer (first "gpt" message).
        ground_truth_str = ""
        for conv in item.get("conversations", []):
            if conv.get("from", "").lower() == "gpt":
                ground_truth_str = conv.get("value", "")
                break
        
        # Run inference.
        model_output = inference_image(model, processor, image_path, human_text)
        print(model_output)
        
        # Extract predicted meter from model output and ground truth meter.
        pred_meter = extract_meter(model_output)
        gt_meter = extract_meter(ground_truth_str)
        
        # Compute the absolute error if both numbers are available.
        error = None
        if pred_meter is not None and gt_meter is not None:
            error = abs(pred_meter - gt_meter)
        
        # Print the results.
        print(f"Item ID: {item_id}")
        print("Predicted Meter:", pred_meter)
        print("Ground Truth:", gt_meter)
        print("Error:", error)
        print("=" * 50)
        
        # Save results.
        results.append({
            "id": item_id,
            "predicted_meter": pred_meter,
            "ground_truth_meter": gt_meter,
            "error": error
        })
    
    # Create a DataFrame from the results and save to CSV.
    df = pd.DataFrame(results)
    csv_filename = f"res/{save_name}.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")
    
    return df


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import re
import json
from PIL import Image
import torch
import pandas as pd
import torch.nn as nn
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, LlavaForConditionalGeneration

def extract_meter(text):
    """
    Extracts the first occurrence of a number (optionally followed by 'meters', 'meter', or 'm')
    from the given text and returns it as a float.
    """
    # This regex matches a number (integer or decimal) and ignores an optional "meters", "meter" or "m"
    match = re.search(r"(\d+(?:\.\d+)?)\s*(?:meters?|m)?", text, re.IGNORECASE)
    if match:
        try:
            value = float(match.group(1))
            return value
        except Exception as e:
            print("Error converting extracted value:", e)
            return None
    return None

def inference_image(model, processor, image_path, human_text):
    """
    Run inference on a single image.
    
    Args:
        model: The Llava model.
        processor: The processor for formatting the prompt and image.
        image_path (str): Path to the image file.
        human_text (str): The human prompt text from the test JSON.
    
    Returns:
        output (str): The generated answer from the model.
    """
    # Build the conversation prompt including text and an image placeholder.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": human_text},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    
    # Load the image in RGB mode.
    image = Image.open(image_path).convert("RGB")
    
    # Process the text prompt and image (wrap image in a list).
    inputs = processor(text=prompt, images=image, padding=True, return_tensors="pt").to(model.device)
    
    # Generate the model's output.
    #output = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    #print(processor.decode(output[0], skip_special_tokens=True))

    # Run the forward pass to get regression output.
    with torch.no_grad():
        output = model(**inputs)
    # Assuming the output is a tensor of shape (batch_size, 1), convert it to a float.
    predicted_meter = output.item()
    return predicted_meter

def run_inference_on_dataset(model, processor, dataset_path, image_root, save_name):
    """
    Iterate over all items in the test JSON file, perform inference, compare the predicted meter 
    against the ground truth, and save the results in a DataFrame and CSV.
    
    Args:
        model: The Llava regression model.
        processor: The processor.
        dataset_path (str): Path to the JSON file (e.g., "./dataset/test.json").
        image_root (str): Directory where the image files are stored.
        save_name (str): Name for the output CSV file (without extension).
    
    Returns:
        df: A pandas DataFrame with columns: id, predicted_meter, ground_truth_meter, and error.
    """
    results = []
    
    with open(dataset_path, "r") as f:
        data = json.load(f)
    
    for item in data:
        item_id = item.get("id")
        image_file = item["image"]
        image_path = os.path.join(image_root, image_file)
        
        # Extract the human prompt (first "human" message).
        human_text = ""
        for conv in item.get("conversations", []):
            if conv.get("from", "").lower() == "human":
                human_text = conv.get("value", "")
                break
        
        # Remove the "<image>" part from the beginning, if present.
        if human_text.startswith("<image>"):
            human_text = human_text[len("<image>"):].strip()
        
        # Extract the ground truth answer (first "gpt" message).
        ground_truth_str = ""
        for conv in item.get("conversations", []):
            if conv.get("from", "").lower() == "gpt":
                ground_truth_str = conv.get("value", "")
                break
        
        # Run inference using the regression head.
        predicted_meter = inference_image(model, processor, image_path, human_text)
        print(f"Predicted meter: {predicted_meter}")
        
        # Extract ground truth meter from the ground truth text.
        gt_meter = extract_meter(ground_truth_str)
        
        # Compute the absolute error if both numbers are available.
        error = None
        if predicted_meter is not None and gt_meter is not None:
            error = abs(predicted_meter - gt_meter)
        
        # Print the results.
        print(f"Item ID: {item_id}")
        print("Predicted Meter:", predicted_meter)
        print("Ground Truth:", gt_meter)
        print("Error:", error)
        print("=" * 50)
        
        # Save results.
        results.append({
            "id": item_id,
            "predicted_meter": predicted_meter,
            "ground_truth_meter": gt_meter,
            "error": error
        })
    
    # Create a DataFrame from the results and save to CSV.
    df = pd.DataFrame(results)
    csv_filename = f"res/{save_name}.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")
    
    return df

In [4]:
# Specify the path to your JSON test file and the directory where images are stored.
test_json_path = "./dataset_new/test.json"  # Adjust if necessary.
image_root = "/"                # Adjust if your images are located elsewhere.

# Load the processor (ensure this is the one that supports image input).
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


In [10]:
test_json_path = "./dataset_new/train.json"

In [5]:
# Load the original model.
old_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)
#old_model = old_model.to_empty(0)  # moves model from meta to GPU 0


print("=== Inference using the Original Model ===")
df_no_train = run_inference_on_dataset(old_model, processor, test_json_path, image_root, "llava1.5_no_finetune")
del old_model  # Clean up if needed.



Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.14it/s]


=== Inference using the Original Model ===


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: The gas station is approximately 100 meters away from the camera, rounded to the next meter.
Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 100.0
Ground Truth: 30.0
Error: 70.0
USER:  
How far is the post box away from the camera in meters, rounded to the next meter? ASSISTANT: The post box is approximately 10 meters away from the camera, rounded to the next meter.
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 10.0
Ground Truth: 10.0
Error: 0.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: The gas station is approximately 100 meters away from the camera.
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 100.0
Ground Truth: 27.0
Error: 73.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: The gas station is approximately 10 meters away from the camera

In [5]:
# Load the finetuned model.
new_model = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-custom",
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)

print("=== Inference using the Finetuned Model ===")
df_default_train = run_inference_on_dataset(new_model, processor, test_json_path, image_root, "llava1.5_no_finetune")
del new_model


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.57s/it]


=== Inference using the Finetuned Model ===
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 22 
Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 22.0
Ground Truth: 30.0
Error: 8.0
USER:  
How far is the post box away from the camera in meters, rounded to the next meter? ASSISTANT: 12 
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 12.0
Ground Truth: 10.0
Error: 2.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 20 
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 20.0
Ground Truth: 27.0
Error: 7.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 21 
Item ID: Tx4CWkxiDU1xV7-uZ6lLrw_right
Predicted Meter: 21.0
Ground Truth: 25.0
Error: 4.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 27 
Item ID: 8QVoY6NmGZBiZz1qi5irCQ_left
Predicted Meter: 27

In [7]:
# Load the finetuned model.
new_model = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-custom",
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)

print("=== Inference using the Finetuned Model ===")
df_custom_train = run_inference_on_dataset(new_model, processor, test_json_path, image_root, "llava1.5_finetune")
del new_model


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.10s/it]


=== Inference using the Finetuned Model ===
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 22 
Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 22.0
Ground Truth: 30.0
Error: 8.0
USER:  
How far is the post box away from the camera in meters, rounded to the next meter? ASSISTANT: 12 
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 12.0
Ground Truth: 10.0
Error: 2.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 20 
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 20.0
Ground Truth: 27.0
Error: 7.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 21 
Item ID: Tx4CWkxiDU1xV7-uZ6lLrw_right
Predicted Meter: 21.0
Ground Truth: 25.0
Error: 4.0
USER:  
How far is the gas station away from the camera in meters, rounded to the next meter? ASSISTANT: 27 
Item ID: 8QVoY6NmGZBiZz1qi5irCQ_left
Predicted Meter: 27

In [4]:
# Load the finetuned model.
new_model = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp",
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)

print("=== Inference using the Finetuned Model ===")
#df_custom_train = run_inference_on_dataset(new_model, processor, test_json_path, image_root, "inference_custom_finetune_exp")
#del new_model


Loading checkpoint shards: 100%|██████████████████| 3/3 [00:14<00:00,  4.82s/it]
Loading adapter weights from /hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp led to unexpected keys not found in the model:  ['bruh_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.o_proj.lora_A.default.weight', 'bruh_model.language_model.mode

=== Inference using the Finetuned Model ===


In [6]:
new_model

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out

In [5]:
class LlavaRegressionModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model  
        self.regression_head = nn.Linear(4096, 1)
        # Cast regression_head to the same dtype as the base model (likely float16)
        self.regression_head = self.regression_head.to(next(base_model.parameters()).dtype)
        
        # Copy public attributes from base_model to self
        for attr in dir(base_model):
            if not attr.startswith('__') and not hasattr(self, attr):
                try:
                    setattr(self, attr, getattr(base_model, attr))
                except Exception:
                    pass

    def forward(self, input_ids, attention_mask=None, pixel_values=None, **kwargs):
        kwargs["output_hidden_states"] = True
        print("YES")
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            **kwargs
        )
        hidden_states = outputs.hidden_states[-1]
        pooled = hidden_states.mean(dim=1)
        regression_output = self.regression_head(pooled)
        return regression_output


In [6]:
def remap_checkpoint(checkpoint):
    new_checkpoint = {}
    prefix_to_remove = "base_model.model."
    for key, value in checkpoint.items():
        new_key = key
        # Remove the prefix "base_model.model." if present.
        if new_key.startswith(prefix_to_remove):
            new_key = new_key[len(prefix_to_remove):]
        # Replace "bruh_model" with "base_model" at the beginning.
        if new_key.startswith("bruh_model"):
            new_key = "base_model" + new_key[len("bruh_model"):]
        # Rename keys ending with ".weight" to end with ".default.weight"
        # but skip keys that contain "regression_head".
        if "regression_head" not in new_key and new_key.endswith(".weight") and not new_key.endswith(".default.weight"):
            new_key = new_key[:-len(".weight")] + ".default.weight"
        new_checkpoint[new_key] = value
    return new_checkpoint


In [11]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test/adapter_model.safetensors"

old_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)

model = LlavaRegressionModel(old_model)

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
finetuned_model = model.from_pretrained(
    finetuned_model_path,
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.39s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 31.74 GiB of which 11.38 MiB is free. Including non-PyTorch memory, this process has 31.72 GiB memory in use. Of the allocated memory 31.42 GiB is allocated by PyTorch, and 9.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp/checkpoint-1368"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp/checkpoint-1368/adapter_model.safetensors"

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



# Load the finetuned model (adapter weights load into the base model with hierarchical keys).
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    finetuned_model_path,
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

# Load the adapter checkpoint directly onto the GPU (cuda:0)
adapter_checkpoint = load_file(adapter_model_path, device="cpu")

print("Checkpoint keys:")
for key in sorted(adapter_checkpoint.keys()):
    print(key)

remapped_checkpoint = remap_checkpoint(adapter_checkpoint)

print("Checkpoint keys:")
for key in sorted(remapped_checkpoint.keys()):
    print(key)

regression_model = LlavaRegressionModel(finetuned_model)

# Load the remapped state dict into the base model.
regression_model.load_state_dict(remapped_checkpoint, strict=False)
#regression_model.load_state_dict(adapter_checkpoint, strict=False)

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


OSError: Incorrect path_or_model_id: '/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp/checkpoint-1368'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [12]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False/adapter_model.safetensors"

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



# Load the finetuned model (adapter weights load into the base model with hierarchical keys).
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    finetuned_model_path,
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

# Load the adapter checkpoint directly onto the GPU (cuda:0)
adapter_checkpoint = load_file(adapter_model_path, device="cpu")

print("Checkpoint keys:")
for key in sorted(adapter_checkpoint.keys()):
    print(key)

remapped_checkpoint = remap_checkpoint(adapter_checkpoint)

print("Checkpoint keys:")
for key in sorted(remapped_checkpoint.keys()):
    print(key)

# Load the remapped state dict into the base model.
finetuned_model.load_state_dict(remapped_checkpoint, strict=False)


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.69s/it]


Checkpoint keys:
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.v_proj.lora_

_IncompatibleKeys(missing_keys=['vision_tower.vision_model.embeddings.class_embedding', 'vision_tower.vision_model.embeddings.patch_embedding.weight', 'vision_tower.vision_model.embeddings.position_embedding.weight', 'vision_tower.vision_model.pre_layrnorm.weight', 'vision_tower.vision_model.pre_layrnorm.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'vi

In [41]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False/adapter_model.safetensors"

# Load the adapter checkpoint directly onto the GPU (cuda:0)
adapter_checkpoint = load_file(adapter_model_path, device="cpu")

print("Checkpoint keys:")
for key in sorted(adapter_checkpoint.keys()):
    print(key)

Checkpoint keys:
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.v_proj.lora_

In [7]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False/adapter_model.safetensors"

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



# Load the finetuned model (adapter weights load into the base model with hierarchical keys).
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    finetuned_model_path,
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

# Load the adapter checkpoint directly onto the GPU (cuda:0)
adapter_checkpoint = load_file(adapter_model_path, device="cpu")

print("Checkpoint keys:")
for key in sorted(adapter_checkpoint.keys()):
    print(key)

remapped_checkpoint = remap_checkpoint(adapter_checkpoint)

print("Checkpoint keys:")
for key in sorted(remapped_checkpoint.keys()):
    print(key)

# Load the remapped state dict into the base model.
finetuned_model.load_state_dict(remapped_checkpoint, strict=False)


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:08<00:00,  2.93s/it]


Checkpoint keys:
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.down_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.gate_proj.lora_B.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_A.weight
base_model.model.language_model.model.layers.0.mlp.up_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.k_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.o_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_A.weight
base_model.model.language_model.model.layers.0.self_attn.q_proj.lora_B.weight
base_model.model.language_model.model.layers.0.self_attn.v_proj.lora_

_IncompatibleKeys(missing_keys=['vision_tower.vision_model.embeddings.class_embedding', 'vision_tower.vision_model.embeddings.patch_embedding.weight', 'vision_tower.vision_model.embeddings.position_embedding.weight', 'vision_tower.vision_model.pre_layrnorm.weight', 'vision_tower.vision_model.pre_layrnorm.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'vi

In [32]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp"
adapter_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp/adapter_model.safetensors"

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")



# Load the finetuned model (adapter weights load into the base model with hierarchical keys).
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    finetuned_model_path,
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

# Load the adapter checkpoint directly onto the GPU (cuda:0)
adapter_checkpoint = load_file(adapter_model_path, device="cpu")

print("Checkpoint keys:")
for key in sorted(adapter_checkpoint.keys()):
    print(key)

remapped_checkpoint = remap_checkpoint(adapter_checkpoint)

print("Checkpoint keys:")
for key in sorted(remapped_checkpoint.keys()):
    print(key)

regression_model = LlavaRegressionModel(finetuned_model).to(0)

# Load the remapped state dict into the base model.
regression_model.load_state_dict(remapped_checkpoint, strict=False)


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:11<00:00,  3.84s/it]
Loading adapter weights from /hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-exp led to unexpected keys not found in the model:  ['bruh_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_B.default.weight', 'bruh_model

Checkpoint keys:
base_model.model.bruh_model.language_model.model.layers.0.mlp.down_proj.lora_A.weight
base_model.model.bruh_model.language_model.model.layers.0.mlp.down_proj.lora_B.weight
base_model.model.bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_A.weight
base_model.model.bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_B.weight
base_model.model.bruh_model.language_model.model.layers.0.mlp.up_proj.lora_A.weight
base_model.model.bruh_model.language_model.model.layers.0.mlp.up_proj.lora_B.weight
base_model.model.bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_A.weight
base_model.model.bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_B.weight
base_model.model.bruh_model.language_model.model.layers.0.self_attn.o_proj.lora_A.weight
base_model.model.bruh_model.language_model.model.layers.0.self_attn.o_proj.lora_B.weight
base_model.model.bruh_model.language_model.model.layers.0.self_attn.q_proj.lora_A.weight
base_model.model.bruh_mode

_IncompatibleKeys(missing_keys=['base_model.vision_tower.vision_model.embeddings.class_embedding', 'base_model.vision_tower.vision_model.embeddings.patch_embedding.weight', 'base_model.vision_tower.vision_model.embeddings.position_embedding.weight', 'base_model.vision_tower.vision_model.pre_layrnorm.weight', 'base_model.vision_tower.vision_model.pre_layrnorm.bias', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.base_layer.weight', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.base_layer.bias', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.lora_A.default.weight', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.lora_B.default.weight', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.base_layer.weight', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.base_layer.bias', 'base_model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.lora

In [34]:
# Load the regression head state dict and capture the result.
incompat_keys = regression_model.load_state_dict(remapped_checkpoint, strict=False)

# Get all keys expected by the regression head.
expected_keys = set(regression_model.state_dict().keys())

# The keys that were successfully loaded are those that are in expected_keys but not in missing_keys.
loaded_keys = expected_keys - set(incompat_keys.missing_keys)

print("Successfully loaded keys:")
for key in sorted(loaded_keys):
    print(key)

print("\nMissing keys:")
print(incompat_keys.missing_keys)

print("\nUnexpected keys in the checkpoint:")
print(incompat_keys.unexpected_keys)


Successfully loaded keys:
base_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight
base_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight
base_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight
base_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight
base_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight
base_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight
base_model.language_model.model.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.language_model.model.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.language_model.model.layers.0.self_attn.o_proj.lora_A.default.weight
base_model.language_model.model.layers.0.self_attn.o_proj.lora_B.default.weight
base_model.language_model.model.layers.0.self_attn.q_proj.lora_A.default.weight
base_model.language_model.model.layers.0.self_attn.q_proj.lora_B.default.weight
base_model.language_model.model.laye

In [38]:
regression_model.regression_head.weight

Parameter containing:
tensor([[-0.0031,  0.0051,  0.0101,  ..., -0.0048, -0.0114, -0.0093]],
       dtype=torch.float16, requires_grad=True)

In [31]:
del finetuned_model
del regression_model

In [14]:
from safetensors.torch import load_file
cache_dir = "/hpi/fs00/scratch/liudvikas.zekas/.cache"
finetuned_model_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test"

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# Load the finetuned model (adapter weights load into the base model with hierarchical keys).
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    cache_dir=cache_dir
).to(0)

print(finetuned_model)

finetuned_model.load_adapter("/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test")
regression_model = LlavaRegressionModel(finetuned_model).to(0)

results = run_inference_on_dataset(
    regression_model,
    processor,
    test_json_path,
    image_root,
    "inference_custom_finetune"
)

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:03<00:00,  1.26s/it]


LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q

Loading adapter weights from /hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test led to unexpected keys not found in the model:  ['base_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.self_attn.k_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.self_attn.o_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.self_attn.o_proj.lora_B.default.weight', 'base_model.language_model.m

tensor([[0.3733]], device='cuda:0', dtype=torch.float16)
torch.Size([1, 1])
Predicted meter: 0.373291015625
Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 0.373291015625
Ground Truth: 30.0
Error: 29.626708984375
tensor([[0.7969]], device='cuda:0', dtype=torch.float16)
torch.Size([1, 1])
Predicted meter: 0.796875
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 0.796875
Ground Truth: 10.0
Error: 9.203125
tensor([[0.4827]], device='cuda:0', dtype=torch.float16)
torch.Size([1, 1])
Predicted meter: 0.482666015625
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 0.482666015625
Ground Truth: 27.0
Error: 26.517333984375
tensor([[0.1749]], device='cuda:0', dtype=torch.float16)
torch.Size([1, 1])
Predicted meter: 0.1749267578125
Item ID: Tx4CWkxiDU1xV7-uZ6lLrw_right
Predicted Meter: 0.1749267578125
Ground Truth: 25.0
Error: 24.8250732421875
tensor([[0.5581]], device='cuda:0', dtype=torch.float16)
torch.Size([1, 1])
Predicted meter: 0.55810546875
Item ID: 8QVoY6NmGZBiZz1qi5irCQ

In [17]:
dic = torch.load("/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test/adapter_model.safetensors", weights_only = False)

UnpicklingError: could not find MARK

In [14]:
finetuned_model1 = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test",
    torch_dtype=torch.float16,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
)
finetuned_model2 = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-custom",
    torch_dtype=torch.float16,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
)
print(finetuned_model1)
print(finetuned_model2)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.59s/it]
Loading adapter weights from /hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test led to unexpected keys not found in the model:  ['base_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight', 'base_model.language_model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'base_model.language_model.model.layers.0.self_attn.k_proj.lora_B.default.

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q

In [None]:
# Load the finetuned model as usual.
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-test",
    torch_dtype=torch.float16,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
).to(0)

# Extract the base model (which now has the adapter weights properly loaded)
base_model = finetuned_model.base_model

# Wrap the base model in your flattened regression model.
regression_model = LlavaRegressionModel(base_model).to(0)

# Now run inference using your regression_model.
print("=== Inference using the Finetuned Regression Model ===")
df_custom_train = run_inference_on_dataset(
    regression_model,
    processor,
    test_json_path,
    image_root,
    "inference_custom_finetune"
)

# Optionally, clean up.
del finetuned_model, regression_model


In [12]:
regression_model = model.to(0)

# Now run inference using your regression_model.
print("=== Inference using the Finetuned Regression Model ===")
df_custom_train = run_inference_on_dataset(
    regression_model,
    processor,
    test_json_path,
    image_root,
    "inference_custom_finetune_BRUH"
)

=== Inference using the Finetuned Regression Model ===


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


AttributeError: 'LlavaCausalLMOutputWithPast' object has no attribute 'item'

In [11]:
# Now run inference using your regression_model.
print("=== Inference using the Finetuned Regression Model ===")
df_custom_train = run_inference_on_dataset(
    regression_model,
    processor,
    test_json_path,
    image_root,
    "inference_custom_finetune_BRUH"
)

=== Inference using the Finetuned Regression Model ===
Predicted meter: 16.6875
Item ID: YeeOSjXEgLAKHQEV8zyT8Q_right
Predicted Meter: 16.6875
Ground Truth: 20.0
Error: 3.3125
Predicted meter: 16.125
Item ID: RypdZclJzYYuJ9lrRJB97w_left
Predicted Meter: 16.125
Ground Truth: 21.0
Error: 4.875
Predicted meter: 17.65625
Item ID: if_6Golbk4UT-lUBDpzmJg_right
Predicted Meter: 17.65625
Ground Truth: 28.0
Error: 10.34375
Predicted meter: 17.0
Item ID: 2QJJIdNRdpXXRcTbY5LzWA_right
Predicted Meter: 17.0
Ground Truth: 29.0
Error: 12.0
Predicted meter: 16.046875
Item ID: 51dJTqrKEv54IA11ZYtLvg_back
Predicted Meter: 16.046875
Ground Truth: 25.0
Error: 8.953125
Predicted meter: 18.15625
Item ID: KD0EjEFJVvU_nX9Vlk2pQQ_front
Predicted Meter: 18.15625
Ground Truth: 28.0
Error: 9.84375
Predicted meter: 15.71875
Item ID: VIQ_5CQZtl5hvS4FSVF79g_right
Predicted Meter: 15.71875
Ground Truth: 21.0
Error: 5.28125
Predicted meter: 15.765625
Item ID: OA6YEEF9yFSiwDJmnpW-vA_back
Predicted Meter: 15.765625
Grou


KeyboardInterrupt



In [50]:
df_custom_train['error'].mean()

np.float64(6.964163822525597)

In [7]:
print(df_no_train['error'].mean())
print(df_default_train['error'].mean())
print(df_custom_train['error'].mean())


53.419795221843
4.273037542662116
3.7610921501706485


In [36]:
del finetuned_model
del regression_model

In [7]:
import argparse

import torch
import transformers
from peft import PeftModel
from transformers import AutoProcessor, LlavaForConditionalGeneration, PreTrainedTokenizer, AutoConfig

In [8]:
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", add_eos_token=True)
tokenizer = processor.tokenizer
model = LlavaForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp",
    torch_dtype=torch.float16,
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
)
config = AutoConfig.from_pretrained("/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp")

#model, tokenizer, processor, config = loader.load(args.load_model)
print(model)

print("Loading LoRA weights...")
model = PeftModel.from_pretrained(model, "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp")
print("Merging LoRA weights...")
model = model.merge_and_unload()
print("Model is loaded...")
#model.save_pretrained(model_save_path)

#tokenizer.save_pretrained(model_save_path)
#processor.save_pretrained(model_save_path)

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.76it/s]
Loading adapter weights from /hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.5-7b_lora-True_qlora-False-acc-exp led to unexpected keys not found in the model:  ['bruh_model.language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.gate_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_A.default.weight', 'bruh_model.language_model.model.layers.0.mlp.up_proj.lora_B.default.weight', 'bruh_model.language_model.model.layers.0.self_attn.k_proj.lora_A.def

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out

In [10]:
model_save_path = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/new_model"
model.save_pretrained(model_save_path)

tokenizer.save_pretrained(model_save_path)
processor.save_pretrained(model_save_path)

[2025-03-06 03:00:49,456] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/hpi/fs00/home/liudvikas.zekas/miniconda3/envs/lmms-finetune/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


UnboundLocalError: local variable 'active_adapters' referenced before assignment

In [11]:
model

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q