In [1]:
import os
import re
import json
import torch
import pandas as pd
from PIL import Image
import requests
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# Function to extract the first occurrence of a number (with optional meter units)
def extract_meter(text):
    """
    Pull the first numeric value out of `text` and return it as a float.

    The pattern matches the first integer or decimal number in the string; a
    trailing 'meter', 'meters' or 'm' (case-insensitive) is consumed when
    present but is not required for a match.

    Args:
        text (str): Free-form text, e.g. a model answer or a ground-truth label.

    Returns:
        float | None: The parsed number, or None when the text contains no
        number (or the matched digits cannot be converted).
    """
    number_pattern = r"(\d+(?:\.\d+)?)\s*(?:meters?|m)?"
    found = re.search(number_pattern, text, re.IGNORECASE)

    # Guard clause: nothing numeric in the text at all.
    if found is None:
        return None

    try:
        value = float(found.group(1))
    except Exception as e:
        print("Error converting extracted value:", e)
        return None
    return value

def inference_llava16_image(model, processor, image_source, human_text, use_url=False):
    """
    Run inference on a single image using Llava v1.6 and return only the generated answer.

    Args:
        model: The LlavaNextForConditionalGeneration model.
        processor: The LlavaNextProcessor for formatting the prompt and image.
        image_source (str): The image URL (if use_url is True) or local file path.
        human_text (str): The text prompt (with any "<image>" marker removed).
        use_url (bool): Whether the image_source is a URL.

    Returns:
        output_text (str): The newly generated text only. The echoed prompt is
        stripped so that downstream parsing cannot pick up numbers from the question.

    Raises:
        requests.RequestException: If use_url is True and the download fails,
            times out, or returns an HTTP error status.
        OSError: If the image cannot be opened or decoded.
    """
    # Load the image and convert to RGB. `convert` fully decodes the pixels, so the
    # underlying file / HTTP stream can be closed as soon as the `with` block exits.
    if use_url:
        # A timeout prevents one unresponsive host from hanging the whole evaluation.
        with requests.get(image_source, stream=True, timeout=30) as response:
            response.raise_for_status()
            with Image.open(response.raw) as raw_image:
                image = raw_image.convert("RGB")
    else:
        with Image.open(image_source) as raw_image:
            image = raw_image.convert("RGB")

    # Build the conversation template:
    # text prompt first and an image placeholder second.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": human_text},
                {"type": "image"},
            ],
        },
    ]

    # Build the prompt using the processor's chat template.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare the inputs on the same device as the model.
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Generate model output.
    output_ids = model.generate(**inputs, max_new_tokens=100)

    # `generate` returns the prompt tokens followed by the new tokens; drop the prompt
    # so that only the model's answer is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_length:]
    output_text = processor.decode(generated_ids, skip_special_tokens=True)
    return output_text.strip()

def _first_conversation_value(conversations, speaker):
    """Return the text of the first turn whose "from" equals `speaker` (case-insensitive), or ""."""
    for turn in conversations:
        if (turn.get("from") or "").lower() == speaker:
            # A JSON null value is normalised to "" so callers always get a string.
            return turn.get("value") or ""
    return ""


def run_llava16_inference_on_dataset(model, processor, dataset_path, image_root, save_name, use_url=False):
    """
    Iterate over items in the test JSON file, run inference with Llava v1.6, extract meter values,
    compute the absolute error against the ground truth, and save the results as a CSV file.

    Items whose inference raises are reported and recorded with a missing prediction
    instead of aborting the whole run.

    Args:
        model: The LlavaNextForConditionalGeneration model.
        processor: The LlavaNextProcessor.
        dataset_path (str): Path to the JSON file (e.g., "./dataset_new/test.json").
        image_root (str): Root directory for images (if not using URLs).
        save_name (str): Base name for the output CSV file (written to "res/<save_name>.csv").
        use_url (bool): Whether the image paths in the dataset are URLs.

    Returns:
        df: A pandas DataFrame with columns: id, predicted_meter, ground_truth_meter, and error.
    """
    csv_filename = f"res/{save_name}.csv"
    # Create the output directory before the (slow) inference loop so that a missing
    # folder cannot throw away a finished run at the very end.
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    for item in data:
        item_id = item.get("id")
        image_file = item["image"]
        # Leading slashes are stripped so os.path.join does not discard image_root.
        image_source = image_file if use_url else os.path.join(image_root, image_file.lstrip("/"))

        conversations = item.get("conversations") or []

        # First human turn is the prompt. The chat template inserts its own image
        # placeholder, so every literal "<image>" marker is removed wherever it appears.
        human_text = _first_conversation_value(conversations, "human")
        human_text = human_text.replace("<image>", "").strip()

        # First "gpt" turn is the ground-truth answer.
        ground_truth_str = _first_conversation_value(conversations, "gpt")

        try:
            model_output = inference_llava16_image(model, processor, image_source, human_text, use_url=use_url)
        except Exception as e:
            # Best effort: keep evaluating the remaining items.
            print(f"Error processing item {item_id}: {e}")
            model_output = ""

        pred_meter = extract_meter(model_output)
        gt_meter = extract_meter(ground_truth_str)
        if pred_meter is not None and gt_meter is not None:
            error = abs(pred_meter - gt_meter)
        else:
            error = None

        print(f"Item ID: {item_id}")
        print("Predicted Meter:", pred_meter)
        print("Ground Truth:", gt_meter)
        print("Error:", error)
        print("=" * 50)

        results.append({
            "id": item_id,
            "predicted_meter": pred_meter,
            "ground_truth_meter": gt_meter,
            "error": error
        })

    # Save the results as a CSV file.
    df = pd.DataFrame(results)
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")

    return df

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the stock LLaVA v1.6 (Vicuna-7B) processor and model in half precision.
BASE_MODEL_ID = "llava-hf/llava-v1.6-vicuna-7b-hf"
HF_CACHE_DIR = "/hpi/fs00/scratch/liudvikas.zekas/.cache"

processor = LlavaNextProcessor.from_pretrained(BASE_MODEL_ID)
model = LlavaNextForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    cache_dir=HF_CACHE_DIR,
)
model.to("cuda:0")

# Baseline run: evaluate the model without fine-tuning on the test split.
# Image paths are joined onto "/" and results go to res/llava1.6_no_finetune.csv.
df_results = run_llava16_inference_on_dataset(
    model,
    processor,
    dataset_path="./dataset_new/test.json",
    image_root="/",
    save_name="llava1.6_no_finetune",
    use_url=False,
)


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Downloading shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:08<00:00, 22.74s/it]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.49it/s]
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 100.0
Ground Truth: 30.0
Error: 70.0
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 10.0
Ground Truth: 10.0
Error: 0.0
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 10.0
Ground Truth: 27.0
Error: 17.0
Item ID: Tx4CWkxiDU1xV7-uZ6lLrw_right
Predicted Meter: None
Ground Truth: 25.0
Error: None
Item ID: 8QVoY6NmGZBiZz1qi5irCQ_left
Predicted Meter: 100.0
Ground Truth: 25.0
Error: 75.0
Item ID: BBpzL8ql3kRAEMJpc1_HGw_right
Predicted Meter: 700.0
Ground Truth: 24.0
Error: 676.0
Item ID: r-yGTE7GDoTFE49DyCJOQw_front
Predicted Meter: 100.0
Ground Truth: 25.0
Error: 75.0
Item ID: eC7O22NYV9d6KWGMa4vR7A_front
Predicted Meter: 339.99
Ground Truth: 21.0
Error: 318.99
Item ID: mGq1XR8eBPbK9_J0gOAOlQ_back
Predicted Meter: 100.0
Ground Truth: 26.0
Error: 74.0
Item ID: 8oe-e2AHXuj3E1JMMp7LUQ_back
Predicted Meter: 100.0
Ground Truth: 20.0
Error: 80.0
Item ID: Aita8sR2NtHUJE-mmnR5eA_right
Predicted Meter: 2.0
Ground Truth: 26.0
Error: 

In [6]:
# Drop the base model before the fine-tuned checkpoint is loaded. The guard keeps this
# cell re-runnable: a bare `del` raises NameError once the name is already gone.
if "model" in globals():
    del model
# Hand cached, now-unused GPU memory back to the driver (no-op if CUDA was never initialised).
torch.cuda.empty_cache()

In [7]:
# Load the fine-tuned checkpoint in half precision. No new processor is created here:
# the `processor` passed below is the one already in the kernel from the base-model cell.
FINETUNED_CHECKPOINT = "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.6-vicuna-7b_lora-True_qlora-False-custom"
FINETUNED_CACHE_DIR = "/hpi/fs00/scratch/liudvikas.zekas/.cache"

model = LlavaNextForConditionalGeneration.from_pretrained(
    FINETUNED_CHECKPOINT,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    cache_dir=FINETUNED_CACHE_DIR,
)
model.to("cuda:0")

# Evaluate the fine-tuned model on the same test split as the baseline;
# results go to res/llava1.6_finetune.csv.
df_results_finetuned = run_llava16_inference_on_dataset(
    model,
    processor,
    dataset_path="./dataset_new/test.json",
    image_root="/",
    save_name="llava1.6_finetune",
    use_url=False,
)

# Drop the reference to the model once the evaluation is finished.
del model

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.72s/it]


Item ID: ugebwzaohO3D7XbW343_qA_back
Predicted Meter: 25.0
Ground Truth: 30.0
Error: 5.0
Item ID: qz33ZhqXW5DY-2hPzv2CMQ_back
Predicted Meter: 12.0
Ground Truth: 10.0
Error: 2.0
Item ID: z3ndUO5NBsQdcR6Onfs8kw_right
Predicted Meter: 20.0
Ground Truth: 27.0
Error: 7.0
Item ID: Tx4CWkxiDU1xV7-uZ6lLrw_right
Predicted Meter: None
Ground Truth: 25.0
Error: None
Item ID: 8QVoY6NmGZBiZz1qi5irCQ_left
Predicted Meter: 23.0
Ground Truth: 25.0
Error: 2.0
Item ID: BBpzL8ql3kRAEMJpc1_HGw_right
Predicted Meter: 27.0
Ground Truth: 24.0
Error: 3.0
Item ID: r-yGTE7GDoTFE49DyCJOQw_front
Predicted Meter: 27.0
Ground Truth: 25.0
Error: 2.0
Item ID: eC7O22NYV9d6KWGMa4vR7A_front
Predicted Meter: 30.0
Ground Truth: 21.0
Error: 9.0
Item ID: mGq1XR8eBPbK9_J0gOAOlQ_back
Predicted Meter: 23.0
Ground Truth: 26.0
Error: 3.0
Item ID: 8oe-e2AHXuj3E1JMMp7LUQ_back
Predicted Meter: 20.0
Ground Truth: 20.0
Error: 0.0
Item ID: Aita8sR2NtHUJE-mmnR5eA_right
Predicted Meter: 23.0
Ground Truth: 26.0
Error: 3.0
Item ID: UFsdG

In [None]:
# NOTE(review): this cell has no execution count and looks like a leftover from a
# Qwen2-VL notebook. `Qwen2VLForConditionalGeneration`, `run_inference_on_dataset`,
# `test_json_path` and `image_root` are not imported or defined in the visible cells,
# so it would raise NameError on a fresh Restart & Run All — TODO confirm, then fix or remove.
# NOTE(review): the checkpoint path below is the LLaVA checkpoint loaded in the previous
# cell, not a Qwen checkpoint — verify which path was intended.
finetuned_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "/hpi/fs00/scratch/liudvikas.zekas/checkpoints/llava-1.6-vicuna-7b_lora-True_qlora-False-custom",  # Adjust path as necessary.
    torch_dtype="auto",
    device_map="auto",
    cache_dir="/hpi/fs00/scratch/liudvikas.zekas/.cache"
)

print("=== Inference using the Finetuned Qwen Model ===")
# Passes "qwen2_2b_inference" as the last argument; presumably the output name, by analogy
# with `save_name` in run_llava16_inference_on_dataset — confirm against the helper's definition.
df_finetuned = run_inference_on_dataset(finetuned_model, processor, test_json_path, image_root, "qwen2_2b_inference")

# Clean up.
del finetuned_model