In [139]:
import re
import pandas as pd

def parse_raw_json(raw):
    if not raw or pd.isna(raw):
        return {"answer": None, "confidence": None, "rationale": None}

    raw = str(raw)

    # Step 1: Extract the 'raw' value if it's a dict-like wrapper
    raw_match = re.search(r"'raw':\s*(.*)", raw, re.DOTALL)
    if raw_match:
        raw = raw_match.group(1).strip()
        # Remove trailing comma or closing brace if present
        raw = re.sub(r"[},]\s*$", "", raw)

    # Step 2: Remove ```json or ```
    raw = re.sub(r"```(?:json)?", "", raw)

    # Step 3: Normalize whitespace and line breaks
    raw = raw.replace("\n", " ").replace("\t", " ").strip()

    # Step 4: Convert single quotes to double quotes for simple key/value parsing
    raw = raw.replace("'", '"')

    # Step 5: Try to extract fields manually using regex
    def extract_field(name):
        pattern = rf'"{name}"\s*:\s*"([^"]*?)"'
        match = re.search(pattern, raw, re.DOTALL)
        return match.group(1).strip() if match else None

    answer = extract_field("answer")
    confidence = extract_field("confidence")
    rationale = extract_field("rationale")

    # Step 6: Try to parse confidence as float
    try:
        confidence = float(confidence) if confidence is not None else None
    except:
        confidence = None

    return {
        "answer": answer,
        "confidence": confidence,
        "rationale": rationale
    }

In [None]:
import pandas as pd

models = ["llava-v1.6-mistral-7b-hf", "Qwen2.5-VL-7B-Instruct", "InternVL3-8B"]
datasets = ["cvr", "bp", "marsvqa", "raven"]
vers = ["ver1", "ver2", "ver3"]
strategies = ["classification", "direct", "contrastive", "descriptive"]
results = ["results", "evaluation_results", "all_results_concat"]

for dataset in datasets:
    for strategy in strategies:
        for model in models:
            for ver in vers:
                for result in results:
                    if result == "all_results_concat":
                        path = f"../results/{result}.csv"
                    else:
                        path = f"../results/{dataset}/{strategy}/{model}/{ver}/{result}.csv"
                    try:
                        df = pd.read_csv(
                            path,
                            dtype={"problem_id": str}, 
                        )
                    except FileNotFoundError:
                        print(f"File not found: {path}, trying alternative path.")
                        continue

                    df["problem_id"] = df["problem_id"].str.strip()

                    if "reasoning" in df.columns and "judge_rationale" not in df.columns:
                        df.rename(columns={"reasoning": "judge_rationale"}, inplace=True)

                    mask = df["answer"].isna() | (df["answer"] == '')

                    parsed = df.loc[mask, "raw_response"].apply(parse_raw_json)

                    df.loc[mask, "answer"] = parsed.apply(lambda x: x["answer"])
                    df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
                    df.loc[mask, "rationale"] = parsed.apply(lambda x: x["rationale"])

                    df.to_csv(path, index=False)


Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver1
9 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/classification/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/classification/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/classification/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/classification/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | classification | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver2
File not found: ../results/cvr/classification/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: cvr | classification | InternVL3-8B | ver2
File not found: ../results/cvr/classification/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | classification | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | classification | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/direct/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/direct/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/direct/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/direct/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | direct | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver2
File not found: ../results/cvr/direct/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: cvr | direct | InternVL3-8B | ver2
File not found: ../results/cvr/direct/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | direct | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | direct | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/contrastive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/contrastive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/contrastive/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/contrastive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver2
File not found: ../results/cvr/contrastive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: cvr | contrastive | InternVL3-8B | ver2
File not found: ../results/cvr/contrastive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | contrastive | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | contrastive | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/descriptive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/cvr/descriptive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/descriptive/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/cvr/descriptive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: cvr | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver2
File not found: ../results/cvr/descriptive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: cvr | descriptive | InternVL3-8B | ver2
File not found: ../results/cvr/descriptive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: cvr | descriptive | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: cvr | descriptive | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver1
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver1
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | classification | InternVL3-8B | ver2
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver2
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | classification | InternVL3-8B | ver3
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver3
0 rows to fix
Processing: bp | classification | InternVL3-8B | ver3
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver2
3 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver2
3 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver3
3 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver3
3 rows to fix
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver1
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver1
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | direct | InternVL3-8B | ver2
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver2
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | direct | InternVL3-8B | ver3
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver3
0 rows to fix
Processing: bp | direct | InternVL3-8B | ver3
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | contrastive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: bp | contrastive | InternVL3-8B | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver2
2 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver2
2 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: bp | descriptive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver2
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: bp | descriptive | InternVL3-8B | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/classification/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/classification/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | classification | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/classification/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/classification/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver2
File not found: ../results/marsvqa/classification/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: marsvqa | classification | InternVL3-8B | ver2
File not found: ../results/marsvqa/classification/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | classification | InternVL3-8B | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | classification | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | classification | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/direct/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/direct/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | direct | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/direct/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/direct/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | direct | Qwen2.5-VL-7B-Instruct | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | direct | InternVL3-8B | ver2
File not found: ../results/marsvqa/direct/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: marsvqa | direct | InternVL3-8B | ver2
File not found: ../results/marsvqa/direct/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | direct | InternVL3-8B | ver2
0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | direct | InternVL3-8B | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/contrastive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/contrastive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | contrastive | llava-v1.6-mistral-7b-hf | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/contrastive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | contrastive | Qwen2.5-VL-7B-Instruct | ver3


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | contrastive | InternVL3-8B | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | contrastive | InternVL3-8B | ver2
File not found: ../results/marsvqa/contrastive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: marsvqa | contrastive | InternVL3-8B | ver2
File not found: ../results/marsvqa/contrastive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | contrastive | InternVL3-8B | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/descriptive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/marsvqa/descriptive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver3
1 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver3
1 rows to fix
Processing: marsvqa | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver1


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/descriptive/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/marsvqa/descriptive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver2
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver1
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: marsvqa | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver2
File not found: ../results/marsvqa/descriptive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: marsvqa | descriptive | InternVL3-8B | ver2
File not found: ../results/marsvqa/descriptive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: marsvqa | descriptive | InternVL3-8B | ver2
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: marsvqa | descriptive | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/classification/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/classification/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | classification | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/classification/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/classification/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | classification | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | classification | InternVL3-8B | ver1
0 rows to fix
Processing: raven | classification | InternVL3-8B | ver1
0 rows to fix
Processing: raven | classification | InternVL3-8B | ver1
0 rows to fix
Processing: raven | classification | InternVL3-8B | ver2
File not found: ../results/raven/classification/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: raven | classification | InternVL3-8B | ver2
File not found: ../results/raven/classification/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | classification | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | classification | InternVL3-8B | ver3
0 rows to fix
Processing: raven | classification | InternVL3-8B | ver3
0 rows to fix
Processing: raven | classification | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/direct/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/direct/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | direct | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/direct/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/direct/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | direct | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | direct | InternVL3-8B | ver1
0 rows to fix
Processing: raven | direct | InternVL3-8B | ver1
0 rows to fix
Processing: raven | direct | InternVL3-8B | ver1
0 rows to fix
Processing: raven | direct | InternVL3-8B | ver2
File not found: ../results/raven/direct/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: raven | direct | InternVL3-8B | ver2
File not found: ../results/raven/direct/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | direct | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | direct | InternVL3-8B | ver3
0 rows to fix
Processing: raven | direct | InternVL3-8B | ver3
0 rows to fix
Processing: raven | direct | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/contrastive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/contrastive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | contrastive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/contrastive/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/contrastive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | contrastive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver2
File not found: ../results/raven/contrastive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: raven | contrastive | InternVL3-8B | ver2
File not found: ../results/raven/contrastive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | contrastive | InternVL3-8B | ver2
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver3
0 rows to fix
Processing: raven | contrastive | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver1
0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/descriptive/llava-v1.6-mistral-7b-hf/ver2/results.csv, trying alternative path.
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver2
File not found: ../results/raven/descriptive/llava-v1.6-mistral-7b-hf/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix
Processing: raven | descriptive | llava-v1.6-mistral-7b-hf | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver1
0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/descriptive/Qwen2.5-VL-7B-Instruct/ver2/results.csv, trying alternative path.
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver2
File not found: ../results/raven/descriptive/Qwen2.5-VL-7B-Instruct/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix
Processing: raven | descriptive | Qwen2.5-VL-7B-Instruct | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


Processing: raven | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver1
0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver2
File not found: ../results/raven/descriptive/InternVL3-8B/ver2/results.csv, trying alternative path.
Processing: raven | descriptive | InternVL3-8B | ver2
File not found: ../results/raven/descriptive/InternVL3-8B/ver2/evaluation_results.csv, trying alternative path.
Processing: raven | descriptive | InternVL3-8B | ver2


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver3
0 rows to fix
Processing: raven | descriptive | InternVL3-8B | ver3
0 rows to fix


  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])


# To run

In [130]:
import json
from pathlib import Path
from collections import OrderedDict


def infer_task_type(dataset_value: str) -> str:
    return "open-ended" if dataset_value == "bp" else "close-ended"


def insert_task_type(config: OrderedDict, task_type_value: str) -> OrderedDict:
    """
    Insert 'task_type' between 'image_format' and 'category'.
    """
    if "task_type" in config:
        return config

    new_config = OrderedDict()
    inserted = False

    for key, value in config.items():
        new_config[key] = value
        if key == "image_format":
            new_config["task_type"] = task_type_value
            inserted = True

    if not inserted:
        raise KeyError("'image_format' not found in config")

    return new_config


def insert_prompt_number(data: OrderedDict) -> OrderedDict:
    """
    Insert 'prompt_number' immediately after 'param_set_number'.
    """
    if "prompt_number" in data:
        return data

    new_data = OrderedDict()
    inserted = False

    for key, value in data.items():
        new_data[key] = value
        if key == "param_set_number":
            new_data["prompt_number"] = 1
            inserted = True

    if not inserted:
        raise KeyError("'param_set_number' not found")

    return new_data


def load_prompt(path: Path) -> str | None:

    if not path.is_file():
        # print("[WARN] File does not exist")
        return None

    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            text = f.read()

        return text
    except Exception as e:
        print("[ERROR] Manual read failed:", e)
        return None


def insert_example_prompts(
    data: OrderedDict,
    example_prompt: str | None,
    describe_example_prompt: str | None,
    contrast_example_prompt: str | None,
) -> OrderedDict:
    """
    Insert example prompts immediately after 'describe_prompt'
    and prevent later overwrites.
    """
    new_data = OrderedDict()
    inserted = False
    skip_keys = {
        "example_prompt",
        "describe_example_prompt",
        "contrast_example_prompt",
    }

    for key, value in data.items():
        # Skip old example keys (they will be reinserted correctly)
        if key in skip_keys:
            continue

        new_data[key] = value

        if key == "describe_prompt":
            new_data["example_prompt"] = example_prompt
            new_data["describe_example_prompt"] = describe_example_prompt
            new_data["contrast_example_prompt"] = contrast_example_prompt
            inserted = True

    if not inserted:
        raise KeyError("'describe_prompt' not found")

    return new_data



def process_metadata_file(path: Path):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f, object_pairs_hook=OrderedDict)

    dataset = data.get("dataset")
    strategy = data.get("strategy")

    if dataset is None or strategy is None:
        print(f"[SKIP] Missing dataset or strategy: {path}")
        return

    if "config" not in data:
        print(f"[SKIP] No config section: {path}")
        return

    # Insert task_type
    task_type_value = infer_task_type(dataset)
    try:
        data["config"] = insert_task_type(data["config"], task_type_value)
    except KeyError as e:
        print(f"[ERROR] {e} in {path}")
        return

    # param_set_number
    param_set_updated = False
    if data.get("param_set_number") is None:
        data["param_set_number"] = 1
        param_set_updated = True

    # prompt_number
    data = insert_prompt_number(data)

    # Load example prompts
    base_prompt_dir = Path(f"../prompts/{dataset}/{strategy}")

    example_prompt = load_prompt(base_prompt_dir / "example_1.txt")
    describe_example_prompt = load_prompt(base_prompt_dir / "describe_example_1.txt")
    contrast_example_prompt = load_prompt(base_prompt_dir / "contrast_example_1.txt")

    # Insert example prompts
    try:
        data = insert_example_prompts(
            data,
            example_prompt,
            describe_example_prompt,
            contrast_example_prompt,
        )

    except KeyError as e:
        print(f"[ERROR] {e} in {path}")
        return

    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"[UPDATED] {path}")


results_dir = Path("../results")
if not results_dir.exists():
    raise FileNotFoundError("results directory does not exist")

for metadata_path in results_dir.rglob("metadata.json"):
    process_metadata_file(metadata_path)


[UPDATED] ../results/bp/classification/llava-v1.6-mistral-7b-hf/ver1/metadata.json
[UPDATED] ../results/bp/classification/InternVL3-8B/ver1/metadata.json
[UPDATED] ../results/bp/classification/Qwen2.5-VL-7B-Instruct/ver1/metadata.json
[UPDATED] ../results/bp/direct/llava-v1.6-mistral-7b-hf/ver1/metadata.json
[UPDATED] ../results/bp/direct/llava-v1.6-mistral-7b-hf/ver2/metadata.json
[UPDATED] ../results/bp/direct/InternVL3-8B/ver1/metadata.json
[UPDATED] ../results/bp/direct/Qwen2.5-VL-32B-Instruct/ver1/metadata.json
[UPDATED] ../results/bp/direct/Qwen2.5-VL-7B-Instruct/ver1/metadata.json
[UPDATED] ../results/bp/contrastive/llava-v1.6-mistral-7b-hf/ver1/metadata.json
[UPDATED] ../results/bp/contrastive/InternVL3-8B/ver1/metadata.json
[UPDATED] ../results/bp/contrastive/Qwen2.5-VL-7B-Instruct/ver1/metadata.json
[UPDATED] ../results/bp/descriptive/llava-v1.6-mistral-7b-hf/ver1/metadata.json
[UPDATED] ../results/bp/descriptive/llava-v1.6-mistral-7b-hf/ver2/metadata.json
[UPDATED] ../result

In [114]:
import json
from pathlib import Path
from collections import OrderedDict

def infer_task_type(dataset_value: str) -> str:
    """Return task_type based on dataset"""
    return "open-ended" if dataset_value == "bp" else "close-ended"

def insert_task_type(config: OrderedDict, task_type_value: str) -> OrderedDict:
    """Insert task_type between image_format and category if missing"""
    if "task_type" in config:
        return config

    new_config = OrderedDict()
    inserted = False
    for key, value in config.items():
        new_config[key] = value
        if key == "image_format":
            new_config["task_type"] = task_type_value
            inserted = True

    if not inserted:
        raise KeyError("'image_format' not found in config")
    return new_config

def process_member(member: OrderedDict) -> bool:
    """
    Update a single member dictionary.
    Returns True if any change was made.
    """
    changed = False
    dataset = member.get("dataset")
    if not dataset:
        return False

    # Update param_set_number
    if member.get("param_set_number") is None:
        member["param_set_number"] = 1
        changed = True

    # Update config
    if "config" in member:
        try:
            new_config = insert_task_type(member["config"], infer_task_type(dataset))
            if new_config != member["config"]:
                member["config"] = new_config
                changed = True
        except KeyError as e:
            print(f"[ERROR] {e} in member with dataset {dataset}")
    return changed

def process_ensemble_file(file_path: Path, dry_run: bool = False):
    """Process one ensemble_config.json file"""
    with file_path.open("r", encoding="utf-8") as f:
        data = json.load(f, object_pairs_hook=OrderedDict)

    changed = False
    # Iterate over all members
    for key, value in data.items():
        if isinstance(value, dict) and key.startswith("member_"):
            if process_member(value):
                changed = True

    if not changed:
        print(f"[SKIP] No changes needed for {file_path}")
        return

    if dry_run:
        print(f"[DRY-RUN] Would update {file_path}")
        return

    with file_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"[UPDATED] {file_path}")

results_dir = Path("../results/ensembles")  
if not results_dir.exists():
    raise FileNotFoundError(f"{results_dir} does not exist")

for ensemble_path in results_dir.rglob("ensemble_config.json"):
    process_ensemble_file(ensemble_path, dry_run=False)


[SKIP] No changes needed for ../results/ensembles/bp/majority/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/bp/confidence/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/bp/reasoning_with_image/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/bp/reasoning/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/raven/majority/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/raven/confidence/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/raven/reasoning_with_image/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/raven/reasoning/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/cvr/majority/ensemble_ver1/ensemble_config.json
[SKIP] No changes needed for ../results/ensembles/cvr/confidence/ensemble_ver1/ensem

In [105]:
import os

df = pd.read_csv("../results/all_results_concat.csv", dtype={"problem_id": str})

path = "../results/bp/classification/Qwen2.5-VL-7B-Instruct/ver1/"
results_path = os.path.join(path, "results.csv")
evaluation_results_path = os.path.join(path, "evaluation_results.csv")
df1 = pd.read_csv(results_path, dtype={"problem_id": str})
df2 = pd.read_csv(evaluation_results_path, dtype={"problem_id": str})

import pandas as pd
import json

# JSON string
raw_json = '''{
  "answer": "A",
  "confidence": 0.9,
  "rationale": "In set A, all images on the left side share the same property: they are all composed of straight lines. This rule is applied consistently. On the right side, no image shares this property, confirming the rule's exclusivity. In set B, one image H on the right side also fits the property of being composed of straight lines, indicating a switch."
}'''

data = json.loads(raw_json)

# Boolean mask for the row
mask = (
    (df["problem_id"] == "087") &
    (df["dataset_name"] == "bp") &
    (df["model_name"] == "Qwen/Qwen2.5-VL-7B-Instruct") &
    (df["version"] == 1) &
    (df["strategy_name"] == "classification")
)

mask1 = df1["problem_id"] == "087"

# Select the row as a DataFrame
row_df = df.loc[mask]

# Populate the fields in the original df
df.loc[mask, ["answer", "rationale", "confidence"]] = [data["answer"], data["rationale"], data["confidence"]]
df1.loc[mask1, ["answer", "rationale", "confidence"]] = [data["answer"], data["rationale"], data["confidence"]]
df2.loc[mask1, ["answer", "rationale", "confidence"]] = [data["answer"], data["rationale"], data["confidence"]]
df2.loc[mask1, "score"] = "Right"

df.to_csv("../results/all_results_concat.csv", index=False)
df1.to_csv(results_path, index=False)
df2.to_csv(evaluation_results_path, index=False)

In [108]:
import os

path = "../results/bp/classification/Qwen2.5-VL-7B-Instruct/ver1/"

summary_path = os.path.join(path, "evaluation_results_summary.json")
metrics_path = os.path.join(path, "evaluation_results_metrics.json")

with open(metrics_path, "r", encoding="utf-8") as f:
    data = json.load(f)

if "bin_counts" in data and "No answer provided" in data["bin_counts"]:
    data["bin_counts"]["No answer provided"] = 0
    data["bin_counts"]["Right"] = 44

with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)


with open(summary_path, "r", encoding="utf-8") as f:
    data = json.load(f)

def replace_ones_with_zero(d: dict):
    for k, v in d.items():
        if v > 0:
            d[k] = 0

if "answers_completeness" in data:
    if "missing_count_per_column" in data["answers_completeness"]:
        replace_ones_with_zero(data["answers_completeness"]["missing_count_per_column"])

    if "row_ids_with_any_missing" in data["answers_completeness"]:
        data["answers_completeness"]["row_ids_with_any_missing"] = []

    if "missing_ratio_per_column" in data["answers_completeness"]:
        replace_ones_with_zero(data["answers_completeness"]["missing_ratio_per_column"])

with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)


In [112]:
import json
import os
from collections import OrderedDict

ensemble_dir = "../results/ensembles"

for root, dirs, files in os.walk(ensemble_dir):
    for file in files:
        if file == "ensemble_config.json":
            file_path = os.path.join(root, file)
            
            # Load JSON
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            
            new_data = OrderedDict()
            
            # Get member_0 info
            member0 = data.get("member_0", {})
            member0_config = member0.get("config", {})
            problem_description = member0.get("problem_description_prompt", "")
            sample_answer = member0.get("sample_answer_prompt", "")
            
            # 1. Insert task_type, category, dataset at the beginning
            for key in ["task_type", "category", "dataset"]:
                if key in member0_config:
                    new_data[key] = member0_config[key]
                elif key in member0:
                    new_data[key] = member0[key]
                    
            # 2. Process ensemble_model and main_prompt
            for key, value in data.items():
                if key == "ensemble_model" and value == "":
                    new_data[key] = "No judge model needed for this type and dataset"
                elif key == "main_prompt" and value:
                    # Replace placeholders with actual prompts
                    new_prompt = value.replace("$problem_description", problem_description)\
                                      .replace("$sample_answer", sample_answer)
                    new_data[key] = new_prompt
                else:
                    new_data[key] = value
            
            # Save updated JSON
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(new_data, f, indent=4)
            
            print(f"Updated: {file_path}")


Updated: ../results/ensembles/bp/majority/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/bp/confidence/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/bp/reasoning_with_image/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/bp/reasoning/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/raven/majority/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/raven/confidence/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/raven/reasoning_with_image/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/raven/reasoning/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/cvr/majority/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/cvr/confidence/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/cvr/reasoning_with_image/ensemble_ver1/ensemble_config.json
Updated: ../results/ensembles/cvr/reasoning/ensemble_ver1/ensemble_config.json
Updated: ../res

In [133]:
df = pd.read_csv("../results/all_results_concat.csv", dtype={"problem_id": str})
df = df.drop_duplicates(subset=["problem_id", "dataset_name", "model_name", "strategy_name", "version"], keep='last')
print(df[(df["problem_id"] == "001") & (df["dataset_name"] == "bp") & (df["strategy_name"] == "descriptive")].head(10))

df.to_csv("../results/all_results_concat.csv", index=False)

     problem_id                                             answer  \
1300        001                                          Left rule   
2799        001  {'left_rule': 'The image is entirely white wit...   
4896        001  The commonality among the images on the left i...   

      confidence                                          rationale  \
1300        0.90  The 'Left' images (0-5) are entirely white wit...   
2799        0.95  The 'Left' group consistently shows an empty w...   
4896        1.00  The left images have a single, solid-colored o...   

                                           raw_response           score  \
1300  {'answer': 'Left rule', 'confidence': 0.9, 'ra...           Right   
2799  {'answer': {'left_rule': 'The image is entirel...           Right   
4896  {'answer': 'The commonality among the images o...  Somewhat right   

                                         key type_name  ensemble  \
1300  ['Empty picture', 'Not empty picture']       NaN     False 