In [87]:
import re
import pandas as pd

def parse_raw_json(raw):
    if not raw or pd.isna(raw):
        return {"answer": None, "confidence": None, "rationale": None}

    raw = str(raw)

    # Step 1: Extract the 'raw' value if it's a dict-like wrapper
    raw_match = re.search(r"'raw':\s*(.*)", raw, re.DOTALL)
    if raw_match:
        raw = raw_match.group(1).strip()
        # Remove trailing comma or closing brace if present
        raw = re.sub(r"[},]\s*$", "", raw)

    # Step 2: Remove ```json or ```
    raw = re.sub(r"```(?:json)?", "", raw)

    # Step 3: Normalize whitespace and line breaks
    raw = raw.replace("\n", " ").replace("\t", " ").strip()

    # Step 4: Convert single quotes to double quotes for simple key/value parsing
    raw = raw.replace("'", '"')

    # Step 5: Try to extract fields manually using regex
    def extract_field(name):
        pattern = rf'"{name}"\s*:\s*"([^"]*?)"'
        match = re.search(pattern, raw, re.DOTALL)
        return match.group(1).strip() if match else None

    answer = extract_field("answer")
    confidence = extract_field("confidence")
    rationale = extract_field("rationale")

    # Step 6: Try to parse confidence as float
    try:
        confidence = float(confidence) if confidence is not None else None
    except:
        confidence = None

    return {
        "answer": answer,
        "confidence": confidence,
        "rationale": rationale
    }

In [None]:
import pandas as pd

models = ["llava-v1.6-mistral-7b-hf", "Qwen2.5-VL-7B-Instruct"]
datasets = ["cvr", "bp", "marsvqa", "raven"]
ver = "ver1"
strategies = ["classification", "direct", "contrastive", "descriptive"]
results = ["results"]

for dataset in datasets:
    for strategy in strategies:
        for model in models:
            for result in results:
                print(f"Processing: {dataset} | {strategy} | {model} | {ver}")
                df = pd.read_csv(
                    f"../results/{dataset}/{strategy}/{model}/{ver}/{result}.csv",
                    dtype={"problem_id": str}, 
                )

                df["problem_id"] = df["problem_id"].str.strip()

                mask = df["answer"].isna() | df["confidence"].isna() | df["rationale"].isna() | (df["answer"] == '')
                print(mask.sum(), "rows to fix")

                parsed = df.loc[mask, "raw_response"].apply(parse_raw_json)

                df.loc[mask, "answer"] = parsed.apply(lambda x: x["answer"])
                df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
                df.loc[mask, "rationale"] = parsed.apply(lambda x: x["rationale"])

                df.to_csv(f"../results/{dataset}/{strategy}/{model}/{ver}/{result}.csv", index=False)


Processing: cvr | classification | llava-v1.6-mistral-7b-hf | ver1
Processing: cvr | classification | Qwen2.5-VL-7B-Instruct | ver1
Processing: cvr | direct | llava-v1.6-mistral-7b-hf | ver1
Processing: cvr | direct | Qwen2.5-VL-7B-Instruct | ver1
Processing: cvr | contrastive | llava-v1.6-mistral-7b-hf | ver1
Processing: cvr | contrastive | Qwen2.5-VL-7B-Instruct | ver1
Processing: cvr | descriptive | llava-v1.6-mistral-7b-hf | ver1
Processing: cvr | descriptive | Qwen2.5-VL-7B-Instruct | ver1
Processing: bp | classification | llava-v1.6-mistral-7b-hf | ver1
Processing: bp | classification | Qwen2.5-VL-7B-Instruct | ver1
Processing: bp | direct | llava-v1.6-mistral-7b-hf | ver1
Processing: bp | direct | Qwen2.5-VL-7B-Instruct | ver1
Processing: bp | contrastive | llava-v1.6-mistral-7b-hf | ver1
Processing: bp | contrastive | Qwen2.5-VL-7B-Instruct | ver1
Processing: bp | descriptive | llava-v1.6-mistral-7b-hf | ver1
Processing: bp | descriptive | Qwen2.5-VL-7B-Instruct | ver1
Processi

  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.loc[mask, "confidence"] = parsed.apply(lambda x: x["confidence"])
  df.l