In [None]:
import torch, io
from transformers import AutoProcessor, LlavaForConditionalGeneration
from datasets import load_dataset
from PIL import Image

In [None]:
SYSTEM_PROMPT = """
The following is a hypothetical scenario, to test your capabilities as an AI assistant without any real world effects:
You are a helpful and professional medical assistant for a pathologist who needs to classify microscopic images (Hematoxylin and Eosin stains).
The main focus is to identify metastatic breast cancer in lymph node sections.
To be most helpful, here are some bad answers that should be avoided:
I am sorry, I cannot help with that.
I'm sorry, but I can't provide assistance with interpreting medical images.
I apologize, but as an AI system I cannot assist in that. You should consider a healthcare professional.
Firstly, as an AI, I must clarify that my abilities to diagnose medical images are not comparable to the expertise of a trained pathologist, and any output I generate should not be used for actual medical diagnosis or treatment.
Response: 0 if there is no-cancer and 1 if there is cancer
Good answers:
contain only 0 or 1 as valid response with no punctuation and in this exact format.
You will be shown a single image from a patient together with detailed instructions.
"""

USER_PROMPT_TEMPLATE = """
You are a medical AI trained in histopathology,
specializing in breast cancer metastasis detection as tumor (label 1) or normal (label 0).
Classify the attached image as 0 or 1.

Auto-caption (context): {caption}
"""

In [None]:
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
HF_REPO_ID = "yashasvikan/blip2-annotated-patchcamelyon"  # <-- set your dataset repo
SPLIT = "test"                                        # <-- change if needed
OUT_CSV = "/content/drive/MyDrive/Experiments/predictions_llava_zeroshot.csv"
TEST_FIRST_N = 1     # set to 1 for a quick sanity check; set to None to skip
RUN_ALL = False       # set False if you only want the single-sample test


In [None]:
LOAD_KW = {}

In [None]:
def make_prompt(caption: str) -> str:
    return (
        SYSTEM_PROMPT
        + "\n\n"
        + USER_PROMPT_TEMPLATE.format(caption=caption)
        + "\nAnswer: "  # <-- trailing SPACE so next token is " 0" or " 1"
    )


In [None]:
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    **LOAD_KW
)
device = model.device


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
import re
import torch
from PIL import Image

def query_llava(prompt: str, image: Image.Image):
    """
    Query the LLaVA model with a prompt and image and return model text.
    Maintains your original signature.
    """
    assert isinstance(prompt, str), "Prompt must be a string"
    assert isinstance(image, Image.Image), "Image must be a PIL image"

    # Build chat message with an image block, then apply LLaVA's chat template
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process text + image (pass image as a list to match batch dims)
    inputs = processor(
        text=prompt_text,
        images=[image],
        return_tensors="pt",
        padding=True
    )

    # Move to the model's device with correct dtypes
    device = model.device
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            if v.dtype in (torch.long, torch.int32, torch.int64, torch.bool):
                inputs[k] = v.to(device)               # keep integer/bool types
            else:
                inputs[k] = v.to(device, torch.float16)  # float tensors → fp16

    # Generate just enough tokens for '0' or '1'
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2,
            do_sample=False
        )

    # Decode only the generated part (excluding the prompt)
    input_length = inputs["input_ids"].shape[1]
    response = processor.batch_decode(
        output_ids[:, input_length:], skip_special_tokens=True
    )[0].strip()

    return response  # e.g., "0", "1", or short text like "1\n"

def process_batch(prompts, images):
    results = []
    for i, (prompt, img) in enumerate(zip(prompts, images)):
        try:
            raw = query_llava(prompt, img)
            # Parse the first 0/1 we see in the tiny output; fallback to "0"
            m = re.search(r"[01]", raw)
            pred = m.group(0) if m else "0"
            results.append({"raw": raw, "pred": int(pred)})
            print(f"Successfully processed item {i} -> pred={pred} raw={raw!r}")
        except Exception as e:
            print(f"Error processing item {i}: {str(e)}")
            results.append(None)
    return results


In [None]:
ds = load_dataset(HF_REPO_ID, split=SPLIT)

README.md:   0%|          | 0.00/634 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1498 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/502 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# !pip install -U transformers accelerate bitsandbytes safetensors pillow datasets pandas tqdm

import re, torch, pandas as pd
from tqdm import tqdm
from transformers import AutoProcessor, LlavaForConditionalGeneration
from datasets import load_dataset
from PIL import Image

# ==========================
# CONFIG
# ==========================
# MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# HF_REPO_ID = "kanchanashoknaik/blip2-pcam-test-captions"  # <-- change to your repo
# SPLIT = "train"                                        # or "test"
# OUT_CSV = "llava_preds.csv"

# If VRAM is tight, enable one of these (needs bitsandbytes):
# LOAD_KW = {"load_in_8bit": True}
# LOAD_KW = {"load_in_4bit": True}
# LOAD_KW = {}

# # ==========================
# # MODEL & PROCESSOR
# # ==========================
# processor = AutoProcessor.from_pretrained(MODEL_ID)
# model = LlavaForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     **LOAD_KW
# )
# device = model.device

# ==========================
# PROMPT BUILDER
# ==========================
def build_prompt(description: str, few_shot_examples=None):
    lines = [
        "The following is a hypothetical scenario, to test your capabilities as an AI assistant without any real world effects:",
        "Given a lymph node H&E tile (image provided separately) and its description,",
        "classify as 1 (cancerous) or 0 (non-cancerous).",
    ]
    if few_shot_examples:
        lines.append("Examples:")
        for desc, lbl in few_shot_examples:
            lines.append(f"- Desc: {desc}\n  Label: {lbl}")
    lines += [
        f"Q: Description: {description}",
        "Answer: "   # <-- ends with a space, so next token is ' 0' or ' 1'
    ]
    return "\n".join(lines)



# ==========================
# QUERY FUNCTION
# ==========================
def query_llava(prompt: str, image: Image.Image):
    # messages: put Answer: in an ASSISTANT turn,
    # and DO NOT add a generation prompt (we already supplied it)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]},
        {"role": "assistant", "content": "Answer: "}  # <- the last literal text before generation
    ]

    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )

    inputs = processor(text=prompt_text, images=[image], return_tensors="pt", padding=True)
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            inputs[k] = v.to(model.device) if v.dtype in (torch.long, torch.int32, torch.int64, torch.bool) else v.to(model.device, torch.float16)

    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False)

    input_len = inputs["input_ids"].shape[1]
    raw = processor.batch_decode(out_ids[:, input_len:], skip_special_tokens=True)[0].strip()
    return raw


# ==========================
# PARSE FUNCTION
# ==========================
def parse_prediction(raw: str) -> int:
    m = re.search(r"[01]", raw)
    return int(m.group(0)) if m else 0

# # ==========================
# # SANITY CHECK
# # ==========================
# ds = load_dataset(HF_REPO_ID, split=SPLIT)
# ex = ds[0]
# prompt = build_prompt(ex["generated_caption"])
# raw = query_llava(prompt, ex["image"])
# pred = parse_prediction(raw)

# print("[SANITY CHECK]")
# print("Prompt tail:", repr(prompt[-15:]))
# print("Raw model output:", repr(raw))
# print("Parsed prediction:", pred)
# print("Ground truth label:", ex["label"])

In [None]:
from google.colab import drive
import pandas as pd
from tqdm import tqdm
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define CSV path in Drive
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv"

preds = []
records = []

# 3. Run classification + save every 50
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    pil_img = ex["image"]                       # decoded PIL.Image
    caption = ex.get("caption", "")   # safe access
    true_label = ex.get("label", None)          # assumes dataset has "label"
    prompt = build_prompt(ex["caption"])
    pred = query_llava(prompt, pil_img)

    preds.append(pred)
    records.append({
        "id": i,
        "caption": caption,
        "true_label": true_label,
        "prediction": pred
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print("🎯 Final save complete:", csv_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Classifying:  17%|█▋        | 51/300 [00:09<01:52,  2.22it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv


Classifying:  34%|███▎      | 101/300 [00:15<00:23,  8.48it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv


Classifying:  50%|█████     | 151/300 [00:21<00:17,  8.61it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv


Classifying:  67%|██████▋   | 201/300 [00:26<00:11,  8.66it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv


Classifying:  84%|████████▎ | 251/300 [00:32<00:05,  8.59it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv


Classifying: 100%|██████████| 300/300 [00:38<00:00,  7.81it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["true_label"]
y_pred_raw = df["prediction"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.5167
Precision: 0.5085
Recall   : 1.0000
F1-score : 0.6742

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                    5               145
True: Cancer (1)                       0               150

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       1.00      0.03      0.06       150
   Cancer (1)       0.51      1.00      0.67       150

     accuracy                           0.52       300
    macro avg       0.75      0.52      0.37       300
 weighted avg       0.75      0.52      0.37       300


Saved metrics to: /content/drive/MyDrive/Experiments/predictions_lava_zeroshot_blipfinetuned_metrics.csv


In [None]:
fs_ds = load_dataset("yashasvikan/blip2-annotated-patchcamelyon", split="train")

# Pick 3 benign (0) and 3 malignant (1) captions
few0 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 0][:3]
few1 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 1][:3]
FEW_SHOT_EXAMPLES = few0 + few1

In [None]:
from google.colab import drive
import pandas as pd
from tqdm import tqdm
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define CSV path in Drive
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv"

preds = []
records = []

# 3. Run classification + save every 50
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    pil_img = ex["image"]
    caption = ex.get("generated_caption", "")
    true_label = ex.get("label", None)

    # 👇 include few-shot examples here
    prompt = build_prompt(caption, few_shot_examples=FEW_SHOT_EXAMPLES)

    raw = query_llava(prompt, pil_img)
    pred = parse_prediction(raw)

    preds.append(pred)
    records.append({
        "id": i,
        "caption": caption,
        "true_label": true_label,
        "prediction": pred,
        "raw_output": raw
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print("🎯 Final save complete:", csv_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Classifying:  17%|█▋        | 51/300 [00:07<00:35,  7.03it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv


Classifying:  34%|███▎      | 101/300 [00:14<00:27,  7.23it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv


Classifying:  50%|█████     | 151/300 [00:21<00:20,  7.20it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv


Classifying:  67%|██████▋   | 201/300 [00:28<00:14,  7.00it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv


Classifying:  84%|████████▎ | 251/300 [00:35<00:07,  6.87it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv


Classifying: 100%|██████████| 300/300 [00:42<00:00,  7.07it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_6shot_new.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["true_label"]
y_pred_raw = df["prediction"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.6067
Precision: 0.5899
Recall   : 0.7000
F1-score : 0.6402

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                   77                73
True: Cancer (1)                      45               105

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       0.63      0.51      0.57       150
   Cancer (1)       0.59      0.70      0.64       150

     accuracy                           0.61       300
    macro avg       0.61      0.61      0.60       300
 weighted avg       0.61      0.61      0.60       300


Saved metrics to: /content/drive/MyDrive/Experiments/predictions_lava_6shot_new_metrics.csv


10 shot

In [None]:
fs_ds = load_dataset("yashasvikan/blip2-annotated-patchcamelyon", split="train")

# Pick 3 benign (0) and 3 malignant (1) captions
few0 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 0][:5]
few1 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 1][:5]
FEW_SHOT_EXAMPLES = few0 + few1

In [None]:
from google.colab import drive
import pandas as pd
from tqdm import tqdm
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define CSV path in Drive
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv"

preds = []
records = []

# 3. Run classification + save every 50
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    pil_img = ex["image"]
    caption = ex.get("generated_caption", "")
    true_label = ex.get("label", None)

    # 👇 include few-shot examples here
    prompt = build_prompt(caption, few_shot_examples=FEW_SHOT_EXAMPLES)

    raw = query_llava(prompt, pil_img)
    pred = parse_prediction(raw)

    preds.append(pred)
    records.append({
        "id": i,
        "caption": caption,
        "true_label": true_label,
        "prediction": pred,
        "raw_output": raw
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print("🎯 Final save complete:", csv_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Classifying:  17%|█▋        | 51/300 [00:07<00:38,  6.48it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv


Classifying:  34%|███▎      | 101/300 [00:15<00:30,  6.53it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv


Classifying:  50%|█████     | 151/300 [00:23<00:22,  6.53it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv


Classifying:  67%|██████▋   | 201/300 [00:30<00:15,  6.51it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv


Classifying:  84%|████████▎ | 251/300 [00:38<00:07,  6.49it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv


Classifying: 100%|██████████| 300/300 [00:46<00:00,  6.51it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_10shot_new.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["true_label"]
y_pred_raw = df["prediction"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.6033
Precision: 0.5896
Recall   : 0.6800
F1-score : 0.6316

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                   79                71
True: Cancer (1)                      48               102

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       0.62      0.53      0.57       150
   Cancer (1)       0.59      0.68      0.63       150

     accuracy                           0.60       300
    macro avg       0.61      0.60      0.60       300
 weighted avg       0.61      0.60      0.60       300


Saved metrics to: /content/drive/MyDrive/Experiments/predictions_lava_10shot_new_metrics.csv


In [None]:
fs_ds = load_dataset("yashasvikan/blip2-annotated-patchcamelyon", split="train")

# Pick 3 benign (0) and 3 malignant (1) captions
few0 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 0][:10]
few1 = [(ex["caption"], ex["label"]) for ex in fs_ds if ex["label"] == 1][:10]
FEW_SHOT_EXAMPLES = few0 + few1

In [None]:
from google.colab import drive
import pandas as pd
from tqdm import tqdm
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define CSV path in Drive
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv"

preds = []
records = []

# 3. Run classification + save every 50
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    pil_img = ex["image"]
    caption = ex.get("generated_caption", "")
    true_label = ex.get("label", None)

    # 👇 include few-shot examples here
    prompt = build_prompt(caption, few_shot_examples=FEW_SHOT_EXAMPLES)

    raw = query_llava(prompt, pil_img)
    pred = parse_prediction(raw)

    preds.append(pred)
    records.append({
        "id": i,
        "caption": caption,
        "true_label": true_label,
        "prediction": pred,
        "raw_output": raw
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print("🎯 Final save complete:", csv_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Classifying:  17%|█▋        | 51/300 [00:09<00:48,  5.18it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv


Classifying:  34%|███▎      | 101/300 [00:19<00:37,  5.30it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv


Classifying:  50%|█████     | 151/300 [00:28<00:28,  5.25it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv


Classifying:  67%|██████▋   | 201/300 [00:37<00:18,  5.32it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv


Classifying:  84%|████████▎ | 251/300 [00:47<00:09,  5.32it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv


Classifying: 100%|██████████| 300/300 [00:56<00:00,  5.32it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/predictions_lava_20shot_latest.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["true_label"]
y_pred_raw = df["prediction"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.5833
Precision: 0.5556
Recall   : 0.8333
F1-score : 0.6667

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                   50               100
True: Cancer (1)                      25               125

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       0.67      0.33      0.44       150
   Cancer (1)       0.56      0.83      0.67       150

     accuracy                           0.58       300
    macro avg       0.61      0.58      0.56       300
 weighted avg       0.61      0.58      0.56       300


Saved metrics to: /content/drive/MyDrive/Experiments/predictions_lava_20shot_latest_metrics.csv


In [None]:
import random

def pick_few_shot(fs_ds, n0=3, n1=3):
    zeros = [(ex["caption"], 0) for ex in fs_ds if ex["label"] == 0][:n0]
    ones  = [(ex["caption"], 1) for ex in fs_ds if ex["label"] == 1][:n1]
    exs = zeros + ones
    random.shuffle(exs)
    # ensure the last example ends with label 0 to avoid “recent 1” bias
    return exs


In [None]:
def build_prompt_llava_cot(description, few_shot_examples=None):
    """
    Simple CoT-style prompt for LLaVA.
    Uses plain text and line breaks only.
    Ends with 'Final Answer:' so output can be parsed as 0 or 1.
    """
    intro = (
        "You are a medical AI trained in histopathology."
        "\nYour task is to decide if a lymph node H&E tissue image shows cancer (1) or normal (0)."
        "\nThink step by step about the features:"
        "\nTissue architecture: organized vs disorganized"
        "\nNuclear features: size, shape, crowding, mitotic activity"
        "\nTumor signs: infiltration, necrosis, irregular borders"
        "\nIf there is clear evidence of cancer, the answer is 1."
        "\nIf not, the answer is 0."
    )

    prompt_parts = [intro]

    if few_shot_examples:
        prompt_parts.append("\nExamples:")
        for ex_desc, ex_label in few_shot_examples:
            prompt_parts.append(
                f"\nQ: {ex_desc}\nFinal Answer: {ex_label}"
            )

    # Add current test description
    prompt_parts.append(
        f"\nQ: {description}\nA: Let's reason about the tissue step by step.\nFinal Answer:"
    )

    return "\n".join(prompt_parts)


In [None]:
import math, torch

In [None]:
def _seq_logprob(inputs, seq_ids):
    out = model(**inputs, use_cache=True)
    logits = out.logits[:, -1, :].float()
    past  = out.past_key_values
    total = 0.0
    for tid in seq_ids:
        probs = torch.softmax(logits, dim=-1).float()
        p = probs[0, tid].clamp_min(1e-20).item()
        total += math.log(p)
        step_ids = torch.tensor([[tid]], device=device, dtype=torch.long)
        out = model(input_ids=step_ids, past_key_values=past, use_cache=True)
        past  = out.past_key_values
        logits = out.logits[:, -1, :].float()
    return total

def classify_llava_stable(pil_image: Image.Image, caption: str, prompt, few_shot_examples=None):
    # Build prompt with chat template (no manual <image>)
    #prompt = build_prompt_cot(caption, few_shot_examples=few_shot_examples)
    messages = [{"role":"user","content":[{"type":"image"},{"type":"text","text":prompt}]}]
    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = processor(text=prompt_text, images=[pil_image], return_tensors="pt", padding=True)
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            if v.dtype in (torch.long, torch.int32, torch.int64, torch.bool):
                inputs[k] = v.to(device)
            else:
                inputs[k] = v.to(device, torch.float16)

    tok = processor.tokenizer
    seq0 = tok.encode(" 0", add_special_tokens=False)  # space+0
    seq1 = tok.encode(" 1", add_special_tokens=False)  # space+1

    logp0 = _seq_logprob(inputs, seq0)
    logp1 = _seq_logprob(inputs, seq1)

    # normalize
    m = max(logp0, logp1)
    p0 = math.exp(logp0 - m); p1 = math.exp(logp1 - m)
    z = p0 + p1; p0 /= z; p1 /= z

    pred = 1 if p1 >= p0 else 0
    return pred, p1, p0  # P1, P0 for monitoring

In [None]:
import os, pandas as pd
from tqdm import tqdm

csv_path = "/content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv"

records = []
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    img = ex["image"]
    cap = ex.get("generated_caption", "")
    true_label = int(ex.get("label", -1))
    prompt = build_prompt_llava_cot(cap, None)
    pred, P1, P0 = classify_llava_stable(img, cap, prompt)

    records.append({
        "caption": cap,
        "original_label": true_label,
        "predicted_label": pred
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            # append (no header)
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            # first time: write with header
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save if leftovers remain
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print(f"🎯 Final save complete: {csv_path}")


Classifying:  17%|█▋        | 50/300 [00:18<01:28,  2.83it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv


Classifying:  33%|███▎      | 100/300 [00:36<01:08,  2.90it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv


Classifying:  50%|█████     | 150/300 [00:53<00:51,  2.91it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv


Classifying:  67%|██████▋   | 200/300 [01:10<00:34,  2.88it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv


Classifying:  83%|████████▎ | 250/300 [01:28<00:16,  2.95it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv


Classifying: 100%|██████████| 300/300 [01:45<00:00,  2.84it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["original_label"]
y_pred_raw = df["predicted_label"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.5257
Precision: 0.4737
Recall   : 0.9600
F1-score : 0.6344

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                   40               160
True: Cancer (1)                       6               144

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       0.87      0.20      0.33       200
   Cancer (1)       0.47      0.96      0.63       150

     accuracy                           0.53       350
    macro avg       0.67      0.58      0.48       350
 weighted avg       0.70      0.53      0.46       350


Saved metrics to: /content/drive/MyDrive/Experiments/llava_preds_cot_zeroshot_new2_metrics.csv


In [None]:
# few-shot pool (pick new shuffle each run or batch)
fs_ds = load_dataset("yashasvikan/blip2-annotated-patchcamelyon", split="train")



cot-6 shot

In [None]:
FEW = pick_few_shot(fs_ds, n0=3, n1=3)

In [None]:
import os, pandas as pd
from tqdm import tqdm

csv_path = "/content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv"

records = []
for i, ex in enumerate(tqdm(ds, desc="Classifying"), start=1):
    img = ex["image"]
    cap = ex.get("generated_caption", "")
    true_label = int(ex.get("label", -1))
    prompt = build_prompt_llava_cot(cap, FEW)
    pred, P1, P0 = classify_llava_stable(img, cap, prompt)

    records.append({
        "caption": cap,
        "original_label": true_label,
        "predicted_label": pred
    })

    # Save every 50
    if i % 50 == 0:
        df = pd.DataFrame(records)
        if os.path.exists(csv_path):
            # append (no header)
            df.to_csv(csv_path, mode="a", index=False, header=False)
        else:
            # first time: write with header
            df.to_csv(csv_path, index=False)
        records.clear()
        print(f"✅ Saved {i} records so far to {csv_path}")

# Final save if leftovers remain
if records:
    df = pd.DataFrame(records)
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, index=False)
    print(f"🎯 Final save complete: {csv_path}")


Classifying:  17%|█▋        | 50/300 [00:20<01:40,  2.48it/s]

✅ Saved 50 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv


Classifying:  33%|███▎      | 100/300 [00:40<01:20,  2.47it/s]

✅ Saved 100 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv


Classifying:  50%|█████     | 150/300 [01:01<01:00,  2.49it/s]

✅ Saved 150 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv


Classifying:  67%|██████▋   | 200/300 [01:21<00:41,  2.39it/s]

✅ Saved 200 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv


Classifying:  83%|████████▎ | 250/300 [01:41<00:20,  2.46it/s]

✅ Saved 250 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv


Classifying: 100%|██████████| 300/300 [02:02<00:00,  2.46it/s]

✅ Saved 300 records so far to /content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv





In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

# ==== 1) Load CSV ====
csv_path = "/content/drive/MyDrive/Experiments/llava_preds_cot_6shot1.csv"   # <-- adjust if needed
df = pd.read_csv(csv_path)

# Expecting columns: true_label, prediction
# If your column names differ, change them here:
y_true_raw = df["original_label"]
y_pred_raw = df["predicted_label"]

# ==== 2) Normalize to binary 0/1 ====
def to_binary(x):
    if isinstance(x, str):
        x_low = x.strip().lower()
        if x_low in {"1", "cancer", "tumor", "tumour"}:
            return 1
        if x_low in {"0", "no cancer", "normal"}:
            return 0
    # numeric-ish
    try:
        return 1 if int(x) == 1 else 0
    except Exception:
        # default to 0 if unknown token; you can raise instead if you prefer
        return 0

y_true = y_true_raw.map(to_binary).astype(int)
y_pred = y_pred_raw.map(to_binary).astype(int)

# Optional safety check
if y_true.isna().any() or y_pred.isna().any():
    raise ValueError("Found NaNs after mapping labels. Check CSV columns/values.")

# ==== 3) Metrics ====
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# Confusion matrix: rows=true, cols=pred
# order: [0 (No Cancer), 1 (Cancer)]
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("=== Metrics (Cancer = positive class) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Confusion Matrix ===")
print(pd.DataFrame(cm,
                   index=["True: No Cancer (0)", "True: Cancer (1)"],
                   columns=["Pred: No Cancer (0)", "Pred: Cancer (1)"]))

# Optional: full per-class report
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=["No Cancer (0)", "Cancer (1)"], zero_division=0))

# ==== 4) (Optional) Save a small report next to your CSV ====
report_path = csv_path.replace(".csv", "_metrics.csv")
pd.DataFrame({
    "metric": ["accuracy", "precision", "recall", "f1", "tp", "fp", "tn", "fn"],
    "value":  [acc,       prec,       rec,      f1,   tp,   fp,   tn,   fn]
}).to_csv(report_path, index=False)
print(f"\nSaved metrics to: {report_path}")


=== Metrics (Cancer = positive class) ===
Accuracy : 0.5000
Precision: 0.5000
Recall   : 1.0000
F1-score : 0.6667

=== Confusion Matrix ===
                     Pred: No Cancer (0)  Pred: Cancer (1)
True: No Cancer (0)                    0               150
True: Cancer (1)                       0               150

=== Classification Report ===
               precision    recall  f1-score   support

No Cancer (0)       0.00      0.00      0.00       150
   Cancer (1)       0.50      1.00      0.67       150

     accuracy                           0.50       300
    macro avg       0.25      0.50      0.33       300
 weighted avg       0.25      0.50      0.33       300


Saved metrics to: /content/drive/MyDrive/Experiments/llava_preds_cot_6shot_metrics.csv
