In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
# Mount Google Drive so the dataset and checkpoints under /content/drive/ are reachable.
# force_remount=True re-mounts even when a mount from an earlier session is still present.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import torch, pandas as pd
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,
    TrainingArguments,
    Trainer,
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
)
from torch.utils.data import random_split


# ─── Config ─────────────────────────────────────────────────────────────
# Hugging Face checkpoint that is fine-tuned below.
MODEL_NAME = "Salesforce/blip2-flan-t5-xxl"
# Pickled DataFrame; later cells read the columns "pil_image" and "combined_text" from it.
DATA_PATH = "/content/drive/MyDrive/patches_captions/embedded_histopath_dataset_clip_l14.pkl"
# NOTE(review): prepare_samples() uses its own max length of 512 and does not read this constant.
MAX_LENGTH = 512
# NOTE(review): not referenced by the TrainingArguments cell, which uses a per-device batch size of 1.
DEVICE_BATCH = 2

In [None]:
def prepare_samples(
    df,
    processor,
    prompt="[HISTOPATH IMAGE]\nTask: Describe diagnostic features\nAnswer:",
    max_length=512,
):
    """Turn every row of ``df`` into a dict of tensors ready for BLIP-2 training.

    Args:
        df: DataFrame with a ``"pil_image"`` column (image passed to the
            processor) and a ``"combined_text"`` column (target caption).
        processor: BLIP-2 style processor exposing ``__call__`` and a
            ``tokenizer`` attribute.
        prompt: Text prompt paired with every image. Defaults to the prompt
            that was previously hard-coded here.
        max_length: Length both the prompt and the target caption are
            padded / truncated to. Defaults to the previously hard-coded 512.

    Returns:
        A list with one dict per row containing ``pixel_values`` (float32),
        ``input_ids``, ``attention_mask`` and ``labels``; padding positions in
        ``labels`` are replaced by -100.

    Side effects:
        Sets ``processor.tokenizer.truncation_side`` to ``"left"`` and leaves
        it that way, so over-long texts lose their beginning, not their end.
    """
    tokenizer = processor.tokenizer
    tokenizer.truncation_side = "left"
    pad_token_id = tokenizer.pad_token_id  # loop-invariant, looked up once
    samples = []

    for _, row in df.iterrows():
        enc = processor(
            row["pil_image"],
            prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )

        labels = tokenizer(
            row["combined_text"],
            return_tensors="pt",
            padding="max_length",
            max_length=max_length,
            truncation=True,
        ).input_ids
        # -100 marks positions that must not contribute to the loss.
        labels[labels == pad_token_id] = -100

        samples.append(
            {
                # Drop the batch dimension the processor adds for a single example.
                "pixel_values": enc.pixel_values.squeeze(0).to(torch.float32),
                "input_ids": enc.input_ids.squeeze(0),
                "attention_mask": enc.attention_mask.squeeze(0),
                "labels": labels.squeeze(0),
            }
        )
    return samples

In [None]:
# Load the BLIP-2 processor for MODEL_NAME; later cells call it on (image, prompt)
# pairs and use its `.tokenizer` attribute for the target captions.
processor = Blip2Processor.from_pretrained(MODEL_NAME)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
class CustomDataCollator:
    """Collate per-sample tensor dicts into one batch dict by stacking each field."""

    def __call__(self, features):
        """Stack the four expected fields of ``features`` along a new batch axis.

        ``pixel_values`` is additionally cast to float32; the other fields keep
        the dtype of the stacked inputs.
        """
        def stacked(field):
            return torch.stack([sample[field] for sample in features])

        collated = {"pixel_values": stacked("pixel_values").to(torch.float32)}
        for field in ("input_ids", "attention_mask", "labels"):
            collated[field] = stacked(field)
        return collated

In [None]:
# Load the pickled DataFrame and pre-tokenise every row up front.
# NOTE(review): unpickling can execute arbitrary code — only load pickles you created yourself.
df           = pd.read_pickle(DATA_PATH)
all_samples  = prepare_samples(df, processor)

# 90 % / 10 % train/validation split; the seeded generator makes the split repeatable.
train_data, val_data = random_split(
    all_samples,
    [0.9, 0.1],
    generator=torch.Generator().manual_seed(42),
)
data_collator = CustomDataCollator()


In [None]:
from peft import TaskType

In [None]:
from transformers import BitsAndBytesConfig


In [None]:
# ─── 2.  Load BLIP-2 with 8-bit weights (fp16 compute) ─────────
class Blip2Safe(Blip2ForConditionalGeneration):
    """BLIP-2 wrapper that sanitises the inputs it receives during training.

    ``forward`` casts ``pixel_values`` to float32 and silently discards the
    keyword arguments ``inputs_embeds``, ``decoder_inputs_embeds`` and
    ``num_items_in_batch`` before delegating to the parent implementation.
    (Presumably these are injected by PEFT / Trainer and are not accepted by
    the parent ``forward`` — confirm against the installed library versions.)
    """

    def forward(self, pixel_values=None, *args, **kwargs):
        if pixel_values is not None and pixel_values.dtype != torch.float32:
            pixel_values = pixel_values.to(torch.float32)
        for dropped in ("inputs_embeds", "decoder_inputs_embeds", "num_items_in_batch"):
            kwargs.pop(dropped, None)
        # Pass pixel_values positionally: combining `pixel_values=...` with
        # non-empty *args would bind the first parameter twice and raise
        # "got multiple values for argument 'pixel_values'".
        return super().forward(pixel_values, *args, **kwargs)

# Quantise the weights to 8 bit with bitsandbytes while loading.
bnb8 = BitsAndBytesConfig(load_in_8bit=True)

model = Blip2Safe.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb8,   # ← 8-bit, outputs are fp16
    device_map={"": 0},         # put *all* layers on GPU-0 directly
    torch_dtype=torch.float16,  # 8-bit matmuls still run in fp16
)

# ─── 3.  Attach LoRA (still light-weight!) ----------------------
# NOTE(review): target_modules matches sub-modules whose names end in
# "query" / "key" / "value"; which BLIP-2 components carry such names is not
# visible here — confirm that the intended layers are the ones being adapted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/9.18G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

trainable params: 1,449,984 || all params: 12,231,046,656 || trainable%: 0.0119


In [None]:
training_args = TrainingArguments(
    output_dir="./blip2_finetune_results",
    num_train_epochs=3,
    per_device_train_batch_size=1,    # one sample per step to limit GPU memory use
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,   # effective batch size of 32 per optimizer step
    learning_rate=3e-4,
    fp16=False,                       # no Trainer-managed mixed precision (bf16 is not enabled either)
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    warmup_ratio=0.05,
    weight_decay=0.01,
    remove_unused_columns=False,      # keep every key produced by the custom collator
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # PeftModel hides the base model's signature, so Trainer cannot infer the
    # label column on its own (it warns and falls back to an empty list).
    label_names=["labels"],
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    # Reuse the processor loaded earlier instead of loading it a second time;
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=processor.tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,78.4542,26.359375
2,77.4223,26.359375


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=249, training_loss=78.1476609563253, metrics={'train_runtime': 17296.4296, 'train_samples_per_second': 0.463, 'train_steps_per_second': 0.014, 'total_flos': 8.672328023251983e+19, 'train_loss': 78.1476609563253, 'epoch': 2.970059880239521})

In [None]:
# Persist the fine-tuned PEFT model to Drive; the caption-generation section loads it from this folder.
model.save_pretrained("/content/drive/MyDrive/fine_tuned_blip5x_captions")

In [None]:
# Check first batch
# Imported here so the cell also works on a fresh kernel (the notebook only
# imports DataLoader much further down).
from torch.utils.data import DataLoader

# Use the BLIP-2 collator defined above; the `collate` function defined later
# belongs to the GPT-2 section and expects a "clip" key these samples lack.
sample_batch = next(iter(DataLoader(train_data, batch_size=2, collate_fn=data_collator)))
print(f"Pixel dtype: {sample_batch['pixel_values'].dtype}")  # Should be torch.float32
print(f"Labels dtype: {sample_batch['labels'].dtype}")  # Should be torch.int64


Pixel dtype: torch.float32
Labels dtype: torch.int64


### Generating captions from the fine-tuned model

In [None]:
from transformers import (
    BitsAndBytesConfig, Blip2Processor, Blip2ForConditionalGeneration
)
from peft import PeftModel
import torch, csv, itertools
from datasets import load_dataset

In [None]:
# ─── Same BLIP-2 wrapper as in the training section ────────────
class Blip2Safe(Blip2ForConditionalGeneration):
    """BLIP-2 wrapper that sanitises the inputs it receives.

    ``forward`` casts ``pixel_values`` to float32 and silently discards the
    keyword arguments ``inputs_embeds``, ``decoder_inputs_embeds`` and
    ``num_items_in_batch`` before delegating to the parent implementation.
    It is redefined here so the inference section can run without the
    training cells.
    """

    def forward(self, pixel_values=None, *args, **kwargs):
        if pixel_values is not None and pixel_values.dtype != torch.float32:
            pixel_values = pixel_values.to(torch.float32)
        for dropped in ("inputs_embeds", "decoder_inputs_embeds", "num_items_in_batch"):
            kwargs.pop(dropped, None)
        # Pass pixel_values positionally: combining `pixel_values=...` with
        # non-empty *args would bind the first parameter twice and raise
        # "got multiple values for argument 'pixel_values'".
        return super().forward(pixel_values, *args, **kwargs)

from transformers import BitsAndBytesConfig

# 8-bit weight quantisation. The former `bnb_8bit_compute_dtype` argument was
# removed: BitsAndBytesConfig has no such option (only the 4-bit path has a
# compute-dtype setting), so it never had any effect.
bnb8 = BitsAndBytesConfig(load_in_8bit=True)

# NOTE(review): the adapter loaded below was trained on top of MODEL_NAME
# ("Salesforce/blip2-flan-t5-xxl") in the training section, but the base
# checkpoint here is the "-xl" variant. Confirm this mismatch is intended.
base_model = Blip2Safe.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    quantization_config=bnb8,
    device_map={"": 0},
    torch_dtype=torch.float32              # dtype requested for the non-quantised weights
)

# Attach the saved LoRA adapter and switch to inference mode.
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/fine_tuned_blip5x_captions" ).eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
ckpt_dir = "/content/drive/MyDrive/fine_tuned_blip5x_captions"  # the folder you saved

# The adapter from this folder is already attached to `base_model` by the
# loading cell above. Calling PeftModel.from_pretrained(base_model, ckpt_dir)
# again would inject LoRA into the same (already modified) base model a second
# time, so only make sure the existing wrapper is in inference mode.
model.eval()                         # inference mode (optional)




PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): Blip2Safe(
      (vision_model): Blip2VisionModel(
        (embeddings): Blip2VisionEmbeddings(
          (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
        )
        (encoder): Blip2Encoder(
          (layers): ModuleList(
            (0-38): 39 x Blip2EncoderLayer(
              (self_attn): Blip2Attention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear8bitLt(in_features=1408, out_features=4224, bias=True)
                (projection): Linear8bitLt(in_features=1408, out_features=1408, bias=True)
              )
              (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
              (mlp): Blip2MLP(
                (activation_fn): GELUActivation()
                (fc1): Linear8bitLt(in_features=1408, out_features=6144, bias=True)
                (fc2): Linear8bitLt(in_features=6144, out_features=1408, bias=True)
              

In [None]:
# Processor matching the "-xl" base checkpoint loaded for inference.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
# Same prompt text that prepare_samples() pairs with every training image.
# NOTE(review): the captioning cell further down assigns a different, longer prompt.
prompt = "[HISTOPATH IMAGE]\nTask: Describe diagnostic features\nAnswer:"

In [None]:
# Stream the PatchCamelyon training split instead of downloading it in full.
stream = load_dataset(
    "1aurent/PatchCamelyon", split="train", streaming=True
)

# optional shuffle with buffer
# NOTE(review): the captioning cell below rebuilds `stream` with its own shuffle call.
stream = stream.shuffle(buffer_size=10_000, seed=42)


In [None]:
# ─── 0.  Imports ───────────────────────────────────────────────
import os, csv, itertools, torch, contextlib
from datasets import load_dataset
from PIL import Image
from tqdm.auto import tqdm

# ─── 1.  Global settings ───────────────────────────────────────
# NOTE(review): this prompt adds "(cell type, stain, tumour/normal)." to the
# prompt used in prepare_samples() during fine-tuning — confirm the difference is intended.
prompt = (
    "[HISTOPATH IMAGE]\n"
    "Task: Describe diagnostic features (cell type, stain, tumour/normal).\n"
    "Answer:"
)
root_out   = "/content/drive/MyDrive/patchcamelyon_captions"
img_dir    = os.path.join(root_out, "images")       # one PNG per captioned patch
csv_path   = os.path.join(root_out, "captions.csv")  # columns: file_name, label, caption
batch_size = 8
resume     = True                   # → continue from previous run if True

os.makedirs(img_dir, exist_ok=True)

# ─── 2.  Prepare streaming dataset ─────────────────────────────
# The fixed shuffle seed is what the resume logic below relies on to line up
# with the rows written by an earlier run.
stream = load_dataset(
    "1aurent/PatchCamelyon", split="train", streaming=True
).shuffle(seed=42)

def batched(it, n):
    """Yield consecutive lists of up to ``n`` items from ``it`` until it is exhausted.

    The final list may be shorter than ``n``; an empty iterable yields nothing.
    """
    source = iter(it)
    while True:
        chunk = list(itertools.islice(source, n))
        if not chunk:
            return
        yield chunk

# ─── 3.  Determine starting row_id if resuming ─────────────────
row_id = 0
if resume and os.path.isfile(csv_path):
    with open(csv_path, newline="") as f:
        # Count parsed CSV records, not physical lines, so captions containing
        # newlines are counted once; the header row is excluded. The clamp
        # keeps an empty file from producing row_id == -1.
        row_id = max(sum(1 for _ in csv.reader(f)) - 1, 0)
    print(f"Resuming at row_id {row_id:,}")

mode = "a" if resume and row_id > 0 else "w"

# Skip exactly the examples that were already captioned. `row_id` itself is
# left untouched so new files continue the existing numbering instead of
# restarting at row_000000.png and overwriting earlier images.
pending = itertools.islice(stream, row_id, None)

# ─── 4.  Caption + save loop ───────────────────────────────────
# KeyboardInterrupt is suppressed on purpose: stopping the cell by hand still
# closes the CSV cleanly and the run can be resumed later.
with open(csv_path, mode, newline="") as fh, contextlib.suppress(KeyboardInterrupt), \
     torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):

    writer = csv.writer(fh)
    if mode == "w":
        writer.writerow(["file_name", "label", "caption"])

    progress = tqdm(batched(pending, batch_size), desc="Captioning patches")

    for batch in progress:
        images = [ex["image"] for ex in batch]
        inputs = processor(
            images, [prompt] * len(images),
            return_tensors="pt", padding=True
        ).to(model.device)

        gen_ids  = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=True,
            temperature=0.9,
            top_p=0.9,
            repetition_penalty=1.15,
        )
        captions = processor.batch_decode(gen_ids, skip_special_tokens=True)

        for ex, cap in zip(batch, captions):
            fname = f"row_{row_id:06d}.png"
            path  = os.path.join(img_dir, fname)

            ex["image"].convert("RGB").save(path, format="PNG")
            writer.writerow([fname, ex["label"], cap])

            progress.set_postfix_str(cap[:80])  # show caption snippet
            row_id += 1

        # Keep the CSV on disk in step with the saved images so a dropped
        # session loses at most the batch in flight.
        fh.flush()

print("✓ images in", img_dir)
print("✓ CSV    in", csv_path)


Resuming at row_id 24


  torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):


Captioning patches: 0it [00:00, ?it/s]

✓ images in /content/drive/MyDrive/patchcamelyon_captions/images
✓ CSV    in /content/drive/MyDrive/patchcamelyon_captions/captions.csv


GPT-2 fine-tuning

In [None]:
import pandas as pd, torch

# Same pickle as DATA_PATH in the BLIP-2 section; reloaded so this section can run on its own.
# NOTE(review): unpickling can execute arbitrary code — only load pickles you created yourself.
path = "/content/drive/MyDrive/patches_captions/embedded_histopath_dataset_clip_l14.pkl"
df   = pd.read_pickle(path)         # inspect a row
print(df.iloc[0])


file_name                f77a619f6ccc449f915b44bdc0d6d8f0_e02bfe.jpg
pil_image          <PIL.Image.Image image mode=RGB size=300x300 a...
combined_text      Well differentiated tubular adenocarcinoma - L...
image_embedding    [0.06492177, 0.24751252, -0.0429765, 0.1204618...
text_embedding     [0.07974413, -0.013286168, 0.019073578, -0.035...
Name: 0, dtype: object


In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
import numpy as np
import torch.nn as nn
from transformers import GPT2LMHeadModel, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:

# Module-level tokenizer; ClipCaptionDataset reads it when building samples.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token     # GPT-2 has no pad by default

In [None]:
class ClipCaptionDataset(Dataset):
    """Dataset pairing a stored CLIP image embedding with its tokenised caption.

    Args:
        df: DataFrame with the columns ``"image_embedding"`` and
            ``"combined_text"``.
        prefix_length: Stored on the instance; not used by the dataset itself.
        tokenizer: Callable tokenizer applied to the caption. When ``None``
            (default) the module-level ``tokenizer`` is looked up at access
            time, as before.
        max_length: Length captions are padded / truncated to (default 64,
            the previously hard-coded value).
    """

    def __init__(self, df, prefix_length=10, tokenizer=None, max_length=64):
        self.df = df
        self.prefix_length = prefix_length
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"clip", <tokenizer outputs...>, "labels"}`` for row ``idx``."""
        row = self.df.iloc[idx]
        clip = torch.tensor(row["image_embedding"], dtype=torch.float32)

        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        tokens = tok(
            row["combined_text"], return_tensors="pt", padding="max_length",
            max_length=self.max_length, truncation=True
        )

        item = {"clip": clip}
        for key, value in tokens.items():
            item[key] = value.squeeze(0)  # drop the batch dim of a single example
        # NOTE(review): labels are a plain copy of input_ids, so padded
        # positions are not masked with -100 and contribute to the loss.
        item["labels"] = tokens["input_ids"].clone().squeeze(0)
        return item

def collate(batch):
    """Stack the per-sample tensors of ``batch`` into one batch dict.

    Every sample must provide the keys ``clip``, ``input_ids``,
    ``attention_mask`` and ``labels``; each is stacked along a new first axis.
    """
    fields = ("clip", "input_ids", "attention_mask", "labels")
    return {
        name: torch.stack([sample[name] for sample in batch])
        for name in fields
    }

In [None]:
# NOTE(review): the mapper's layer sizes depend on prefix_length, so the value
# used when the mapper is rebuilt for inference must match this one.
prefix_length   = 100          # how many “virtual” tokens your image becomes
clip_dim        = 768         # size of the stored CLIP image embeddings — TODO confirm against the dataset
gpt_embed_dim   = 768         # embedding width of the "gpt2" checkpoint loaded below

# 3-a  load GPT-2 in 8-bit
bnb8 = BitsAndBytesConfig(load_in_8bit=True)
gpt2 = GPT2LMHeadModel.from_pretrained(
    "gpt2", quantization_config=bnb8,
    device_map="auto", torch_dtype=torch.float16
)

# 3-b  add LoRA to GPT-2’s projections
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["c_attn", "c_proj"]      # fused QKV projection plus every module named c_proj
)
gpt2 = get_peft_model(prepare_model_for_kbit_training(gpt2), lora_cfg)
# Ends the cell in eval mode and displays the wrapped model.
# NOTE(review): presumably Trainer switches back to train mode itself — confirm.
gpt2.eval()

PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=768, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddi

In [None]:
class Clip2Prefix(nn.Module):
    """Two-layer MLP that expands one CLIP embedding into a sequence of prefix embeddings.

    The network maps ``clip_dim`` → ``embed_dim * prefix_len // 2`` → ``embed_dim * prefix_len``
    with a Tanh in between; ``forward`` reshapes the result to
    ``(-1, prefix_len, embed_dim)``.
    """

    def __init__(self, clip_dim, embed_dim, prefix_len):
        super().__init__()
        flat_out = embed_dim * prefix_len
        bottleneck = flat_out // 2
        layers = [
            nn.Linear(clip_dim, bottleneck),
            nn.Tanh(),
            nn.Linear(bottleneck, flat_out),
        ]
        self.fc = nn.Sequential(*layers)
        self.prefix_len = prefix_len
        self.embed_dim  = embed_dim

    def forward(self, clip):
        flat = self.fc(clip)
        return flat.view(-1, self.prefix_len, self.embed_dim)

# Build the CLIP → prefix mapper from the dimensions configured in the GPT-2 setup cell.
mapper = Clip2Prefix(clip_dim, gpt_embed_dim, prefix_length)

In [None]:
class ClipCaptionModel(nn.Module):
    """Feeds mapper-generated prefix embeddings plus caption tokens into GPT-2.

    ``forward`` prepends the mapped CLIP prefix to the caption token
    embeddings, extends the attention mask with ones and the labels with -100
    over the prefix positions, and returns whatever the wrapped GPT-2 returns.
    """

    def __init__(self, gpt2, mapper, prefix_length):
        super().__init__()
        self.gpt2   = gpt2      # with LoRA
        self.mapper = mapper
        self.prefix_len = prefix_length

    def forward(self, clip, input_ids, attention_mask, labels):
        # CLIP embedding → prefix embeddings, moved to GPT-2's device.
        prefix_part = self.mapper(clip).to(self.gpt2.device)
        # Caption token ids → GPT-2 token embeddings.
        token_part = self.gpt2.transformer.wte(input_ids)
        combined_embeds = torch.cat((prefix_part, token_part), dim=1)

        # The prefix is always attended to ...
        prefix_attention = attention_mask.new_ones((input_ids.size(0), self.prefix_len))
        combined_mask = torch.cat((prefix_attention, attention_mask), dim=1)

        # ... but excluded from the loss via the ignore index -100.
        prefix_labels = labels.new_full((labels.size(0), self.prefix_len), -100)
        combined_labels = torch.cat((prefix_labels, labels), dim=1)

        return self.gpt2(
            inputs_embeds=combined_embeds,
            attention_mask=combined_mask,
            labels=combined_labels,
        )

# Wrap GPT-2 (with LoRA) and the mapper in one module and move it to the GPU.
model = ClipCaptionModel(gpt2, mapper, prefix_length).to("cuda")


In [None]:
# NOTE(review): redundant — the model is already moved to "cuda" when it is constructed.
model = model.to("cuda")

In [None]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import random_split

# Build the dataset and a seeded 90 % / 10 % train/validation split.
full_ds = ClipCaptionDataset(df, prefix_length)
train_ds, val_ds = random_split(
    full_ds, [0.9, 0.1], generator=torch.Generator().manual_seed(42)
)

args = TrainingArguments(
    output_dir="./gpt2_clip_lora",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
    bf16=True,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_safetensors=False,       # ← fixes the crash (presumably a safetensors checkpoint error — confirm)
    load_best_model_at_end=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate,
)
trainer.train()

# After training finishes, save the wrapper via the Trainer and the mapper weights separately.
# NOTE(review): `model` is the ClipCaptionModel wrapper, so this is not a PEFT
# adapter export; the adapter itself is saved later via model.gpt2.save_pretrained(...).
trainer.save_model("/content/drive/MyDrive/gpt2_clip_histopath_lora")
torch.save(model.mapper.state_dict(),
           "/content/drive/MyDrive/gpt2_clip_histopath_lora/mapper.pt")


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.9887,0.390298
2,0.3804,0.262984
3,0.3016,0.222879
4,0.2662,0.20129
5,0.2438,0.193803


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [None]:
# Store the tokenizer (with pad_token set to eos) next to the trained weights.
tokenizer.save_pretrained("/content/drive/MyDrive/gpt2_clip_histopath_lora")

('/content/drive/MyDrive/gpt2_clip_histopath_lora/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora/vocab.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora/merges.txt',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora/added_tokens.json')

# Caption generation from the fine-tuned GPT-2 model

In [None]:
import os, json, torch
from transformers import GPT2Tokenizer

# Export everything the inference cell needs into one folder.
OUT_DIR = "/content/drive/MyDrive/gpt2_clip_histopath_lora_new"
os.makedirs(OUT_DIR, exist_ok=True)

# (a) LoRA-wrapped GPT-2 held by the wrapper (model.gpt2), written without safetensors.
model.gpt2.save_pretrained(OUT_DIR, safe_serialization=False)

# (b) Weights of the CLIP → prefix mapper.
mapper_path = os.path.join(OUT_DIR, "mapper.pt")
torch.save(model.mapper.state_dict(), mapper_path)

# (c) Hyperparameters required to rebuild the wrapper.
meta_path = os.path.join(OUT_DIR, "meta.json")
meta = {"prefix_length": model.prefix_len}
with open(meta_path, "w") as f:
    json.dump(meta, f)

# (d) Tokenizer, re-created from the base checkpoint with pad_token set to eos.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(OUT_DIR)


('/content/drive/MyDrive/gpt2_clip_histopath_lora_new/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora_new/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora_new/vocab.json',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora_new/merges.txt',
 '/content/drive/MyDrive/gpt2_clip_histopath_lora_new/added_tokens.json')

In [None]:
import os
import json
import torch
import torch.nn as nn
import pandas as pd
from itertools import islice
from datasets import load_dataset
from PIL import Image
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    BitsAndBytesConfig,
    CLIPProcessor,
    CLIPModel,
)
from peft import PeftModel

# ─── 0.  Configuration ─────────────────────────────────────────
OUT_DIR     = "/content/drive/MyDrive/gpt2_clip_histopath_lora_new"
DEVICE      = "cuda"
CLIP_DIM    = 768
EMBED_DIM   = 768
NUM_SAMPLES = 50000    # set to None to process the whole split

# The mapper's layer shapes depend on the prefix length used during training
# (prefix_length = 100 in the training section), so loading mapper.pt with a
# hard-coded 10 would fail with a size mismatch. Read the value that the save
# cell wrote to meta.json; fall back to the previous default of 10 only when
# that file is missing.
_meta_path = os.path.join(OUT_DIR, "meta.json")
if os.path.isfile(_meta_path):
    with open(_meta_path) as _meta_file:
        PREFIX_LEN = int(json.load(_meta_file)["prefix_length"])
else:
    PREFIX_LEN = 10

# ─── 1.  Reload tokenizer (must have been saved after training) ─
tokenizer = GPT2Tokenizer.from_pretrained(OUT_DIR)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no dedicated pad token

# ─── 2.  Load fine-tuned GPT-2 + LoRA adapter in 8-bit ─────────
# Same base checkpoint and quantisation settings as in the training section.
bnb8 = BitsAndBytesConfig(load_in_8bit=True)
base_gpt2 = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    quantization_config=bnb8,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Attach the adapter stored in OUT_DIR and switch to inference mode.
gpt2_lora = (
    PeftModel.from_pretrained(base_gpt2, OUT_DIR)
    .to(DEVICE)
    .eval()
)

# ─── 3.  Rebuild & load the mapper MLP ────────────────────────
class Clip2Prefix(nn.Module):
    """MLP that turns one CLIP embedding into ``prefix_len`` GPT-2 sized embeddings.

    Layer layout (``fc.0`` Linear, ``fc.1`` Tanh, ``fc.2`` Linear) mirrors the
    training-time definition so the saved ``mapper.pt`` state dict loads.
    """

    def __init__(self, clip_dim, embed_dim, prefix_len):
        super().__init__()
        total_width = embed_dim * prefix_len
        half_width = total_width // 2
        self.fc = nn.Sequential(
            nn.Linear(clip_dim, half_width),
            nn.Tanh(),
            nn.Linear(half_width, total_width),
        )
        self.prefix_len = prefix_len
        self.embed_dim  = embed_dim

    def forward(self, clip):
        projected = self.fc(clip)
        return projected.view(-1, self.prefix_len, self.embed_dim)

# Rebuild the mapper with the configured dimensions and restore its trained weights.
# load_state_dict fails with a size mismatch if PREFIX_LEN differs from the
# prefix length the saved mapper was trained with.
mapper = Clip2Prefix(CLIP_DIM, EMBED_DIM, PREFIX_LEN).to(DEVICE)
mapper.load_state_dict(
    torch.load(os.path.join(OUT_DIR, "mapper.pt"), map_location=DEVICE)
)
mapper.eval()

# ─── 4.  Define inference wrapper with correct dtype & mask ────
class ClipCaptionModel(nn.Module):
    """Inference-only wrapper: CLIP embedding → prefix embeddings → GPT-2 ``generate``."""

    def __init__(self, gpt2: PeftModel, mapper: nn.Module, prefix_len: int, pad_token_id: int):
        super().__init__()
        self.gpt2        = gpt2
        self.mapper      = mapper
        self.prefix_len  = prefix_len
        self.pad_token_id = pad_token_id

    @torch.no_grad()
    def generate_caption(self, clip_emb: torch.Tensor, **gen_kwargs):
        """Generate token ids conditioned on ``clip_emb``.

        The embedding is moved to GPT-2's device, mapped to prefix embeddings
        and cast to GPT-2's dtype; an all-ones attention mask of width
        ``prefix_len`` is supplied. ``gen_kwargs`` are forwarded unchanged to
        ``self.gpt2.generate``, whose result is returned.
        """
        target_device = self.gpt2.device

        # Map on GPT-2's device, then match GPT-2's parameter dtype.
        prefix = self.mapper(clip_emb.to(target_device)).to(self.gpt2.dtype)

        # Every prefix position is a real (non-padding) input.
        mask_shape = (prefix.size(0), self.prefix_len)
        prefix_mask = torch.ones(mask_shape, dtype=torch.long, device=target_device)

        return self.gpt2.generate(
            inputs_embeds=prefix,
            attention_mask=prefix_mask,
            pad_token_id=self.pad_token_id,
            **gen_kwargs
        )

# Combine the LoRA-adapted GPT-2 and the restored mapper into the inference wrapper.
caption_model = ClipCaptionModel(
    gpt2_lora,
    mapper,
    prefix_len=PREFIX_LEN,
    pad_token_id=tokenizer.pad_token_id
)

# ─── 5.  Load CLIP model & processor ───────────────────────────
# Used below to embed each PatchCamelyon image before captioning.
clip_model     = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")\
                        .to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# ─── 6.  Load PCam dataset ─────────────────────────────────────
# streaming=False: the split is loaded as a regular (non-streaming) dataset.
ds = load_dataset("1aurent/PatchCamelyon", split="train", streaming=False)

# ─── 7.  Inference loop ────────────────────────────────────────
# Imported here so the cell does not depend on an earlier section's import.
from tqdm.auto import tqdm

results = []
total = len(ds) if NUM_SAMPLES is None else min(NUM_SAMPLES, len(ds))
progress = tqdm(islice(ds, NUM_SAMPLES), total=total, desc="Captioning")

try:
    for ex in progress:
        img, label = ex["image"], ex["label"]

        # a) CLIP embed (L2-normalised)
        clip_inputs = clip_processor(images=img, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            emb = clip_model.get_image_features(**clip_inputs)
            emb = emb / emb.norm(p=2, dim=-1, keepdim=True)

        # b) Generate caption
        gen_ids = caption_model.generate_caption(
            emb,
            max_new_tokens=64,
            do_sample=True,
            temperature=0.9,
            top_p=0.9,
        )
        caption = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

        # c) Show a snippet in the progress bar (instead of printing one line
        #    per sample, which floods the cell output) & store
        progress.set_postfix_str(f"Label={label} → {caption[:80]}")
        results.append({"image": img, "label": label, "caption": caption})
except KeyboardInterrupt:
    # Stopping the cell by hand keeps what was generated so far, so the
    # DataFrame below is still built from the partial results.
    print(f"Interrupted after {len(results):,} samples; keeping partial results.")

# ─── 8.  Build DataFrame ────────────────────────────────────────
df = pd.DataFrame(results)
print(df.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Label=True →  cell adenocarcinoma - In the superficial epithelium, tumor tissue that invades by forming medium-sized to small, irregular ducts is observed. Tumor cells are highly columnar, with nuclei aligned basolaterally and polarized. Large tumor cells show diffusely fused papillary duct
Label=True →  differentiated adenocarcinoma - In the superficial epithelium, tumor tissue that invades by forming medium-sized to small, irregular ducts is observed. Tumor cells are highly columnar, with nuclei aligned basolaterally and polarized.
Label=True →  Tumor cells are large, highly columnar, large club-shaped nuclei, and are associated with chromatin aggregation. Tumor cells exhibit dense, oligosapotropic nuclei, and are associated with chromatin aggregation.
Label=True → et adenocarcinoma - Tumor tissue consisting of medium-sized and irregular glandular ducts fused and infiltrated is observed in the superficial epithelium. Tu

KeyboardInterrupt: 

In [None]:
# Build the DataFrame from whatever was collected before the loop above was interrupted.
df = pd.DataFrame(results)
print(df.head())

                                               image  label  \
0  <PIL.PngImagePlugin.PngImageFile image mode=RG...  False   
1  <PIL.PngImagePlugin.PngImageFile image mode=RG...   True   
2  <PIL.PngImagePlugin.PngImageFile image mode=RG...   True   
3  <PIL.PngImagePlugin.PngImageFile image mode=RG...   True   
4  <PIL.PngImagePlugin.PngImageFile image mode=RG...  False   

                                             caption  
0  et adenocarcinoma - In the superficial epithel...  
1   cell growth in which a large sheet-like nucle...  
2  atinoma - On the superficial epithelium, tumor...  
3  atinoma - Tumor cells are large, highly column...  
4  enocarcinoma - Tumor cells are large, highly c...  


In [None]:
df.head()

Unnamed: 0,image,label,caption
0,<PIL.PngImagePlugin.PngImageFile image mode=RG...,False,et adenocarcinoma - In the superficial epithel...
1,<PIL.PngImagePlugin.PngImageFile image mode=RG...,True,cell growth in which a large sheet-like nucle...
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,True,"atinoma - On the superficial epithelium, tumor..."
3,<PIL.PngImagePlugin.PngImageFile image mode=RG...,True,"atinoma - Tumor cells are large, highly column..."
4,<PIL.PngImagePlugin.PngImageFile image mode=RG...,False,"enocarcinoma - Tumor cells are large, highly c..."


In [None]:
# Number of captioned rows. Only the value of the last expression is displayed;
# the result of len(df) on the first line is discarded.
len(df)
# or
df.shape[0]


14311

In [None]:
# Persist the captioned DataFrame (columns: image, label, caption).
# NOTE(review): the embeddings section reads a differently named pickle
# ("..._df_final_final.pkl") — confirm which file is the intended input.
df.to_pickle("/content/drive/MyDrive/patchcamelyon_captions_df_final_final_new_new.pkl")

Embeddings

In [None]:
!pip install -qU sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/345.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, torch
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Settings for the embedding stage.
DEVICE   = "cuda"  # device both embedding models are placed on
# NOTE(review): this is NOT the "..._final_final_new_new.pkl" file written
# above (the frame loaded from here has 500 rows) — confirm it is the intended input.
IN_PATH  = "/content/drive/MyDrive/patchcamelyon_captions_df_final_final.pkl"
# NOTE(review): not written to in the cells visible in this section.
OUT_PATH = "/content/drive/MyDrive/patchcamelyon_with_embs.pkl"
BATCH    = 32  # batch size shared by the image and text encoders

In [None]:
# Load the captions frame to embed. Unpickling executes arbitrary code, so
# IN_PATH must point to a file you created yourself.
df = pd.read_pickle(IN_PATH)

In [None]:
print(len(df))


500


In [None]:
# Load the CLIP ViT-L/14 checkpoint, move it to DEVICE and switch it to eval
# mode; the matching processor handles image preprocessing.
clip_model     = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")\
                         .to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def batch(iterable, n):
    """Yield successive lists of at most ``n`` items from ``iterable``.

    Args:
        iterable: any iterable; it is consumed lazily, one chunk at a time.
        n: maximum chunk size, must be >= 1.

    Yields:
        Lists of up to ``n`` consecutive items. The final list may be shorter.
        Nothing is yielded for an empty iterable.

    Raises:
        ValueError: if ``n`` is smaller than 1 (raised on first iteration).
    """
    # Local import keeps this cell runnable on its own.
    from itertools import islice

    if n < 1:
        raise ValueError("n must be at least 1")

    it = iter(iterable)
    # islice pulls the next n items from the shared iterator; an empty list
    # means the iterator is exhausted and ends the loop.
    while chunk := list(islice(it, n)):
        yield chunk

In [None]:
# Encode every image with CLIP in batches of BATCH and collect the
# L2-normalised image features as one NumPy array.
all_img_embs = []
for i in tqdm(range(0, len(df), BATCH), desc="Image → CLIP"):
    imgs = df["image"].iloc[i : i + BATCH].tolist()
    clip_in = clip_processor(images=imgs, return_tensors="pt", padding=True)
    # Move every processor output tensor to the model's device.
    clip_in = {k: v.to(DEVICE) for k, v in clip_in.items()}
    with torch.no_grad():
        emb = clip_model.get_image_features(**clip_in)
        # Scale each embedding to unit L2 norm along the feature dimension.
        emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    # Keep results on CPU so per-batch tensors do not accumulate on DEVICE.
    all_img_embs.append(emb.cpu())
# The name is rebound from a list of per-batch tensors to a single array.
all_img_embs = torch.cat(all_img_embs, dim=0).numpy()

Image → CLIP:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Encode all captions with the all-MiniLM-L6-v2 sentence encoder, using the
# same batch size as the image pass, and return a NumPy array.
text_model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
texts      = df["caption"].tolist()
all_txt_embs = text_model.encode(
    texts,
    batch_size=BATCH,
    show_progress_bar=True,
    convert_to_numpy=True,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

LLaVA fine-tuning

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image

from transformers import (
    LlavaForConditionalGeneration,
    LlavaProcessor,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)

In [None]:
# Fine-tuning configuration.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being stored in the notebook. When the variable is unset the value
# is None and the Hub is accessed anonymously.
# NOTE(review): a literal token used to be committed in this cell — revoke it
# in the Hugging Face account settings, since it remains in the file history.
HF_TOKEN = os.environ.get("HF_TOKEN")
OUT_DIR  = "llava-pcam-finetuned"  # Trainer output/checkpoint directory
DEVICE   = "cuda"

In [None]:
# Reload the captions frame saved earlier in this notebook. Unpickling executes
# arbitrary code, so only load files you created yourself.
df = pd.read_pickle("/content/drive/MyDrive/patchcamelyon_captions_df_final_final_new_new.pkl")

In [None]:
# Cast the label column to integers so the classes can be selected with
# == 1 / == 0 below. The column is overwritten in place on `df`.
df['label'] = df['label'].astype(int)

In [None]:
# Draw a class-balanced training subset: 2,500 rows per class with a fixed
# seed. `sample` raises if a class has fewer than 2,500 rows.
df_pos = df[df['label'] == 1].sample(n=2500, random_state=42)
df_neg = df[df['label'] == 0].sample(n=2500, random_state=42)

# Combine both classes, shuffle with the same seed, and renumber the index.
df_subset = pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Quantisation settings: load the weights in 4-bit.
# NOTE(review): the two `llm_int8_*` options are named after the 8-bit path —
# confirm they have any effect together with load_in_4bit=True.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

# Load the LLaVA-1.5 7B checkpoint with the quantisation config above, fp16 as
# the torch dtype, and automatic device placement.
base_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
    token=HF_TOKEN,
)

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
# Prepare the quantised model for training and attach LoRA adapters.
model = prepare_model_for_kbit_training(base_model)
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Modules are selected by name. The model printout further down shows
    # adapters on the CLIP vision tower's v_proj as well, so these names are
    # not restricted to the language model.
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

# ─── 4. Freeze non-LoRA & enable gradient checkpointing ─────────
# Every parameter whose name does not contain "lora" is frozen.
# NOTE(review): get_peft_model presumably freezes the base weights already,
# making this loop a safety net — confirm.
for name, param in model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False

model.gradient_checkpointing_enable()

In [None]:
# Load the processor that matches the base checkpoint, and keep a direct
# handle on its tokenizer for label encoding and padding.
processor = LlavaProcessor.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    token=HF_TOKEN
)
tokenizer = processor.tokenizer

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

In [None]:
class PCamCaptionDataset(Dataset):
    """Image + caption → single-digit class samples for LLaVA fine-tuning.

    Each item is the tokenised prompt followed by ONE appended answer token
    (the digit of the class). Only that answer token is supervised; every
    prompt position carries the ignore index -100.
    """

    def __init__(self, df, processor, tokenizer, max_length=512):
        """Store the data frame and the preprocessing objects.

        Args:
            df: frame with "image", "caption" and "label" columns; its index
                is reset so positional access matches ``__len__``.
            processor: callable invoked as ``processor(images=..., text=...,
                return_tensors="pt", ...)`` whose result exposes
                ``input_ids``, ``attention_mask`` and ``pixel_values``.
            tokenizer: object with ``encode(text, add_special_tokens=False)``.
            max_length: stored for callers but not applied — the prompt is
                deliberately encoded with ``truncation=False``.
        """
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build one training sample.

        Returns:
            dict with 1-D ``input_ids``, ``attention_mask`` and ``labels`` of
            equal length (prompt length + 1), plus ``pixel_values`` with the
            processor's leading batch dimension removed.
        """
        row     = self.df.iloc[idx]
        image   = row["image"]
        caption = row["caption"]
        label   = int(row["label"])

        prompt = (
            "<image>\n"
            "### Instruction:\n"
            "Given the following image caption, output the correct class (0 or 1).\n\n"
            f"Caption: {caption}\n\n"
            "### Response:"
        )

        enc = self.processor(
            images=image,
            text=prompt,
            return_tensors="pt",
            padding=False,
            truncation=False,
        )
        prompt_ids   = enc.input_ids.squeeze(0)
        prompt_mask  = enc.attention_mask.squeeze(0)
        pixel_values = enc.pixel_values.squeeze(0)

        # The tokenizer may prepend a whitespace piece (e.g. ['▁', '1']);
        # the last id is the digit itself.
        label_token_id = self.tokenizer.encode(str(label), add_special_tokens=False)[-1]

        # Causal-LM loss shifts labels by one (the logits at position t are
        # scored against the label at t+1). The answer therefore has to be a
        # real token placed AFTER the prompt. Writing it over the final prompt
        # position would train the model to emit the digit one step before
        # "### Response:" ends, leaving the position used at inference untrained.
        input_ids      = torch.cat([prompt_ids, prompt_ids.new_tensor([label_token_id])])
        attention_mask = torch.cat([prompt_mask, prompt_mask.new_ones(1)])

        # Supervise only the appended answer token.
        labels = torch.full_like(input_ids, -100)
        labels[-1] = label_token_id

        return {
            "input_ids":      input_ids,
            "attention_mask": attention_mask,
            "pixel_values":   pixel_values,
            "labels":         labels,    # exactly one non-ignored position
        }


In [None]:
# Wrap the balanced subset in the dataset class defined above.
train_dataset = PCamCaptionDataset(df_subset, processor, tokenizer)

In [None]:
# Training loader with 8 persistent worker processes, pinned memory and
# prefetching.
# NOTE(review): `collate_fn` is defined in a LATER cell of this notebook; on a
# fresh kernel this cell fails with NameError unless that cell is run first.
# No `shuffle` argument is passed, so batches follow dataset order (df_subset
# was already shuffled when it was built).
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    collate_fn=collate_fn,
    num_workers=8,
    pin_memory=True,
    prefetch_factor=2,
    persistent_workers=True,
)

In [None]:
# Sanity check on one batch: the labels should contain only -100 and the two
# digit token ids, with exactly one supervised position per sample.
# Note: this rebinds `batch`, shadowing the chunking helper of the same name
# defined in the embeddings section.
batch = next(iter(train_dataloader))
print("labels unique:", torch.unique(batch["labels"]))
print("non‐ignored count:", (batch["labels"] != -100).sum())


labels unique: tensor([ -100, 29896, 29900])
non‐ignored count: tensor(16)


In [None]:
# Inspect how the tokenizer splits the class digits, with and without a
# leading space. The output shows each digit is preceded by a separate
# whitespace piece, so the digit's own id is the LAST id of the encoding.
print("‘0’ →", tokenizer.tokenize("0"), tokenizer("0", add_special_tokens=False).input_ids)
print("‘ 0’→", tokenizer.tokenize(" 0"), tokenizer(" 0", add_special_tokens=False).input_ids)
print("‘1’ →", tokenizer.tokenize("1"), tokenizer("1", add_special_tokens=False).input_ids)
print("‘ 1’→", tokenizer.tokenize(" 1"), tokenizer(" 1", add_special_tokens=False).input_ids)


‘0’ → ['▁', '0'] [29871, 29900]
‘ 0’→ ['▁▁', '0'] [259, 29900]
‘1’ → ['▁', '1'] [29871, 29896]
‘ 1’→ ['▁▁', '1'] [259, 29896]


In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge per-sample dicts into one right-padded batch.

    Args:
        batch: list of dicts with 1-D "input_ids", "attention_mask" and
            "labels" tensors and an equally-shaped "pixel_values" tensor.

    Returns:
        dict with the three sequence fields padded to the longest sample in
        the batch (pad token id / 0 / -100 respectively) and the stacked
        "pixel_values".
    """
    def _padded(field, fill_value):
        # pad_sequence appends the fill value after each shorter sequence.
        return pad_sequence(
            [sample[field] for sample in batch],
            batch_first=True,
            padding_value=fill_value,
        )

    return {
        "input_ids":      _padded("input_ids", tokenizer.pad_token_id),
        "attention_mask": _padded("attention_mask", 0),
        # Images share one shape, so a plain stack is enough.
        "pixel_values":   torch.stack([sample["pixel_values"] for sample in batch]),
        "labels":         _padded("labels", -100),
    }


In [None]:
# Training hyper-parameters: 3 epochs, fp16, a checkpoint every 200 steps, and
# no external experiment reporting (report_to=[]).
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=20,
    save_strategy="steps",
    save_steps=200,
    report_to=[],
)

# ─── 9. DataLoader with faster settings ────────────────────────
# Disabled duplicate of the loader built in an earlier cell; that earlier
# `train_dataloader` is the one MyTrainer returns below.
# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=training_args.per_device_train_batch_size,
#     collate_fn=collate_fn,
#     num_workers=8,
#     pin_memory=True,
#     prefetch_factor=2,
#     persistent_workers=True,
# )

# ─── 10. Custom Trainer to use our DataLoader ─────────────────
class MyTrainer(Trainer):
    """Trainer that always trains from the notebook-level `train_dataloader`."""

    def get_train_dataloader(self):
        # Returns the global loader as-is, so its batch size (not
        # per_device_train_batch_size) determines the actual batch size.
        return train_dataloader

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=None,       # we override the loader
    data_collator=collate_fn, # still used internally
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the fine-tuning loop; the logged run below took roughly five hours
# (train_runtime ≈ 18,100 s for 939 steps).
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,4.0428
40,0.6726
60,0.69
80,0.6237
100,0.6345
120,0.6323
140,0.5401
160,0.636
180,0.5822
200,0.4723


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TrainOutput(global_step=939, training_loss=0.506467804487267, metrics={'train_runtime': 18102.6982, 'train_samples_per_second': 0.829, 'train_steps_per_second': 0.052, 'total_flos': 4.328751500547195e+17, 'train_loss': 0.506467804487267, 'epoch': 3.0})

In [None]:
import os
# Disable Weights & Biases logging for this process.
# NOTE(review): this cell sits AFTER trainer.train(), and TrainingArguments
# already passes report_to=[] — it likely has no effect here; confirm and
# move it before training or drop it.
os.environ["WANDB_MODE"] = "disabled"


In [None]:
# Save the fine-tuned PEFT model to Google Drive.
model.save_pretrained("/content/drive/MyDrive/llava-pcam-finetuned_patchcam_new")

In [None]:
# Save the processor (including its tokenizer) next to the model so both can
# be reloaded from one directory.
processor.save_pretrained("/content/drive/MyDrive/llava-pcam-finetuned_patchcam_new")

Test data preparation

In [None]:
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of storing it in the notebook. When unset this is None and the Hub is
# accessed anonymously.
# NOTE(review): a literal token used to be committed in this cell — revoke it
# in the Hugging Face account settings, since it remains in the file history.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [None]:
import pickle
import pandas as pd
from PIL import Image
import numpy as np

In [None]:
from google.colab import drive
# Mount Google Drive; force_remount=True remounts even if it is already mounted.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
# Load the pickled test bundle from Drive.
# Unpickling executes arbitrary code — only load files you produced yourself.
PICKLE_IN = "/content/drive/MyDrive/patchcamelyon_embeddings_testdata500.pkl"
with open(PICKLE_IN, "rb") as f:
    test_data = pickle.load(f)

In [None]:
# Unpack the test bundle into parallel sequences; the two embedding fields are
# converted to float32 NumPy arrays.
test_images   = test_data["images"]              # list of PIL.Image
test_captions = test_data["captions"]            # list of str
# NOTE(review): the evaluation output prints these as True/False, so they may
# be bools rather than ints — confirm.
test_labels   = test_data["labels"]              # list of int
test_img_embs = np.array(test_data["image_embeddings"], dtype=np.float32)
test_txt_embs = np.array(test_data["text_embeddings"], dtype=np.float32)

In [None]:
import torch
import numpy as np
from transformers import BitsAndBytesConfig, LlavaForConditionalGeneration, LlavaProcessor
from peft import PeftModel

In [None]:
# Directory holding the fine-tuned adapter to evaluate.
# NOTE(review): this "..._correctloss" path differs from the directory the
# training section saves to ("..._patchcam_new") — confirm which run is meant.
OUT_DIR     = "/content/drive/MyDrive/llava-pcam-finetuned_patchcam_new_correctloss"  # <-- change to wherever you saved your checkpoint
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Same quantisation settings as used for training: 4-bit weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

# ─── 2) Load base + PEFT model ───────────────────────────────────
# First the quantised base checkpoint, then the LoRA adapter stored in OUT_DIR
# on top of it.
base_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    token=HF_TOKEN,
)
model = PeftModel.from_pretrained(
    base_model,
    OUT_DIR,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Inference mode (disables dropout); as the cell's last expression this also
# prints the full module tree.
model.eval()

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlavaForConditionalGeneration(
      (vision_tower): CLIPVisionModel(
        (vision_model): CLIPVisionTransformer(
          (embeddings): CLIPVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
            (position_embedding): Embedding(577, 1024)
          )
          (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder): CLIPEncoder(
            (layers): ModuleList(
              (0-23): 24 x CLIPEncoderLayer(
                (self_attn): CLIPSdpaAttention(
                  (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
                  (v_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
          

In [None]:
from transformers import LlavaProcessor

# Directory the processor was saved to (same path as the adapter's OUT_DIR).
PROC_DIR = "/content/drive/MyDrive/llava-pcam-finetuned_patchcam_new_correctloss"

# Reload the processor (and its tokenizer) from that directory.
processor = LlavaProcessor.from_pretrained(PROC_DIR)

# Direct handle on the tokenizer, used below to decode generated ids.
tokenizer = processor.tokenizer


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
import re

# Evaluate the fine-tuned model with the training prompt format.
# `preds` gets one entry per test sample: 0 or 1, or -1 when the generated
# text is not a single class digit.
preds = []
for img, cap in zip(test_images, test_captions):
    prompt = (
        "<image>\n"
        "### Instruction:\n"
        "Given the following image caption, output the correct class (0 or 1).\n\n"
        f"Caption: {cap}\n\n"
        "### Response:"
    )

    inputs = processor(
        images=img,
        text=prompt,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    with torch.no_grad():
        out_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            pixel_values=inputs.pixel_values,
            # The tokenizer splits a digit into a whitespace piece plus the
            # digit itself, so a single new token can be just the whitespace
            # piece; allow two, as the few-shot cells below do.
            max_new_tokens=2,
            do_sample=False,
        )

    # Decode only the continuation. Parsing the echoed prompt is fragile
    # because the caption text is part of it.
    prompt_len = inputs.input_ids.shape[-1]
    answer = tokenizer.decode(out_ids[0, prompt_len:], skip_special_tokens=True).strip()
    words = answer.split()
    label = int(words[0]) if words and words[0] in ("0", "1") else -1

    preds.append(label)

# One summary line instead of dumping the full prompt twice per sample, which
# made the cell output overflow Colab's streaming limit.
print(f"Unparseable predictions: {preds.count(-1)} / {len(preds)}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

### Response:
pred string end ### Instruction:
Given the following image caption, output the correct class (0 or 1).

Caption: etoid infiltrating tumor tissue showing a large sheet-like shape and growing solidified. Well differentiated tubular adenocarcinoma

### Response:
pred string start ### Instruction:
Given the following image caption, output the correct class (0 or 1).

Caption: et al, solid type - The superficial epithelium shows a large sheet-like shape, and tumor tissue infiltrating with small irregular ducts shows solid type. Well differentiated tubular adenocarcinoma

### Response:1
pred string end ### Instruction:
Given the following image caption, output the correct class (0 or 1).

Caption: et al, solid type - The superficial epithelium shows a large sheet-like shape, and tumor tissue infiltrating with small irregular ducts shows solid type. Well differentiated tubular adenocarcinoma

### Response:1
pred s

In [None]:
preds

[-1, 1, 0, 1]

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report
import numpy as np

# Convert to numpy arrays for easy masking
preds_np  = np.array(preds)
labels_np = np.array(test_labels)

# Create a mask for only the valid predictions (i.e. preds != -1)
valid_mask = preds_np != -1

# Apply the mask. Unparseable predictions are EXCLUDED rather than counted as
# errors, so every metric below is computed on the valid subset only and
# should be read together with the "Valid samples" line.
preds_valid  = preds_np[valid_mask]
labels_valid = labels_np[valid_mask]

# Now compute metrics
acc    = accuracy_score(labels_valid, preds_valid)
recall = recall_score(labels_valid, preds_valid)      # recall for class “1”
f1     = f1_score(labels_valid, preds_valid)          # F1 for class “1”

print(f"Valid samples: {len(preds_valid)} / {len(preds_np)}")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Recall (class 1): {recall*100:.2f}%")
print(f"F1 (class 1): {f1*100:.2f}%\n")

# Full breakdown (precision/recall/F1 for both classes):
print(classification_report(labels_valid, preds_valid, digits=4))


Valid samples: 334 / 500
Accuracy: 79.94%
Recall (class 1): 79.08%
F1 (class 1): 84.94%

              precision    recall  f1-score   support

       False     0.6094    0.8211    0.6996        95
        True     0.9175    0.7908    0.8494       239

    accuracy                         0.7994       334
   macro avg     0.7634    0.8059    0.7745       334
weighted avg     0.8298    0.7994    0.8068       334



In [None]:
# Install the CPU build of FAISS for nearest-neighbour retrieval.
# NOTE(review): the version is not pinned.
!pip install faiss-cpu



In [None]:
import os, json, pickle
import numpy as np
import faiss

In [None]:
# 1) Paths for your new RAG store
BASE_PATH  = "/content/drive/MyDrive/pcam_rag_index_withlables_new/pcam_rag_index_withlables_new"
INDEX_PATH = os.path.join(BASE_PATH, "caption_image_index.faiss")
META_PATH  = os.path.join(BASE_PATH, "metadata.jsonl")

In [None]:
# Load the FAISS index and its metadata (one JSON object per line).
index = faiss.read_index(INDEX_PATH)
# A context manager closes the file handle; blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of breaking json.loads.
with open(META_PATH, "r", encoding="utf-8") as meta_file:
    docs = [json.loads(line) for line in meta_file if line.strip()]
print(f"Index: {index.ntotal} vectors; metadata entries: {len(docs)}")

# Retrieval looks up docs[idx] by FAISS row position, so the two must line up.
if index.ntotal != len(docs):
    raise ValueError(
        f"FAISS index has {index.ntotal} vectors but metadata has {len(docs)} entries"
    )

Index: 14311 vectors; metadata entries: 14311


In [None]:
# Reload the pickled test bundle (same file and code as in the test-data
# section above, repeated so this section can run on its own).
# Unpickling executes arbitrary code — only load files you produced yourself.
PICKLE_IN = "/content/drive/MyDrive/patchcamelyon_embeddings_testdata500.pkl"
with open(PICKLE_IN, "rb") as f:
    test_data = pickle.load(f)

In [None]:
# Unpack the test bundle into parallel sequences; the two embedding fields are
# converted to float32 NumPy arrays, as the FAISS calls below operate on them.
test_images   = test_data["images"]              # list of PIL.Image
test_captions = test_data["captions"]            # list of str
test_labels   = test_data["labels"]              # list of int
test_img_embs = np.array(test_data["image_embeddings"], dtype=np.float32)
test_txt_embs = np.array(test_data["text_embeddings"], dtype=np.float32)

In [None]:
# L2-normalise each embedding row; the calls modify the arrays in place,
# which is why their return values are not assigned.
faiss.normalize_L2(test_img_embs)
faiss.normalize_L2(test_txt_embs)

In [None]:
# Fusion weights for the retrieval query and the number of neighbours to fetch.
# With alpha = 1.0, beta is 0.0, so the text half of every query vector is all
# zeros and retrieval is driven by the image embedding alone.
alpha = 1.0          # weight for image
beta  = 1.0 - alpha  # weight for text
k     = 10

In [None]:
# Width of one image embedding. Not referenced by the cells visible below.
img_dim = test_img_embs.shape[1]

In [None]:
# For every test sample, build a fused [image | text] query vector and fetch
# its top-k neighbours from the FAISS index. `rag_results[i]` is the list of
# neighbour dicts (id, caption, label, score) for test sample i.
rag_results = []
for i in range(len(test_img_embs)):
    # weighted fusion
    q_img = test_img_embs[i] * alpha
    q_txt = test_txt_embs[i] * beta
    q = np.concatenate([q_img, q_txt], axis=0).reshape(1, -1)
    # Re-normalise the concatenated vector in place before searching.
    faiss.normalize_L2(q)

    # search
    D, I = index.search(q, k)
    neighs = []
    for score, idx in zip(D[0], I[0]):
        # FAISS fills missing results with index -1 when fewer than k
        # neighbours are found; docs[-1] would silently return the LAST
        # document, so skip those slots.
        if idx < 0:
            continue
        entry = docs[idx]
        neighs.append({
            "id":      entry.get("id"),
            "caption": entry.get("caption"),
            "label":   entry.get("label"),
            "score":   float(score),
            # If you stored the combined embeddings you could also
            # reconstruct them here with index.reconstruct(idx)
        })
    rag_results.append(neighs)

print(f"Retrieved top-{k} neighbors for each of {len(rag_results)} test samples")

Retrieved top-10 neighbors for each of 500 test samples


In [None]:
def make_prompt(test_caption, neighbors):
    """Assemble the few-shot classification prompt for one test sample.

    Args:
        test_caption: caption of the sample to classify.
        neighbors: iterable of dicts with "caption" and "label" keys, rendered
            in order as worked examples.

    Returns:
        The prompt string: image placeholder and instruction, an
        "### Examples:" section with one Caption/Answer pair per neighbour,
        and an "### Query:" section ending in an open "Answer:".
    """
    header = (
        "<image>\n"
        "### Instruction:\n"
        "You are a medical assistant trained to classify histopathologic images as tumor (1) or normal (0).\n"
        "Only respond with a single digit (0 or 1), no explanations.\n"
        "Think step-by-step based on cellular structure and pathology cues.\n\n"
    )

    # One "Caption / Answer" pair per retrieved neighbour.
    shots = "".join(
        f"Caption: {shot['caption']}\nAnswer: {shot['label']}\n\n"
        for shot in neighbors
    )

    tail = f"### Query:\nCaption: {test_caption}\nAnswer:"

    return "".join([header, "### Examples:\n", shots, tail])


In [None]:
import json
import torch
from PIL import Image

In [None]:
@torch.no_grad()
def predict_with_few_shot(idx):
    """Classify test sample ``idx`` with a retrieval-augmented few-shot prompt.

    Uses the neighbours stored in ``rag_results[idx]`` as in-context examples,
    runs beam-search generation, and decodes only the newly generated tokens.

    Args:
        idx: position of the sample in the test lists.

    Returns:
        The first whitespace-separated word of the model's answer, or an empty
        string when nothing was generated, so callers can always use string
        methods such as ``.isdigit()`` on the result.
    """
    neighbors = rag_results[idx]  # retrieved example dicts for this sample

    prompt = make_prompt(test_captions[idx], neighbors)

    inputs = processor(
        images=test_images[idx],
        text=prompt,
        return_tensors="pt",
        padding=True,
        truncation=False,   # ensure the <image> token is never dropped
    ).to(DEVICE)

    gen_ids = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pixel_values=inputs.pixel_values,
        max_new_tokens=2,                # allow up to “<digit>” + EOS
        num_beams=3,                     # beam search for a cleaner single‐token answer
        do_sample=False,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Everything after the prompt length is newly generated.
    input_len  = inputs.input_ids.shape[-1]
    new_tokens = gen_ids[0, input_len:].cpu().tolist()
    out_text   = processor.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Take the first “word” in case there is trailing text; an empty
    # generation yields "" rather than None.
    words = out_text.split()
    return words[0] if words else ""


In [None]:
# Run the few-shot predictor over the whole test set.
preds = []
for i in range(len(test_images)):
    p = predict_with_few_shot(i)
    # Anything other than a clean "0"/"1" (empty output, None, stray text) is
    # stored as -1, the sentinel the metrics cell masks out with
    # `preds_np != -1`. Membership testing is also safe when p is None.
    preds.append(int(p) if p in ("0", "1") else -1)
    print(f"[{i}] True={test_labels[i]} → Pred={preds[-1]}")

# Accuracy over ALL samples; unparseable answers count as wrong.
acc = sum(p == t for p, t in zip(preds, test_labels)) / len(test_labels)
# Report the actual number of retrieved examples instead of a fixed "10".
print(f"\n{k}-shot RAG→LLaVA accuracy: {acc:.1%}")




pred string: 0
pred end:
0
[0] True=False → Pred=0
pred string: 1
pred end:
1
[1] True=True → Pred=1
pred string: 0
pred end:
0
[2] True=False → Pred=0
pred string: 1
pred end:
1
[3] True=True → Pred=1
pred string: 0
pred end:
0
[4] True=True → Pred=0
pred string: 1
pred end:
1
[5] True=False → Pred=1
pred string: 1
pred end:
1
[6] True=True → Pred=1
pred string: 1
pred end:
1
[7] True=True → Pred=1
pred string: 1
pred end:
1
[8] True=True → Pred=1
pred string: 0
pred end:
0
[9] True=False → Pred=0
pred string: 0
pred end:
0
[10] True=True → Pred=0
pred string: 0
pred end:
0
[11] True=False → Pred=0
pred string: 1
pred end:
1
[12] True=True → Pred=1
pred string: 1
pred end:
1
[13] True=True → Pred=1
pred string: 0
pred end:
0
[14] True=False → Pred=0
pred string: 0
pred end:
0
[15] True=True → Pred=0
pred string: 1
pred end:
1
[16] True=False → Pred=1
pred string: 1
pred end:
1
[17] True=True → Pred=1
pred string: 1
pred end:
1
[18] True=True → Pred=1
pred string: 0
pred end:
0
[19] Tr

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report
import numpy as np

# Convert to numpy arrays for easy masking. The prediction loop may store None
# (or other non-class values) for unparseable answers; map everything that is
# not 0/1 to -1 so the array stays numeric and the mask below catches it.
preds_np  = np.array([p if p in (0, 1) else -1 for p in preds])
labels_np = np.array(test_labels)

# Create a mask for only the valid predictions (i.e. preds != -1)
valid_mask = preds_np != -1

# Apply the mask. Invalid predictions are excluded, so the metrics below
# describe the valid subset only — read them with the "Valid samples" line.
preds_valid  = preds_np[valid_mask]
labels_valid = labels_np[valid_mask]

# Now compute metrics
acc    = accuracy_score(labels_valid, preds_valid)
recall = recall_score(labels_valid, preds_valid)      # recall for class “1”
f1     = f1_score(labels_valid, preds_valid)          # F1 for class “1”

print(f"Valid samples: {len(preds_valid)} / {len(preds_np)}")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Recall (class 1): {recall*100:.2f}%")
print(f"F1 (class 1): {f1*100:.2f}%\n")

# Full breakdown (precision/recall/F1 for both classes):
print(classification_report(labels_valid, preds_valid, digits=4))


Valid samples: 500 / 500
Accuracy: 74.20%
Recall (class 1): 83.90%
F1 (class 1): 77.64%

              precision    recall  f1-score   support

       False     0.7737    0.6309    0.6950       233
        True     0.7226    0.8390    0.7764       267

    accuracy                         0.7420       500
   macro avg     0.7481    0.7349    0.7357       500
weighted avg     0.7464    0.7420    0.7385       500



In [None]:
def build_prompt(test_caption, neighbors):
    """Assemble the "pathologist" variant of the few-shot prompt.

    Args:
        test_caption: caption of the sample to classify.
        neighbors: iterable of dicts with "caption" and "label" keys, rendered
            in order as worked examples.

    Returns:
        The prompt string: image placeholder, "### System:" instruction, an
        "### Examples:" section with one Caption/Diagnosis pair per neighbour,
        and an "### Query:" section ending in an open "Diagnosis:".
    """
    pieces = [
        # The image placeholder always comes first.
        "<image>\n",
        # System instruction.
        "### System:\n"
        "You are a board-certified digital pathologist.\n"
        "Given a histopathology image and its textual caption, decide whether there is tumor (1) or normal tissue (0).\n"
        "Think step by step (nuclear morphology → glandular architecture → invasion patterns), then give only the final label (0 or 1).\n\n",
        # Few-shot section header.
        "### Examples:\n",
    ]

    # One "Caption / Diagnosis" pair per retrieved neighbour.
    pieces.extend(
        f"Caption: {ex['caption']}\nDiagnosis: {ex['label']}\n\n"
        for ex in neighbors
    )

    # The query, left open for the model to complete.
    pieces.append(f"### Query:\nCaption: {test_caption}\nDiagnosis:")

    return "".join(pieces)


In [None]:
@torch.no_grad()
def predict_with_few_shot(idx):
    """Classify test sample ``idx`` using the ``build_prompt`` few-shot prompt.

    Note: this redefines (and replaces) the function of the same name from the
    earlier cell; the difference is the prompt builder and the pad token id.

    Args:
        idx: position of the sample in the test lists.

    Returns:
        The first whitespace-separated word of the model's answer, or an empty
        string when nothing was generated.
    """
    neighbors = rag_results[idx]  # list of dicts with 'caption' & 'label'

    # build the text prompt
    prompt = build_prompt(test_captions[idx], neighbors)

    # run LLaVA
    inputs = processor(
        images=test_images[idx],
        text=prompt,
        return_tensors="pt",
        padding=True,
        truncation=False,   # keep the <image> placeholder
    ).to(DEVICE)

    gen = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pixel_values=inputs.pixel_values,
        max_new_tokens=2,
        num_beams=3,
        do_sample=False,
        pad_token_id=processor.tokenizer.eos_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # extract just the newly generated tokens
    in_len   = inputs.input_ids.shape[-1]
    new_ids  = gen[0, in_len:].cpu().tolist()
    out_text = processor.tokenizer.decode(new_ids, skip_special_tokens=True).strip()

    # Grab the first word. Indexing split()[0] directly raises IndexError when
    # the model generates only special/whitespace tokens, so fall back to "".
    words = out_text.split()
    return words[0] if words else ""


In [None]:
# Run the few-shot predictor over the whole test set.
preds = []
for i in range(len(test_images)):
    p = predict_with_few_shot(i)
    # Anything other than a clean "0"/"1" (empty output, None, stray text) is
    # stored as -1, the same sentinel the metrics cells in this notebook mask
    # out with `preds_np != -1`.
    preds.append(int(p) if p in ("0", "1") else -1)
    print(f"[{i}] True={test_labels[i]} → Pred={preds[-1]}")

# Accuracy over ALL samples; unparseable answers count as wrong.
acc = sum(p == t for p, t in zip(preds, test_labels)) / len(test_labels)
# Report the actual number of retrieved examples instead of a fixed "10".
print(f"\n{k}-shot RAG→LLaVA accuracy: {acc:.1%}")


0
[0] True=False → Pred=0
1
[1] True=True → Pred=1
0
[2] True=False → Pred=0
1
[3] True=True → Pred=1
0
[4] True=True → Pred=0
1
[5] True=False → Pred=1
1
[6] True=True → Pred=1
1
[7] True=True → Pred=1
1
[8] True=True → Pred=1
0
[9] True=False → Pred=0
1
[10] True=True → Pred=1
1
[11] True=False → Pred=1
1
[12] True=True → Pred=1
1
[13] True=True → Pred=1
1
[14] True=False → Pred=1
1
[15] True=True → Pred=1
1
[16] True=False → Pred=1
1
[17] True=True → Pred=1
1
[18] True=True → Pred=1
0
[19] True=True → Pred=0
1
[20] True=True → Pred=1
0
[21] True=False → Pred=0
0
[22] True=False → Pred=0
0
[23] True=False → Pred=0
0
[24] True=False → Pred=0
1
[25] True=True → Pred=1
1
[26] True=True → Pred=1
1
[27] True=False → Pred=1
1
[28] True=True → Pred=1
1
[29] True=True → Pred=1
0
[30] True=True → Pred=0
1
[31] True=True → Pred=1
1
[32] True=True → Pred=1
1
[33] True=True → Pred=1
1
[34] True=True → Pred=1
1
[35] True=True → Pred=1
1
[36] True=True → Pred=1
1
[37] True=False → Pred=1
1
[38] Tr