# Breast ultrasound classification (Clear vs. Lump): start of the project
---

## Dataset setup


In [None]:
!pip install -q "git+https://github.com/huggingface/transformers.git" accelerate bitsandbytes sentencepiece

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
import albumentations as A
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import timm
from torch.amp import GradScaler
from torch.utils.data import Dataset, DataLoader
from glob import glob
from google.colab import drive
from albumentations.pytorch import ToTensorV2
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, accuracy_score, recall_score

In [2]:
# Dataset location and layout. Files are looked up further down as
# <path>/<class>/<split>/*.png, and a class's position in `classes`
# is used as its integer label.
path = "/content/drive/MyDrive/Project_Robt_310/Dataset"
classes = ["Clear", "Lump"]
splits = ["Train", "Val", "Test"]

# Mount Google Drive so the dataset folder above is reachable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Per-split lists of image file paths and their integer class labels.
images = {split_name: [] for split_name in splits}
labels = {split_name: [] for split_name in splits}

for split_name in splits:
    for class_idx, class_name in enumerate(classes):
        # Only PNG files are picked up; sorting keeps the order stable.
        found = sorted(glob(f"{path}/{class_name}/{split_name}/*.png"))
        images[split_name].extend(found)
        # One label per file, equal to the class's index in `classes`.
        labels[split_name].extend([class_idx] * len(found))

print(len(images["Train"]), "train images")


274 train images


In [4]:
IMG_SIZE = 224  # side length (pixels) every image is resized to

# Channel statistics passed to A.Normalize; shared by both pipelines so the
# training and evaluation inputs are scaled identically.
NORM_MEAN = (0.485, 0.456, 0.406)
NORM_STD = (0.229, 0.224, 0.225)

# Training transformations: resize + random augmentation + normalisation.
train_tfms = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.HorizontalFlip(p=0.5),

    # Slight affine transformations for augmentation
    A.Affine(
        scale=(0.94, 1.06),
        translate_percent={"x": (-0.02, 0.02), "y": (-0.02, 0.02)},
        rotate=(-6, 6),
        p=0.4,
    ),

    # Speckle-like multiplicative noise and light Gaussian blur
    A.MultiplicativeNoise(multiplier=(0.9, 1.1), per_channel=False, p=0.3),
    A.GaussianBlur(blur_limit=(3, 5), p=0.2),

    A.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ToTensorV2(),
])

# Validation / test transformations: deterministic only.
# Random flips, affine warps, noise and blur are training-time augmentation;
# applying them at evaluation time makes the reported metrics change from run
# to run and evaluates the model on perturbed images. Evaluation therefore
# uses just the resize and the same normalisation as training.
val_tfms = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ToTensorV2(),
])


In [5]:
class Dataset(torch.utils.data.Dataset):
    """Image dataset that reads files from disk and applies a transform.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``: inheriting from the bare name would, on
    a second execution of the cell, inherit from the previous definition of
    this class instead of from the PyTorch base class.

    Args:
        path: sequence of image file paths.
        label: sequence of labels, one per entry of ``path``.
        transform: callable invoked as ``transform(image=<ndarray>)`` that
            returns a mapping with an ``"image"`` entry.

    Raises:
        ValueError: if ``path`` and ``label`` have different lengths.
    """

    def __init__(self, path, label, transform):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of an epoch.
        if len(path) != len(label):
            raise ValueError(
                f"path and label must have the same length, "
                f"got {len(path)} and {len(label)}"
            )
        # store the file paths, labels, and the transformation to apply
        self.path, self.label, self.transform = path, label, transform

    def __len__(self):
        """Return the number of samples (one per file path)."""
        return len(self.path)

    def __getitem__(self, i):
        """Load sample ``i`` and return ``(transformed_image, label)``."""
        # Open the file inside a context manager so its handle is released
        # as soon as the pixels have been converted to an RGB array.
        with Image.open(self.path[i]) as img:
            pixels = np.array(img.convert("RGB"))
        x = self.transform(image=pixels)["image"]  # preprocessing of the image
        y = self.label[i]  # label that belongs to this image
        return x, y

# One dataset per split; Val and Test both use the validation transforms.
train_ds = Dataset(images["Train"], labels["Train"], train_tfms)
val_ds = Dataset(images["Val"], labels["Val"], val_tfms)
test_ds = Dataset(images["Test"], labels["Test"], val_tfms)

# Options common to all three loaders; only the training loader shuffles.
_loader_opts = dict(batch_size=8, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_opts)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_opts)

# Cell output: number of samples in each split.
len(train_ds), len(val_ds), len(test_ds)


(274, 93, 92)

In [None]:
import torch
from transformers import LlavaForConditionalGeneration, AutoProcessor

# Hugging Face Hub id of the checkpoint used for both model and processor.
model_path = "chaoyinshe/llava-med-v1.5-mistral-7b-hf"

# Half-precision dtype: bfloat16 when a CUDA device reports support for it,
# float16 in every other case.
dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# "sdpa" attention is used here; switch to "flash_attention_2" only if that
# package is installed in the environment.
model = LlavaForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=dtype,
    attn_implementation="sdpa",
    device_map="auto",
)

# Processor for the same checkpoint (builds the image + text model inputs).
processor = AutoProcessor.from_pretrained(model_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from PIL import Image

def llava_med_ultrasound_zero_shot(image_path: str, history_text: str = "") -> str:
    """
    Zero-shot inference with LLaVA-Med (chaoyinshe/llava-med-v1.5-mistral-7b-hf)
    for breast ultrasound with TWO classes: Clear vs Lump.

    Uses the notebook-level ``model`` and ``processor`` objects.

    Args:
        image_path: path of the ultrasound image file to analyse.
        history_text: optional free-text patient history; when empty the
            prompt states that no history was provided.

    Returns:
        The text generated by the model only. The prompt is stripped, so the
        format template it contains ("Class: <Clear / Lump>") cannot be
        mistaken for the model's answer by downstream parsing.
    """

    # 1. Load image (the context manager releases the file handle)
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # 2. Build textual part of the prompt
    if history_text:
        hist = f"Patient history: {history_text}\n"
    else:
        hist = "Patient history: not provided.\n"

    task_text = (
        f"{hist}"
        "This is a breast ultrasound image.\n"
        "Task:\n"
        "1. Decide whether the image is best described as one of the following two categories:\n"
        "   - Clear (no visible lump on ultrasound)\n"
        "   - Lump (a suspicious focal lesion / mass is visible)\n"
        "2. Answer in the following format:\n"
        "   Class: <Clear / Lump>\n"
        "   Explanation: <1–3 sentences explaining key visual findings>\n"
        "   Recommendation: <1–2 sentences with cautious follow-up suggestions>\n"
    )

    # 3. Chat-style messages (image placeholder + text)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": task_text},
            ],
        }
    ]

    # 4. Build chat prompt string
    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # 5. Processor builds model inputs (image + text). The inputs are cast to
    #    the model's own dtype rather than to a separate notebook global, so
    #    the two can never get out of sync.
    inputs = processor(
        images=[image],
        text=prompt,
        return_tensors="pt"
    ).to(model.device, model.dtype)

    # 6. Generate. Greedy decoding (do_sample=False) is deterministic;
    #    `temperature` only applies to sampling, so it is not passed here
    #    (passing it alongside do_sample=False has no effect and only
    #    produces a warning).
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )

    # 7. Decode only the newly generated tokens: the returned sequence starts
    #    with the prompt tokens, which are skipped using the prompt length.
    prompt_len = inputs["input_ids"].shape[-1]
    answer = processor.decode(out[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()


In [None]:
import random

# Example: run on (up to) 3 randomly chosen test images.
# A dedicated, seeded generator is used so that restarting the kernel and
# re-running the notebook shows the same images again; change SAMPLE_SEED to
# look at a different selection.
SAMPLE_SEED = 42
sample_paths = random.Random(SAMPLE_SEED).sample(
    images["Test"], k=min(3, len(images["Test"]))
)

for p in sample_paths:
    print("\n==============================")
    print("Image:", p)
    res = llava_med_ultrasound_zero_shot(
        image_path=p,
        history_text="45-year-old female; screening breast ultrasound."
    )
    print(res)
    print("==============================")
