# Using Gemma 3 (4B) to identify images

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/04/notebooks/vlm-gemma-3-4b.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/04/notebooks/vlm-gemma-3-4b.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

In [4]:
from transformers import pipeline
import torch
import os
import sys

# Make the Hugging Face access token available as the HF_TOKEN environment
# variable before any model files are requested.
if 'google.colab' in sys.modules:
  # Running inside Colab: read the token from the Colab user secrets.
  from google.colab import userdata # type:ignore
  os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
  print("HF token set for Colab")
else:
  # Running outside Colab: load environment variables from a local .env file.
  # python-dotenv is imported here because it is only needed on this branch;
  # without this import the load_dotenv() call raises NameError.
  from dotenv import load_dotenv
  load_dotenv()
  print("Loaded env vars from .env")


# Build the image+text -> text pipeline for the instruction-tuned Gemma 3 4B
# checkpoint. Use the GPU when one is visible to PyTorch; otherwise fall back
# to the CPU so the cell still runs (slowly) on machines without CUDA instead
# of failing at pipeline construction.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device="cuda" if torch.cuda.is_available() else "cpu",
    dtype=torch.bfloat16
)

Replicate API Token set for Colab


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/883 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

The image processor of type `Gemma3ImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [6]:
import gradio as gr

def analyze(image, prompt, max_new_tokens=256, system_prompt="You are a medical expert."):
    """Ask the vision-language pipeline a question about an image.

    Args:
        image: The image to analyse (the UI supplies a PIL image). When it is
            ``None`` a short "please upload" message is returned and the
            model is not called.
        prompt: The user's question. ``None`` or a whitespace-only string is
            replaced by "Describe the image.".
        max_new_tokens: Generation budget; converted with ``int()`` before it
            is handed to the pipeline.
        system_prompt: Text placed in the system turn of the conversation.

    Returns:
        The generated text as a string, or the string form of the raw
        pipeline output when it has an unrecognised shape.
    """
    if image is None:
        return "Please upload an image."

    # Fall back to a generic instruction when no usable prompt was supplied.
    question = prompt if prompt is not None and prompt.strip() else "Describe the image."

    # Chat-style conversation; the image object itself travels inside the
    # user turn, so no separate images= argument is passed to the pipeline.
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_prompt}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question},
        ],
    }

    result = pipe(
        text=[system_turn, user_turn],
        max_new_tokens=int(max_new_tokens),
        return_full_text=False,
    )

    # Tolerate different output shapes: look at the first list entry and try
    # the known text keys in order of preference.
    head = result[0] if isinstance(result, list) and len(result) > 0 else None
    if not isinstance(head, dict):
        return str(result)

    for key in ("generated_text", "text"):
        if key in head:
            return str(head[key])

    return str(head)


# Build the Gradio UI: an image input next to a column of text controls,
# with the model output shown underneath.
with gr.Blocks(title="Medical Expert") as demo:
    gr.Markdown("## Gemma 3 Vision — upload an image + ask a question")

    with gr.Row():
        # type="pil" so analyze() receives a PIL image it can embed in the chat messages.
        img = gr.Image(type="pil", label="Input image")
        with gr.Column():
            prompt = gr.Textbox(lines=4, label="Prompt", value="What is in this image?")
            # Slider range 32-1024 in steps of 32; the default matches analyze()'s default of 256.
            max_new_tokens = gr.Slider(32, 1024, value=256, step=32, label="Max new tokens")
            # Default text matches analyze()'s default system_prompt.
            system_prompt = gr.Textbox(lines=2, label="System prompt", value="You are a medical expert.")
            btn = gr.Button("Run")

    output = gr.Textbox(lines=12, label="Model output")

    # The inputs are passed positionally, in the same order as analyze()'s parameters.
    btn.click(
        fn=analyze,
        inputs=[img, prompt, max_new_tokens, system_prompt],
        outputs=output,
    )

# share=True requests a public share link in addition to the local server.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://920f1348657f2f69fc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


