In [1]:
!pip install peft bitsandbytes gradio

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11

In [3]:
from PIL import Image
import torch
from transformers import AutoTokenizer, AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
from peft import PeftModel
import gradio as gr
from google.colab import drive
# Mount Google Drive so the files stored under /content/drive become reachable.
drive.mount('/content/drive')

# Path where the fine-tuned model was saved.
OUTPUT_DIR = "/content/drive/MyDrive/EnigmaAI/models/fine_tuned_model"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 1. Loading model components
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

# The tokenizer is used below to decode generated ids; the processor builds the chat
# prompt and turns the image + prompt into model inputs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)

# 4-bit NF4 quantization settings with float16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# 2. Loading the base model
# Device placement is requested at load time through `device_map` instead of a trailing
# `.to(device)`: moving a bitsandbytes-quantized model after loading is redundant and is
# rejected by some transformers/bitsandbytes version combinations.
base_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

# 3. Attach the fine-tuned LoRA adapter stored in OUTPUT_DIR to the base model, merge it
#    into the base model, and switch the result to evaluation mode.
model = (
    PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    .to(device)
    .merge_and_unload()
    .eval()
)

# Conversation template for description.
# The image placeholder and the question are separate content entries. Previously the
# question was a stray "text" key on the image entry, whose type is "image" rather than
# "text", so it was not a text content item of its own.
# NOTE(review): if the adapter was fine-tuned with an image-only prompt, confirm that adding
# the question text still matches the training prompt format.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is in the image?"},
        ],
    }
]
# 4. Function to generate the description
def generate_caption(image):
    """Generate a text description for ``image`` with the merged fine-tuned model.

    Args:
        image: The image passed in by the caller (the Gradio interface in this
            notebook), or ``None`` when nothing was uploaded.

    Returns:
        str: The decoded, newly generated text only (prompt tokens are not
        included), or a short hint when ``image`` is ``None``.
    """
    # Without an image the processor call below would fail; return a readable hint instead.
    if image is None:
        return "Please upload an image first."

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # Keyword arguments keep the call independent of the processor's positional order.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eos_token_id)

    # `generate` returns prompt ids followed by the new ids; decode only the new part so the
    # caption does not repeat the prompt text.
    prompt_length = inputs["input_ids"].shape[1]
    caption = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    return caption.strip()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [4]:
# Creating Gradio interface: a single image upload in, plain text out.
_upload_widget = gr.Image(type="numpy", label="Upload Image")

iface = gr.Interface(
    fn=generate_caption,
    inputs=_upload_widget,
    outputs="text",
    title="Scene description",
    description="Upload an image and generate a caption using fine-tuned  model.",
)

# Launch the interface with sharing enabled to get a public URL
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://16000395aef283dc52.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


