
# Capstone Project: Multimodal AI Application

This notebook uses BLIP to generate captions for images and CLIP to score how well text prompts match an image. These techniques are useful for accessibility, AR/VR, and semantic search.


In [None]:

import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Download the demo image.
# - timeout: keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status(): surfaces HTTP errors (404, 5xx) directly instead of as
#   a confusing PIL "cannot identify image file" failure further down.
# - with-block: releases the streamed connection once the image is decoded.
image_url = "https://huggingface.co/datasets/nateraw/image-captioning-images/resolve/main/beach.png"
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert("RGB") fully decodes the image while the stream is still open and
    # normalizes PNG modes such as RGBA or palette to three channels, mirroring
    # the BLIP model-card usage example.
    image = Image.open(response.raw).convert("RGB")

# Load the BLIP captioning checkpoint together with its matching preprocessor.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Preprocess the image into PyTorch tensors, generate caption token ids, and
# decode the first (only) sequence back to text without special tokens.
inputs = processor(image, return_tensors="pt")
output = model.generate(**inputs)
caption = processor.decode(output[0], skip_special_tokens=True)

print("Caption:", caption)



## CLIP Matching (Bonus)

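Use CLIP to match text to images or vice versa. The cell below scores several candidate text prompts against a single image and converts the scores into probabilities over the prompts.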


In [None]:

from transformers import CLIPProcessor, CLIPModel

# Load the CLIP checkpoint and its matching preprocessor.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download the image to score. NOTE: this rebinds `image`, replacing the image
# loaded in the captioning cell above.
# - timeout: keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status(): surfaces HTTP errors directly instead of as a PIL
#   decoding failure.
# - with-block: releases the streamed connection once the image is decoded.
clip_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/image_classification.png"
with requests.get(clip_image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert("RGB") decodes the image while the stream is still open and
    # normalizes RGBA/palette PNGs to three channels.
    image = Image.open(response.raw).convert("RGB")

# Candidate text prompts to compare against the image.
texts = ["a diagram", "a cat", "a man", "a chart of transformers"]

# padding=True pads the tokenized prompts to a common length so they can be
# batched into one tensor.
inputs = clip_processor(text=texts, images=image, return_tensors="pt", padding=True)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = clip_model(**inputs)

# NOTE: despite its name, `logits_per_image` holds softmax *probabilities*
# over the prompts (dim=1), not the raw logits; the name is kept so any later
# cell that reads it keeps working.
logits_per_image = outputs.logits_per_image.softmax(dim=1)

# Row 0 corresponds to the single input image; pair each prompt with its score.
print("Matching scores:", dict(zip(texts, logits_per_image[0].tolist())))



## Project Ideas

- Caption your dataset of real-estate/property images
- Build a visual QA system with BLIP-2
- Deploy with Streamlit or Gradio

Have fun combining vision and language models!
