In [6]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModelForCausalLM

In [None]:
import json

In [15]:
# Colab-only: open the browser upload widget (used here for the captions JSON).
# The result is kept in `uploaded`, which a later cell reads to find the file.
from google.colab import files
uploaded = files.upload()

Saving captions.json to captions (1).json


In [8]:
# image loading and preprocessing
def load_and_preprocess_image(image_path):
    """Open an image and convert it into CLIP processor inputs.

    Args:
        image_path: Passed straight to ``Image.open``.

    Returns:
        A ``(inputs, processor)`` tuple: the processor output for the RGB
        image (requested with ``return_tensors="pt"``) and the
        ``CLIPProcessor`` that produced it, so the caller can reuse it.
    """
    # Image.open is lazy and may leave the underlying file open (for example
    # for multi-frame images, or if decoding raises). convert() returns a new
    # in-memory image, so the source can be closed as soon as it has run.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    inputs = processor(images=image, return_tensors="pt")
    return inputs, processor

In [9]:
# image understanding with CLIP
def generate_image_embeddings(inputs):
    """Embed preprocessed image inputs with the pretrained CLIP model.

    Args:
        inputs: Mapping that is unpacked (``**inputs``) into
            ``CLIPModel.get_image_features``.

    Returns:
        An ``(image_features, model)`` tuple: the features computed under
        ``torch.no_grad()`` and the ``CLIPModel`` loaded by this call.
    """
    clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

    # Inference only: no gradients are tracked for this forward pass.
    with torch.no_grad():
        embeddings = clip.get_image_features(**inputs)

    return embeddings, clip

In [10]:
# caption matching (using CLIP text embeddings)
def match_captions(image_features, captions, clip_model, processor):
    """Rank captions by cosine similarity to an image embedding.

    Args:
        image_features: Tensor of image embeddings. Only the first row is
            ranked against the captions; assumed 2-D (one row per image).
        captions: Sequence of caption strings. A single string is treated
            as a one-element list.
        clip_model: Object exposing ``get_text_features(**text_inputs)``.
        processor: Callable that tokenizes ``text=captions`` into the keyword
            inputs expected by ``clip_model.get_text_features``.

    Returns:
        ``(best_captions, similarities)``: the captions ordered from most to
        least similar, and the matching similarity scores as Python floats.
        Both lists are empty when no captions are given.
    """
    # A bare string would otherwise be indexed character by character below.
    if isinstance(captions, str):
        captions = [captions]
    captions = list(captions)
    if not captions:
        return [], []

    # 1. get text embeddings for the captions.
    # truncation=True clips over-long captions to the tokenizer's maximum
    # length instead of letting the text encoder fail on them.
    text_inputs = processor(text=captions, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)

    # 2. calculate cosine similarity between image and text features.
    # Done with torch (L2-normalize, then dot product) so this function does
    # not depend on a name imported in some other cell.
    image_vector = torch.nn.functional.normalize(
        image_features.detach().cpu()[:1].float(), dim=-1
    )
    text_matrix = torch.nn.functional.normalize(
        text_features.detach().cpu().float(), dim=-1
    )
    similarities = (image_vector @ text_matrix.T)[0]

    # 3. find the best matching captions (highest similarity first).
    best_indices = torch.argsort(similarities, descending=True).tolist()
    best_captions = [captions[i] for i in best_indices]

    return best_captions, similarities[best_indices].tolist()

In [11]:
# main function
def image_captioning(image_path, candidate_captions):
    """Rank candidate captions for one image.

    Runs the pipeline stages in order: preprocess the image, embed it with
    CLIP, then score every candidate caption against that embedding.

    Args:
        image_path: Location of the image, forwarded to
            ``load_and_preprocess_image``.
        candidate_captions: Captions to rank, forwarded to ``match_captions``.

    Returns:
        The ``(best_captions, similarities)`` pair produced by
        ``match_captions``.
    """
    clip_inputs, clip_processor = load_and_preprocess_image(image_path)
    embeddings, model = generate_image_embeddings(clip_inputs)
    return match_captions(embeddings, candidate_captions, model, clip_processor)

In [16]:
# Assuming only one file is uploaded
import io
filename = next(iter(uploaded))  # gets the filename
# Parse the bytes returned by files.upload() instead of reopening the file by
# name: Colab can save an upload under a de-duplicated name (the upload output
# above shows "Saving captions.json to captions (1).json"), so a path built
# from the dict key may point at a different, older file.
with io.TextIOWrapper(io.BytesIO(uploaded[filename]), encoding='utf-8') as f:
    data = json.load(f)
    candidate_captions = data["captions"]


In [20]:
# Second upload (the image to caption). This rebinds `uploaded`, so the result
# of the earlier upload cell is no longer reachable under that name.
from google.colab import files
uploaded = files.upload()

Saving 655d3339e4b9af54f18fabd9bca210a5.jpg to 655d3339e4b9af54f18fabd9bca210a5.jpg


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Rank every candidate caption against the uploaded image.
best_captions, similarities = image_captioning("655d3339e4b9af54f18fabd9bca210a5.jpg", candidate_captions)

# Show at most five results; fewer when the candidate list is shorter.
top_n = min(5, len(best_captions))
top_best_captions = best_captions[:top_n]
top_similarities = similarities[:top_n]

# Report the number of captions actually listed rather than a fixed "5".
print(f"Top {top_n} Best Captions:")
for rank, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{rank}. {caption} (Similarity: {similarity:.4f})")


Top 5 Best Captions:
1. POV: i’m your Uber drive (Similarity: 0.3042)
2. omw (Similarity: 0.2726)
3. GPS said it’s my turn (Similarity: 0.2664)
4. they see me rollin (Similarity: 0.2642)
5. drive fast, don’t crash (Similarity: 0.2620)
