In [1]:
#Import Libraries
import requests
from PIL import Image
import gradio as gr
import numpy as np
import glob, os
import pandas as pd
from transformers import AutoProcessor, BlipForConditionalGeneration

# Load the pretrained processor and model. Both are loaded from the same
# checkpoint name, so it is kept in one place.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Note: You can use the Blip2 model, which is a more powerful pre-trained model for image captioning.
# In fact, you can easily incorporate any new pre-trained model that becomes available, as they are
# continuously developed to be more powerful.
# However, please be aware that the Blip2 model requires 10GB of space.
#
# Here is the link to the documentation for Blip2: https://huggingface.co/docs/transformers/main/model_doc/blip-2
#
# To switch to Blip2, replace the loading code above with the lines below.
# (They are kept as real comments rather than a bare string literal so the
# cell does not evaluate/echo them as its last expression.)
#
# from transformers import Blip2Processor, Blip2ForConditionalGeneration #Blip2 models
# # Load the pretrained processor and model
# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
#This function captions one image used as an input to a User Interface
def caption_image(input_image: np.ndarray) -> str:
    """Generate a caption for a single image supplied as a NumPy array.

    Uses the module-level ``processor`` and ``model``.

    Args:
        input_image: Image pixels as a NumPy array, as handed over by the
            user interface's image input.

    Returns:
        The decoded caption text with special tokens removed.
    """
    # Convert numpy array to PIL Image and convert to RGB
    raw_image = Image.fromarray(input_image).convert('RGB')

    # Process the image together with a text prompt that conditions the caption.
    # The converted `raw_image` must be passed here; the previous code referenced
    # an undefined name `image`, which raised NameError on every call.
    text = "the image of"
    inputs = processor(images=raw_image, text=text, return_tensors="pt")

    # Generate a caption for the image (sequence capped at 50 tokens)
    outputs = model.generate(**inputs, max_length=50)

    # Decode the generated tokens to text and store it into `caption`
    caption = processor.decode(outputs[0], skip_special_tokens=True)

    return caption

In [17]:
def launch_gradio_interface():
    """Build the single-image captioning web app and start serving it.

    The app wires an image input to ``caption_image`` and shows the returned
    caption as text.
    """
    interface_options = dict(
        fn=caption_image,
        inputs=gr.Image(),
        outputs="text",
        title="Image Captioning",
        description="This is a simple web app for generating captions for images using a trained model.",
    )
    gr.Interface(**interface_options).launch()

In [18]:
# The following function takes a directory as input and provides captions for all images in that directory
def auto_caption_dir_images(image_dir):
    """Caption every JPG/JPEG/PNG image found under ``image_dir`` (recursively).

    Uses the module-level ``processor`` and ``model``. Two files are written:

    * ``captions.txt`` in the current working directory, one
      ``<filename>: <caption>`` line per image (UTF-8);
    * ``Caption_df.xlsx`` inside ``image_dir``, holding the returned table.

    Args:
        image_dir: Directory to scan. An empty string scans relative to the
            current working directory.

    Returns:
        A pandas DataFrame with the columns ``filename``, ``extension``
        (lower-case, without the dot) and ``caption``; rows are ordered by
        file path.

    Raises:
        NotADirectoryError: If ``image_dir`` is non-empty and is not an
            existing directory.
    """
    image_exts = ["jpg", "jpeg", "png"]

    # Fail fast, before captions.txt is truncated, instead of failing only
    # when the Excel file is written at the very end.
    if image_dir and not os.path.isdir(image_dir):
        raise NotADirectoryError(f"Image directory not found: {image_dir!r}")

    # Scan once and compare suffixes case-insensitively. Globbing separate
    # lower-/upper-case patterns returned every file twice on case-insensitive
    # filesystems and skipped mixed-case suffixes such as ".Jpg".
    candidates = glob.glob(os.path.join(image_dir, "**/*"), recursive=True)
    image_paths = sorted(
        path
        for path in candidates
        if os.path.isfile(path)
        and os.path.splitext(path)[1][1:].lower() in image_exts
    )

    data = []
    # Explicit UTF-8 so file names with non-ASCII characters can be written
    # regardless of the platform's default encoding.
    with open("captions.txt", "w", encoding="utf-8") as caption_file:
        for img_path in image_paths:
            print("Processing:", img_path)

            filename = os.path.basename(img_path)
            ext = os.path.splitext(filename)[1][1:].lower()

            # The context manager closes the underlying file handle; convert()
            # returns an independent in-memory RGB copy that outlives it.
            with Image.open(img_path) as img_file:
                raw_image = img_file.convert("RGB")

            inputs = processor(raw_image, return_tensors="pt")
            out = model.generate(**inputs, max_new_tokens=50)
            caption = processor.decode(out[0], skip_special_tokens=True)

            caption_file.write(f"{filename}: {caption}\n")
            data.append({"filename": filename, "extension": ext, "caption": caption})

    print(f"Caption text file has been generated for {len(data)} images")

    output_path = os.path.join(image_dir, "Caption_df.xlsx")
    # Explicit columns keep the schema stable even when no image was found.
    df = pd.DataFrame(data, columns=["filename", "extension", "caption"])
    df.to_excel(output_path, index=False)

    return df

In [19]:
# Launch the Gradio interface defined above: upload an image in the web UI
# and the caption produced by `caption_image` is shown as text.
launch_gradio_interface()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


In [20]:
# The following is for obtaining captions for all the images in a user-input directory.
# Surrounding whitespace and wrapping quotes (common when a path is pasted)
# are removed, since they would otherwise make the path match no files.
input_dir = input("Enter the image - directory: ").strip().strip("\"'")
df = auto_caption_dir_images(input_dir)

KeyboardInterrupt: Interrupted by user