In [None]:
# Mount Google Drive at /content/drive so later cells can read files
# stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the huggingface_hub package into the notebook environment, then
# start its notebook login flow.
# NOTE(review): presumably this prompts for a Hugging Face access token —
# confirm it is actually required for the public checkpoint used below.
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import os

# One checkpoint supplies the captioning model, its image processor and its
# tokenizer, so the identifier is spelled out once and reused.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Keyword arguments forwarded to model.generate() for every batch
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

# Define function to predict captions for multiple images
def predict_batch(image_paths, feature_extractor, model, tokenizer, device, gen_kwargs, batch_size=8):
    """Generate one caption per image, processing the images in batches.

    Args:
        image_paths: Sequence of image file paths; must support ``len()`` and
            slicing.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=..., return_tensors="pt")`` whose
            result exposes ``.pixel_values``.
        model: Object providing ``generate(pixel_values, **gen_kwargs)``.
        tokenizer: Object providing
            ``batch_decode(output_ids, skip_special_tokens=True)``.
        device: Device on which inference runs. The model and each batch of
            pixel values are moved to it. ``None`` leaves both where they
            already are.
        gen_kwargs: Keyword arguments forwarded to ``model.generate``.
        batch_size: Maximum number of images per ``generate`` call.

    Returns:
        A list of whitespace-stripped caption strings, in the same order as
        ``image_paths``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no
    # captions; zero would raise an unhelpful range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if len(image_paths) == 0:
        return []

    # Keep the model and its inputs on the same device.
    if device is not None:
        model.to(device)

    captions = []
    for start in range(0, len(image_paths), batch_size):
        images = []
        for image_path in image_paths[start:start + batch_size]:
            # convert() returns a fully loaded copy, so the underlying file
            # can be closed as soon as the with-block exits.
            with Image.open(image_path) as raw_image:
                images.append(raw_image.convert(mode="RGB"))

        # Convert images to pixel values
        pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
        if device is not None:
            pixel_values = pixel_values.to(device)

        # Generate caption token ids, then decode them to text
        output_ids = model.generate(pixel_values, **gen_kwargs)
        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        captions.extend(pred.strip() for pred in preds)
    return captions

# Set the directory containing all the images
image_dir = '/content/drive/MyDrive/Intern/frames'

# Get a list of all image paths in the directory.
# os.listdir() returns entries in arbitrary order, so sort for a reproducible
# sequence; match extensions case-insensitively so files such as "X.JPG" are
# not skipped.
image_paths = sorted(
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Define the Torch device
device = torch.device("cpu")

# Predict captions for all images
predicted_captions = predict_batch(image_paths, feature_extractor, model, tokenizer, device, gen_kwargs)

# predict_batch() already returns decoded, stripped text. Passing those
# strings through tokenizer.batch_decode() again is an error because it
# expects token ids, so the name is kept only as an alias.
decoded_captions = predicted_captions

# Print the predicted captions
for image_path, caption in zip(image_paths, decoded_captions):
    print(f"Image: {image_path}, Caption: {caption}")
