<a href="https://colab.research.google.com/github/kaashifatanveer/Image_To_Text_Generator_Prj/blob/main/Image_To_Text_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os

import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

In [17]:
# One checkpoint id feeds the model, the image processor and the tokenizer
checkpoint = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Use CUDA when torch reports it as available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Left as the cell's final expression, so the notebook echoes the module repr
model.to(device)


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [6]:
# Caption generation settings; gen_kwargs is unpacked into model.generate()
max_length = 32  # raised to allow more detailed captions
num_beams = 8    # raised for broader exploration during generation
gen_kwargs = dict(
    max_length=max_length,
    num_beams=num_beams,
    early_stopping=True,
)


In [7]:
def predict_step(image_paths):
    """Generate one caption per image with the notebook's captioning model.

    Relies on the module-level ``feature_extractor``, ``model``, ``tokenizer``,
    ``device`` and ``gen_kwargs`` defined in earlier cells.

    Args:
        image_paths: An iterable of image file paths, or a single path
            (``str``, ``bytes`` or ``os.PathLike``), which is treated as a
            one-element batch.

    Returns:
        list[str]: One decoded caption per input image, with surrounding
        whitespace stripped. An empty input yields ``[]``.
    """
    # A bare path string is itself iterable (character by character), so wrap
    # it rather than trying to open every character as a file.
    if isinstance(image_paths, (str, bytes, os.PathLike)):
        image_paths = [image_paths]
    else:
        # Materialise so generators work and emptiness can be checked.
        image_paths = list(image_paths)

    # Nothing to caption: skip preprocessing and generation entirely.
    if not image_paths:
        return []

    images = []
    for image_path in image_paths:
        # The context manager releases the file handle deterministically.
        with Image.open(image_path) as source_image:
            if source_image.mode != "RGB":
                rgb_image = source_image.convert(mode="RGB")
            else:
                rgb_image = source_image

            # resize() returns a new image, so the 224x224 copy remains
            # usable after the source file has been closed.
            images.append(rgb_image.resize((224, 224)))

    # Extract pixel values and move them to the same device as the model
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Generate caption token ids using the shared generation settings
    output_ids = model.generate(pixel_values, **gen_kwargs)

    # Decode the generated ids to text and trim stray whitespace
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [10]:
# Caption the aeroplane sample image
image_paths = ['/content/Aeroplane in sky.jpeg']
captions = predict_step(image_paths)
print(captions)

['a large jetliner flying through a blue sky']


In [12]:
# Caption the cat-and-dog sample image
image_paths = ['/content/Cat and Dog.jpeg']
captions = predict_step(image_paths)
print(captions)

['a cat and a dog sitting on the floor']


In [13]:
# Caption the baby-in-grass sample image
image_paths = ['/content/A Baby in grass.jpeg']
captions = predict_step(image_paths)
print(captions)

['a baby sitting in a field of tall grass']


In [14]:
# Caption the dog-with-a-ball sample image
image_paths = ['/content/Dog with a Ball.jpeg']
captions = predict_step(image_paths)
print(captions)

['a dog playing with a tennis ball on a tennis court']


In [16]:
# Caption the woman-holding-cellphone sample image
image_paths = ['/content/A woman holding cellphone.jpeg']
captions = predict_step(image_paths)
print(captions)

['a woman holding a cell phone in her hand']
