In [1]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

In [2]:
# Load model, feature extractor, and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define directory to save the model
save_directory = "./saved_model"

# Save the model
model.save_pretrained(save_directory)

# Save the feature extractor
feature_extractor.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)




('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [None]:
# # Load the model
# model = VisionEncoderDecoderModel.from_pretrained(save_directory)
# feature_extractor = ViTFeatureExtractor.from_pretrained(save_directory)
# tokenizer = AutoTokenizer.from_pretrained(save_directory)


In [3]:
# Define generation parameters

max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [4]:
# def load_image_paths(folder):
#     image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif")
#     return [os.path.join(folder, f) for f in os.listdir(folder) if f.lower().endswith(image_extensions)]

# def save_captions(output_file, captions):
#     with open(output_file, "w") as f:
#         for image_path, caption in captions:
#             f.write(f"{image_path}\t{caption}\n")

# class ImageCaptioningModel:
#     def __init__(self):
#         self.model = model
#         self.feature_extractor = feature_extractor
#         self.tokenizer = tokenizer
#         self.device = device
#         self.gen_kwargs = gen_kwargs

#     def predict_step(self, image_paths):
#         images = []
#         for image_path in image_paths:
#             try:
#                 i_image = Image.open(image_path)
#                 if i_image.mode != "RGB":
#                     i_image = i_image.convert(mode="RGB")
#                 images.append(i_image)
#             except Exception as e:
#                 print(f"Error loading image {image_path}: {e}")
#                 continue

#         if not images:
#             return []

#         pixel_values = self.feature_extractor(images=images, return_tensors="pt").pixel_values
#         pixel_values = pixel_values.to(self.device)

#         output_ids = self.model.generate(pixel_values, **self.gen_kwargs)

#         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
#         preds = [pred.strip() for pred in preds]
#         return preds


In [6]:
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds

In [7]:
predict_step(['headshot_Ishak.jpg'])

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['a man with a beard wearing a green jacket']

In [14]:
import os

In [15]:
# Define paths and parameters
image_folder = "picture"  # Update this path
output_file = "captions.txt"
batch_size = 4

# Initialize the model
caption_model = ImageCaptioningModel()

# Load image paths
image_paths = load_image_paths(image_folder)

# Generate captions
all_captions = []
for i in range(0, len(image_paths), batch_size):
    batch_paths = image_paths[i:i + batch_size]
    captions = caption_model.predict_step(batch_paths)
    all_captions.extend(zip(batch_paths, captions))

# Save captions to a file
save_captions(output_file, all_captions)


In [16]:
import pandas as pd

# Load and display some of the results
captions_df = pd.DataFrame(all_captions, columns=["Image Path", "Caption"])
captions_df.head()


Unnamed: 0,Image Path,Caption
0,picture/hello3.jpg,a black and white dog and a black and white dog
1,picture/headshot_Ishak.jpg,a man with a beard wearing a green jacket
