In [2]:
import os
import warnings

import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast

# Minimal local image loader; URL inputs are not supported.
def load_image(image_path):
    """Load an image from the local filesystem as an RGB PIL image.

    Args:
        image_path: Path to an image file on disk.

    Returns:
        A fully loaded ``PIL.Image.Image`` in RGB mode.

    Raises:
        ValueError: If ``image_path`` does not point to an existing regular
            file (missing paths and directories are both rejected).
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise fail later inside Image.open with an unrelated error type.
    if not os.path.isfile(image_path):
        raise ValueError(f"Invalid image path: {image_path}")

    # Image.open is lazy and keeps the underlying file open. convert() forces
    # the pixel data to be read and returns a new image, so the file handle
    # can be released when the with-block exits. Converting to RGB also
    # normalizes grayscale / RGBA / palette images to three channels.
    with Image.open(image_path) as img:
        return img.convert("RGB")

class ImageCaptioningLitLocal:
    """Image captioning wrapper around a locally stored vision-encoder-decoder model.

    Call :meth:`setup` once to load the model, tokenizer and image processor
    from a local directory, then call :meth:`predict` for each image.
    """

    def __init__(self):
        # Populated by setup(); kept as None so predict() can detect misuse.
        self.device = None
        self.model = None
        self.tokenizer = None
        self.image_processor = None

    @staticmethod
    def _resolve_device(accelerator):
        """Map an accelerator request ("auto", "cuda", "cpu") to a torch device string."""
        cuda_available = torch.cuda.is_available()
        if accelerator in ("cuda", "auto") and cuda_available:
            return "cuda"
        if accelerator == "cuda":
            # Same fallback as before, but no longer silent.
            warnings.warn(
                "accelerator='cuda' was requested but CUDA is not available; "
                "falling back to CPU.",
                RuntimeWarning,
                stacklevel=3,
            )
        elif accelerator not in ("auto", "cpu"):
            warnings.warn(
                f"Unrecognized accelerator {accelerator!r}; falling back to CPU.",
                RuntimeWarning,
                stacklevel=3,
            )
        return "cpu"

    def setup(self, model_dir, accelerator='auto'):
        """Select the compute device and load all model components.

        Args:
            model_dir: Local directory passed to ``from_pretrained`` for the
                model, the tokenizer and the image processor.
            accelerator: ``"auto"`` (use CUDA when available), ``"cuda"`` or
                ``"cpu"``. Any request that cannot be satisfied falls back to
                CPU with a warning.
        """
        self.device = self._resolve_device(accelerator)
        print(f"Using device: {self.device}")

        # Load the local model together with its matching tokenizer and processor.
        self.model = VisionEncoderDecoderModel.from_pretrained(model_dir).to(self.device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_dir)
        self.image_processor = ViTImageProcessor.from_pretrained(model_dir)

    def predict(self, image_path, **generate_kwargs):
        """Generate a caption for the image at ``image_path``.

        Args:
            image_path: Path to a local image file.
            **generate_kwargs: Optional keyword arguments forwarded unchanged
                to ``self.model.generate``. With none given, generation uses
                the model's own defaults, as before.

        Returns:
            The decoded caption with special tokens removed and surrounding
            whitespace stripped.

        Raises:
            RuntimeError: If :meth:`setup` has not been called yet.
            ValueError: Propagated from ``load_image`` for an invalid path.
        """
        # getattr keeps this check safe even for instances created without __init__.
        if getattr(self, "model", None) is None:
            raise RuntimeError("setup() must be called before predict()")

        image = load_image(image_path)
        # Preprocess the image into a tensor and move it to the model's device.
        pixel_values = self.image_processor(image, return_tensors="pt").pixel_values.to(self.device)

        # Generate token ids and decode the first (only) sequence.
        outputs = self.model.generate(pixel_values, **generate_kwargs)
        caption = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Decoded text can carry stray leading/trailing whitespace.
        return caption.strip()

# Instantiate the captioner and load the local checkpoint.
# The original absolute paths remain the defaults; set CAPTION_MODEL_DIR and/or
# CAPTION_IMAGE_DIR to run this notebook on another machine without editing it.
model_path = os.environ.get(
    "CAPTION_MODEL_DIR",
    r"F:\Workplace\Image-captioning-ViT-main\models\vit-gpt2-image-captioning",
)
image_dir = os.environ.get(
    "CAPTION_IMAGE_DIR",
    r"F:\Workplace\Image-captioning-ViT-main\image",
)

# Fail early with a direct message rather than a less obvious loader error.
if not os.path.isdir(model_path):
    raise FileNotFoundError(f"Model directory not found: {model_path}")

api = ImageCaptioningLitLocal()
api.setup(model_dir=model_path, accelerator="cuda")

VisionEncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Using device: cpu




In [3]:
# Example 1: caption the first sample image in the image directory.
image_path_1 = os.path.join(image_dir, "img1.jpg")
caption_1 = api.predict(image_path_1)
print("Caption 1:", caption_1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Caption 1: a black horse running through a grassy field 


In [4]:
# Example 2: caption the second sample image in the image directory.
image_path_2 = os.path.join(image_dir, "img2.jpg")
caption_2 = api.predict(image_path_2)
print("Caption 2:", caption_2)

Caption 2: a man standing on top of a hill with a mountain 
