In [None]:
from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
import requests
import torch
import numpy as np
from PIL import Image
import pickle
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the pretrained image-captioning checkpoint. `model_raw` is the default
# `model` argument of `show_n_generate` defined further down.
model_raw = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [None]:
# Load the image processor and the tokenizer from the same checkpoint name as
# `model_raw`; `show_n_generate` uses them to build the model input and to
# decode the generated token ids.
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer       = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [None]:
# Relies on `image_processor`, `tokenizer`, and `model_raw` created in the cells above.

def show_n_generate(source, greedy=True, model=model_raw, max_new_tokens=30, top_k=5, timeout=30):
    """Display an image and print a caption generated for it.

    Args:
        source: Image location. Strings starting with ``http://`` or
            ``https://`` are downloaded; anything else is opened as a local
            file. Path-like objects are accepted and converted with ``str``.
        greedy: If True, call ``model.generate`` without requesting sampling
            (the decoding strategy is then whatever the model's generation
            configuration specifies). If False, sample with ``do_sample=True``
            restricted to the ``top_k`` most likely tokens.
        model: Captioning model exposing ``generate`` and ``device``.
            Defaults to ``model_raw`` (bound when this function is defined).
        max_new_tokens: Upper bound on the number of generated tokens.
        top_k: Number of candidate tokens kept when ``greedy`` is False.
        timeout: Seconds to wait for the HTTP server before giving up
            (only used for URL sources).

    Returns:
        None. The image is shown with matplotlib and the caption is printed.

    Raises:
        requests.HTTPError: If a URL source answers with an error status.
    """
    source = str(source)

    if source.startswith(('http://', 'https://')):
        # The context manager releases the connection; the timeout prevents the
        # cell from hanging forever on an unresponsive host.
        with requests.get(source, stream=True, timeout=timeout) as response:
            # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
            response.raise_for_status()
            # Make the raw stream undo gzip/deflate transfer encoding.
            response.raw.decode_content = True
            # convert() reads all pixel data, so it must run before the
            # response is closed.
            image = Image.open(response.raw).convert("RGB")
    else:
        # Close the file handle as soon as the pixel data has been read.
        with Image.open(source) as local_image:
            image = local_image.convert("RGB")

    # Display the image
    plt.imshow(image)
    plt.show()

    # Process image to get pixel values and place them on the model's device
    # so generation also works when the model has been moved to a GPU.
    pixel_values = image_processor(image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(model.device)

    # Text generation with the model
    generate_kwargs = {"max_new_tokens": max_new_tokens}
    if not greedy:
        generate_kwargs.update(do_sample=True, top_k=top_k)
    generated_ids = model.generate(pixel_values, **generate_kwargs)

    # Decode generated text (a single image was passed, so take the first entry)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

# Example usage with a local file path.
# NOTE(review): absolute path under /kaggle/input — presumably only resolves where
# the "natural-images" dataset is mounted (e.g. a Kaggle kernel); adjust it when
# running elsewhere.
local_path = "/kaggle/input/natural-images/natural_images/dog/dog_0007.jpg"
# Called with the defaults: greedy=True and model=model_raw.
show_n_generate(local_path)
