[From RoboSpatial](https://github.com/chanhee-luke/RoboSpatial-Eval/blob/master/models.py)

In [None]:
from PIL import Image
from transformers import GenerationConfig

def generate_answer(image_path, question, model, processor, **kwargs):
    """Generate a text answer to ``question`` about the image at ``image_path``.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        question: Prompt text handed to ``processor.process``.
        model: Object exposing ``device`` and ``generate_from_batch``.
        processor: Object exposing ``process`` and a ``tokenizer``.
        **kwargs: Extra generation settings. Each one is set as an attribute
            on the ``GenerationConfig``, overriding the defaults
            (``max_new_tokens=200``, ``stop_strings="<|endoftext|>"``).

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off and special tokens are skipped).
    """
    # Image.open is lazy and keeps the file open until the pixel data is
    # read, so hold it open while the processor consumes the image and close
    # it deterministically afterwards instead of relying on garbage
    # collection.
    # NOTE(review): assumes processor.process fully reads the image before
    # returning — confirm for the processor in use.
    with Image.open(image_path) as image:
        inputs = processor.process(
            images=[image],
            text=question
        )

    # Move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Create a GenerationConfig and update it with any additional kwargs.
    # setattr (rather than constructor arguments) lets callers override the
    # defaults above as well as add new settings.
    generation_config = GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>")
    for key, value in kwargs.items():
        setattr(generation_config, key, value)

    # Generate output; the tokenizer is passed along with the config that
    # carries the stop_strings setting.
    output = model.generate_from_batch(
        inputs,
        generation_config,
        tokenizer=processor.tokenizer
    )

    # The output row starts with the prompt tokens; skip past them so only
    # the generated continuation is decoded to text.
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text

# NOTE(review): the statements below use `return` and read `kwargs`,
# `image_path` and `question`, none of which are defined at this level in the
# visible code. Presumably this is the body of a wrapper function from the
# linked source whose `def` line was not copied — confirm and restore the
# enclosing function before running this cell.

# The model and processor are looked up by key in the `kwargs` mapping.
model = kwargs["model"]
processor = kwargs["processor"]
# Remove model and processor from kwargs to avoid conflicts
# (they are passed positionally below, so they must not also appear as
# keyword arguments).
generation_kwargs = {k: v for k, v in kwargs.items() if k not in ["model", "processor"]}
# Every remaining entry is forwarded to generate_answer as a keyword argument.
generated_text = generate_answer(image_path, question, model, processor, **generation_kwargs)
return generated_text