In [1]:
from transformers import FuyuProcessor, FuyuForCausalLM, BitsAndBytesConfig
from PIL import Image

import requests
import torch

# load model and processor
CACHE_DIR = "./cache"
model_id = "adept/fuyu-8b"

# The current model is not compatible with 4-bit/8-bit quantization.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=torch.float16,
#     llm_int8_enable_fp32_cpu_offload=True
# )

processor = FuyuProcessor.from_pretrained(model_id, cache_dir=CACHE_DIR)
model = FuyuForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR, device_map="cuda:0", torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# prepare inputs for the model
text_prompt = "Generate a coco-style caption.\n"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
generation_output = model.generate(**inputs, max_new_tokens=7)
generation_text = processor.batch_decode(generation_output[:, -7:], skip_special_tokens=True)

print(generation_text)
assert generation_text == ['A blue bus parked on the side of a road.']

Setting `pad_token_id` to `eos_token_id`:71013 for open-end generation.


['A blue bus parked on the side of a road.']


In [10]:
text_prompt = "What color is the bus?\n"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
generation_output = model.generate(**inputs, max_new_tokens=6)
generation_text = processor.batch_decode(generation_output[:, -6:], skip_special_tokens=True)

print(generation_text)
assert generation_text == ["The bus is blue.\n"]

Setting `pad_token_id` to `eos_token_id`:71013 for open-end generation.


['The bus is blue.\n']


In [None]:
text_prompt = "What is the highest life expectancy at birth of male?\n"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"
image = Image.open(requests.get(url, stream=True).raw)

model_inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
generation_output = model.generate(**model_inputs, max_new_tokens=16)
generation_text = processor.batch_decode(generation_output[:, -16:], skip_special_tokens=True)

print(generation_text)
assert generation_text == ["The life expectancy at birth of males in 2018 is 80.7.\n"]

Setting `pad_token_id` to `eos_token_id`:71013 for open-end generation.


['The life expectancy at birth for males is 80.2, while the highest life expectancy at']
