In [None]:
!pip install -q bitsandbytes accelerate
!pip list | grep transformers
!pip list | grep bitsandbytes
!pip list | grep accelerate

In [None]:
# Read the Hugging Face access token from Colab's secret store instead of
# hardcoding it in the notebook.
# NOTE(review): HF_TOKEN is not referenced by any other cell visible here.
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")

In [None]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch as t
from PIL import Image
import requests

In [None]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if t.cuda.is_available():
    device = t.device("cuda")
else:
    device = t.device("cpu")
print(f'Device: {device}')

In [None]:
# Quantization settings passed to `from_pretrained`: parameters are stored as
# 4-bit numbers, with float16 as the bitsandbytes compute dtype.
quantization_config = BitsAndBytesConfig(bnb_4bit_compute_dtype=t.float16, load_in_4bit=True)

In [None]:
model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"

# Load the LLaVA-NeXT checkpoint with the 4-bit quantization settings defined above.
# An explicit `model.to(device)` is deliberately left out: BitsAndBytes automatically
# puts the model on the device and casts it to the correct dtype.
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    torch_dtype=t.float16,
)
vocab_size = model.vocab_size

# The processor comes from the same checkpoint; its tokenizer is kept under its
# own name because later cells call it directly.
processor = LlavaNextProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer

In [None]:
# Prepare the image and the text prompt, using the appropriate prompt template.
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# Use a timeout and an explicit status check so that a hung connection or an HTTP
# error page fails here, instead of PIL choking on a non-image body later.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
# Text-only tokenization of the prompt (kept for inspection in a later cell).
prompt_tokenized = tokenizer.encode(prompt)
len_prompt_tokenized = len(prompt_tokenized)
if isinstance(prompt, str):
    batch_size = 1
else:
    batch_size = len(prompt)

# Pass text/images by keyword: the positional order of these two arguments is not
# the same across transformers releases.
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
# Number of token ids actually fed to the model. This can be larger than
# len_prompt_tokenized when the processor expands "<image>" into several
# placeholder tokens, so it is the length to use when cutting the prompt off
# the generated sequence.
len_input_ids = inputs["input_ids"].shape[-1]

target = "The image appears to be a graphical representation of a machine learning model's performance on various tasks."

target_tokenized = t.tensor(tokenizer.encode(target)).to(device)
len_target_tokenized = len(target_tokenized)

# Alternative: a fake target of random tokens, with '1' (BOS) as the first token
# target_tokenized = t.cat((t.tensor([1]), t.randint(1, vocab_size, (len_target_tokenized-1, )))).to(device)

# Autoregressively complete the prompt.
# 1. scores vs. logits: per the transformers generation docs, `logits` are the raw
#    language-model head outputs and `scores` are those outputs after the logits
#    processors have been applied. With greedy decoding and no active processors
#    the two coincide, which is consistent with the earlier observation that
#    t.allclose(output.scores[12], output.logits[12]) was True.
#    https://github.com/huggingface/transformers/issues/14498#issuecomment-2110456886
#
# 2. The '-1' is because target_tokenized includes a BOS token, which is not part
#    of the ASSISTANT's response selected by the slice further down.
output = model.generate(
    **inputs,
    max_new_tokens=len_target_tokenized-1,
    return_dict_in_generate=True,
    output_scores=False,
    output_logits=True,
    output_hidden_states=False,
    output_attentions=False,
    do_sample=False, # False means greedy decoding, which keeps the run deterministic
    use_cache=True # cache previous key and value vectors
    )

response_tokenized = output.sequences
logits = t.stack(output.logits, dim=1) # [batch_size, seq_len, d_vocab]
if batch_size == 1:
    response_tokenized = response_tokenized.squeeze(0) # [seq_len]
    logits = logits.squeeze(0) # [seq_len, d_vocab]
else:
    raise NotImplementedError('Batch size > 1 not implemented yet')

# Drop the prompt part of the returned sequence, keeping only the newly generated
# tokens. The cut uses the real model input length, not the text-only token count.
response_tokenized = response_tokenized[len_input_ids:]
response = tokenizer.decode(response_tokenized, skip_special_tokens=False)
print(response)

In [None]:
# Dump the prompt, target and response token ids, one per line, for side-by-side inspection.
print(prompt_tokenized, target_tokenized, response_tokenized, sep="\n")

In [None]:
loss_fn = t.nn.CrossEntropyLoss()

# target_tokenized[0] is the BOS token, which is never generated, so the i-th
# generated position is scored against target token i + 1.
target_ids = target_tokenized[1:]

# Generation can stop before max_new_tokens (e.g. on EOS), leaving fewer logit
# rows than target tokens; CrossEntropyLoss would then fail on the size mismatch.
# Score only the positions present in both and say so explicitly.
num_scored = min(logits.shape[0], target_ids.shape[0])
if num_scored < target_ids.shape[0]:
    print(f'Only {num_scored} of {target_ids.shape[0]} target tokens were generated; loss covers the common prefix')

# The model runs in float16; upcast the logits so the loss is computed in float32.
loss = loss_fn(logits[:num_scored].float(), target_ids[:num_scored])
print(loss)