## Multimodal

In [None]:
# Install the third-party packages for this notebook (IPython `!` shell escape).
# NOTE(review): `datasets` and `peft` are not used by the cells visible here —
# presumably needed elsewhere in the notebook; confirm before trimming.
!pip install transformers torch bitsandbytes accelerate datasets peft

In [None]:
def _quantized_bit_width(q_config):
    """Return the bit width declared by a quantization config, or 0 if unknown.

    The config may be an object (attribute access) or a plain ``dict`` (key
    access). ``load_in_4bit`` / ``load_in_8bit`` flags are checked first, then
    a positive integer ``bits`` field.
    """
    if isinstance(q_config, dict):
        lookup = q_config.get
    else:
        def lookup(name):
            return getattr(q_config, name, None)

    if lookup("load_in_4bit"):
        return 4
    if lookup("load_in_8bit"):
        return 8

    bits = lookup("bits")
    # Exclude bools explicitly: ``True`` is an ``int`` in Python.
    if isinstance(bits, int) and not isinstance(bits, bool) and bits > 0:
        return bits
    return 0


def show_model_size(model):
    """Print a short size report for ``model``.

    The report contains the bit width, the parameter count, the footprint the
    parameters would have at 4 bytes each, the footprint reported by
    ``model.get_memory_footprint()`` and the device of the first parameter.

    Args:
        model: Object exposing ``config``, ``get_memory_footprint()``,
            ``num_parameters()`` and ``parameters()``.

    Returns:
        None. The report is written to standard output.

    The bit width comes from ``config.quantization_config`` when that is set;
    otherwise from ``config.torch_dtype.itemsize`` (bytes per element). It is
    printed as 0 when neither source yields a width.
    """
    config = model.config
    q_config = getattr(config, "quantization_config", None)

    if q_config is not None:
        q = _quantized_bit_width(q_config)
    else:
        dtype = getattr(config, "torch_dtype", None)
        # A missing dtype, or one without `itemsize`, leaves the width at 0
        # instead of raising.
        q = getattr(dtype, "itemsize", 0) * 8

    # Query the parameter count once; it is used on two report lines.
    n_params = model.num_parameters()
    gbs = model.get_memory_footprint() / 1e9

    print(f"----- {q}-bit Model -----")
    print(f"Number of parameters: {n_params:,}")
    print(f"Memory footprint if FP32: {n_params*4/1e9:.2f} GB")
    print(f"Memory footprint: {gbs:.2f} GB")
    print(f"Model device: {next(model.parameters()).device}")

### Load model as 4-bit

In [None]:
from transformers import AutoProcessor, AutoModelForPreTraining, BitsAndBytesConfig
import torch

# Checkpoint identifier shared by the processor and the model loader.
MODEL = "llava-hf/llava-1.5-7b-hf"

# The processor is independent of the quantization settings, so load it first.
processor = AutoProcessor.from_pretrained(MODEL)

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Keyword options for loading the model, gathered in one place.
_load_options = {
    "quantization_config": bnb_config,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForPreTraining.from_pretrained(MODEL, **_load_options)

In [None]:
# Print the bit width, parameter count, memory footprint and device of the loaded model.
show_model_size(model)

### Load images

In [None]:
from transformers.image_utils import load_image

# Local paths of the two example images, loaded in the order they are used below.
_first_image_path = "/content/content/bird+walking+on+grass.jpeg"
_second_image_path = "/content/content/cat+sitting+on+table.jpeg"

image1 = load_image(_first_image_path)
image2 = load_image(_second_image_path)

In [None]:
# Bare expression: the notebook renders the first image as this cell's output.
image1

In [None]:
# Bare expression: the notebook renders the second image as this cell's output.
image2

#### Generate Captions

In [None]:
def _describe_image_turn():
    """Build a fresh user turn: one image placeholder followed by the instruction text."""
    return {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }


# Assistant reply that sits between the two user turns as a worked example.
_example_answer = {
    "role": "assistant",
    "content": [
        {"type": "text", "text": "A small brown bird standing on top of a lush green field of grass."},
    ],
}

# Conversation: user turn, example assistant answer, then a second user turn
# left open for the model to complete.
messages = [_describe_image_turn(), _example_answer, _describe_image_turn()]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1, image2], return_tensors="pt")

# Move every processor output onto the device of the model's first parameter.
model_device = next(model.parameters()).device
inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

In [None]:
# Show the text produced by `processor.apply_chat_template` for inspection.
print(prompt)

In [None]:
# Sample up to 100 new tokens at temperature 0.7, using the tokenizer's EOS id
# as the end-of-sequence token.
generate_ids = model.generate(
    **inputs,
    max_new_tokens=100,
    eos_token_id=processor.tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.7,
)

# Decode the generated ids to text, dropping special tokens, and print the list.
generated_texts = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
)
print(generated_texts)