In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_HUB_OFFLINE"] = "1"


In [2]:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

model_path = "liuhaotian/llava-v1.5-7b"

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path)
)


You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from PIL import Image
import torch

def load_image(image_file):
    if image_file.startswith('http') or image_file.startswith('https'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    return image

conv = conv_templates["llava_v1"].copy()

image_path_or_url = "examples/image.png"
prompt_text = "Describe this image."

image_data = load_image(image_path_or_url)
image_tensor = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].half().cuda()
image_size = image_data.size
if type(image_tensor) is list:
    image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
else:
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)

if model.config.mm_use_im_start_end:
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt_text
else:
    inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt_text


conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)


In [4]:
input_ids[0][35]


tensor(-200, device='cuda:0')

In [13]:
with torch.inference_mode():
    outputs = model.generate(
        input_ids,
        images=image_tensor,
        image_sizes=[image_size],
        do_sample=False,
        max_new_tokens=512,
        use_cache=True,
        return_dict_in_generate=True,
        output_attentions=True,
    )

text = tokenizer.decode(outputs["sequences"][0]).strip()
print(text)


The image displays two columns of numbers, likely representing financial data or calculations. The first column, labeled "Company A," shows a total of $1,148,000. The second column, labeled "Company B," shows a total of $11,480,000. The difference between the two columns is quite significant, with Company B having a much larger amount than Company A.</s>


In [42]:
from typing import List, Optional, Tuple, Union

def get_mean_attn_score(output_ids) -> Tuple[torch.Tensor, int, int]:
    r"""
    get the mean attention weights of the prefilling and full attention
    Args:
        output_ids: the output ids of the model
    Returns:
        mean_attn: the mean attention weights of the prefilling and full attention, shape: (L, L)
    """
    output_attn = output_ids.attentions
    pref_len = output_attn[0][0].shape[3]
    full_len = output_attn[-1][0].shape[3]
    prefill_attn = output_attn[0]
    assert prefill_attn[0].shape[0] == 1, "batch size should be 1"
    full_attn = []

    for l, layer in enumerate(prefill_attn):
        layer = layer.cpu().squeeze(0).float()
        layer = torch.nn.functional.pad(layer, (0, full_len - pref_len, 0, full_len - pref_len))
        for i in range(full_len - pref_len):
            cur_attn = output_attn[i + 1][l].cpu().squeeze(0)[:, 0, :].float()
            layer[:, pref_len + i, :pref_len + i + 1] = cur_attn
        full_attn.append(layer)
    mean_attn = torch.stack(full_attn).mean(dim=(0, 1))
    return mean_attn, pref_len, full_len


aw, pref_len, full_len = get_mean_attn_score(outputs)


In [43]:
aw.shape, pref_len, full_len


(torch.Size([711, 711]), 623, 711)

In [51]:
from typing import List, Optional, Tuple, Union


def get_visual_token_mean_attn_score_llava(mean_attn, input_ids, pref_len, visual_token_id=-200, image_token_count=576) -> List[torch.Tensor]:
    r"""
    Get the attention weights of the visual tokens
    Args:
        mean_attn: the mean attention weights of the prefilling and full attention, shape: (L, L)
        inputs: the inputs of the model
    Returns:
        visual_token_attn_weights: the tuple of the attention weights of the visual tokens, each element shape: (V, V)
    """
    assert input_ids.shape[0] == 1, "batch size should be 1"
    vision_start_token_indices = torch.where(
        input_ids[0] == visual_token_id
    )[0]
    visual_token_attn_weights = []
    for i, s in enumerate(vision_start_token_indices):
        print(s.item(), (s + i*image_token_count).item(), (s + (i+1)*image_token_count).item())
        visual_token_attn_weights.append(
            torch.mean(mean_attn[pref_len:, s + i*image_token_count : s + (i+1)*image_token_count], dim=0)
        )
    return visual_token_attn_weights


vw = get_visual_token_mean_attn_score_llava(aw, input_ids, pref_len)


35 35 611


In [47]:
vw[0].shape


torch.Size([576])