In [1]:
import os
import sys
import torch
import pickle
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Optional, Union
from tqdm import tqdm
import torch.nn.functional as F
from PIL import Image
import json
from tqdm import tqdm

# Make the local LLaVA checkout (models/LLaVA) importable
sys.path.append('models/LLaVA')

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)


# LLaVA helper, called once before the model is instantiated below.
disable_torch_init()

# Checkpoint identifier, bound once so the name lookup and the loader
# below are guaranteed to refer to the same model.
model_path = "liuhaotian/llava-v1.5-7b"
model_name = get_model_name_from_path(model_path)
print(f"Loading model: {model_name}")

# Arguments are passed positionally, exactly as before: the checkpoint
# identifier, None, and the derived model name.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, None, model_name
)

  from .autonotebook import tqdm as notebook_tqdm


Loading model: llava-v1.5-7b


You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]


In [3]:
# Directory that holds the images referenced by data/test.jsonl. It can be
# overridden through the VAL2014_DIR environment variable so the notebook is
# not tied to one machine; the default keeps the original location.
image_dir = os.environ.get("VAL2014_DIR", "/home/user/khoihm/val2014/")

# Read the records with an explicit encoding instead of relying on the
# platform default.
with open('data/test.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Work on the second record of the file.
line = lines[1]
data = json.loads(line)
path = os.path.join(image_dir, data['image'])

# Load the image and turn it into a float16 tensor on the model's device.
image = Image.open(path).convert("RGB")
image_tensor = process_images([image], image_processor, model.config)
image_tensor = image_tensor.to(model.device, dtype=torch.float16)
image_sizes = [image.size]

# The question is prefixed with the image placeholder token and a newline.
query = "Describe this image."
qs = DEFAULT_IMAGE_TOKEN + "\n" + query

# Build the prompt from the "llava_v1" template: the question goes in as the
# first role's message, and the second role's message is left as None.
conv = conv_templates["llava_v1"].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

# Tokenize the prompt, add a leading dimension with unsqueeze(0) and move the
# ids to the model's device.
input_ids = tokenizer_image_token(
    prompt,
    tokenizer,
    IMAGE_TOKEN_INDEX,
    return_tensors="pt",
).unsqueeze(0).to(model.device)

# Generate without sampling (do_sample=False) and ask for a dict-style result
# that also carries per-step scores and attentions.
with torch.inference_mode():
    outputs = model.generate(
        input_ids,
        # NOTE(review): image_tensor already comes from process_images on a
        # one-element list; unsqueeze(0) adds one more leading axis. Confirm
        # this is the layout the model expects.
        images=image_tensor.unsqueeze(0),
        image_sizes=image_sizes,
        do_sample=False,
        output_scores=True,
        output_attentions=True,
        return_dict_in_generate=True,
        max_new_tokens=2048,
    )

# Show the generated sequence, special tokens included.
print(tokenizer.decode(outputs.sequences[0], skip_special_tokens=False))

<s> The image features a man wearing a red jacket and ski gear, standing on a snow-covered slope. He is holding a pair of skis and appears to be preparing to ski down the hill. The man is positioned in the center of the scene, with the skis held in his hands.

In the background, there are a few other people scattered around the slope, possibly enjoying the winter sports activities as well. The scene captures the excitement and fun of skiing on a sunny day.</s>


In [4]:
# Text handed to generate_with_prefix as the `prefix` argument. Kept under its
# own name so `prefix` is only ever bound to the token tensor.
prefix_text = 'The image features a man wearing a red jacket and ski gear, standing on a snow-covered slope. In'

# Encode the text, take the first row of the result and move it to the
# model's device.
# NOTE(review): encode() runs with its default special-token handling, so the
# ids presumably start with a BOS token. Confirm generate_with_prefix expects
# that.
prefix = tokenizer.encode(prefix_text, return_tensors="pt")[0].to(model.device)

# Same question as before: image placeholder token, newline, then the query.
query = "Describe this image."
qs = f"{DEFAULT_IMAGE_TOKEN}\n{query}"

# Fresh copy of the "llava_v1" template: the question is the first role's
# message, and the second role's message is None.
conv = conv_templates["llava_v1"].copy()
for role, message in ((conv.roles[0], qs), (conv.roles[1], None)):
    conv.append_message(role, message)
prompt = conv.get_prompt()

# Tokenize, then add a leading dimension and move to the model's device.
prompt_ids = tokenizer_image_token(
    prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = prompt_ids.unsqueeze(0).to(model.device)

# image_tensor and image_sizes are reused from the cell that loaded the image.
with torch.inference_mode():
    new_outputs = model.generate_with_prefix(
        input_ids,
        images=image_tensor.unsqueeze(0),
        prefix=prefix,
        image_sizes=image_sizes,
        max_new_tokens=2048,
    )

# Show the first returned sequence, special tokens included.
print(tokenizer.decode(new_outputs[0], skip_special_tokens=False))


<s> The image features a man wearing a red jacket and ski gear, standing on a snow-covered slope. In his hand, he holds a pair of skis, ready to ski down the hill. The man appears to be enjoying his time on the snow-covered slope, possibly taking a break or preparing to continue skiing.

There are a few other people in the scene, but they are not the main focus of the image. The main subject is the man in the red jacket, who is the center of attention as he stands on his skis and looks out over the snowy landscape.</s>
