In [3]:
import argparse
import os
import random
from collections import defaultdict

import cv2
import re

import numpy as np
from PIL import Image
import torch
import html
import gradio as gr

import torchvision.transforms as T
import torch.backends.cudnn as cudnn

from minigpt4.common.config import Config

from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
from minigpt4.conversation.conversation import CONV_VISION_minigptv2

In [4]:


# Notebook stand-in for the demo script's command-line arguments: every entry
# becomes a "--<name>" option whose default is the value given here.
args_dict = {
    "cfg-path": 'eval_configs/minigptv2_eval.yaml',
    "gpu-id": 0,
    "options": None,
}

parser = argparse.ArgumentParser(description="Demo")
for arg_name, arg_value in args_dict.items():
    parser.add_argument("--" + arg_name, default=arg_value)

# Parse an empty argv so that only the defaults declared above are used.
args = parser.parse_args([])

# Seed the Python, NumPy and PyTorch RNGs with the same fixed value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(42)

# Fixed cuDNN settings for the run.
cudnn.deterministic = True
cudnn.benchmark = False

print('Initializing Chat')
cfg = Config(args)

Initializing Chat


In [55]:
# Inspect the Config object built from `args`; only its default repr is shown.
cfg

<minigpt4.common.config.Config at 0x7fdb520f3910>

In [5]:
# Target GPU, derived from the parsed `gpu_id` argument.
device = f"cuda:{args.gpu_id}"

# Look up the model class registered under the configured architecture name
# and instantiate it from that same config section.
model_config = cfg.model_cfg
# The device_8bit override stays disabled: model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config)

***********************
LLAMA MODEL PATH: models/Meta-Llama-3-8B-Instruct


Some weights of the model checkpoint at models/Meta-Llama-3-8B-Instruct were not used when initializing LlamaForCausalLM: ['layers.0.attention.wv.weight', 'layers.1.attention.wk.weight', 'layers.14.attention.wq.weight', 'layers.0.feed_forward.w1.weight', 'layers.9.attention.wq.weight', 'layers.21.attention.wk.weight', 'layers.8.attention_norm.weight', 'layers.10.feed_forward.w3.weight', 'layers.7.attention.wo.weight', 'layers.16.attention.wv.weight', 'layers.19.attention.wq.weight', 'layers.28.feed_forward.w1.weight', 'layers.0.feed_forward.w3.weight', 'layers.23.ffn_norm.weight', 'layers.30.attention.wo.weight', 'layers.27.attention.wk.weight', 'layers.20.ffn_norm.weight', 'layers.5.attention.wo.weight', 'layers.3.ffn_norm.weight', 'layers.13.feed_forward.w2.weight', 'layers.4.attention_norm.weight', 'layers.31.attention_norm.weight', 'layers.16.attention.wo.weight', 'layers.29.ffn_norm.weight', 'layers.14.feed_forward.w3.weight', 'layers.27.attention.wq.weight', 'layers.27.feed_forwa

trainable params: 33554432 || all params: 8869122048 || trainable%: 0.3783286758080702
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL HIDDEN DIM: 6144
LLAMA MODEL DIM: 1408
LLAMA 3 MODEL HIDDEN DIM: 6144
LLAMA MODEL

In [7]:
model = model.to(device)

# Build the image preprocessor that the config actually names instead of
# hard-coding 'blip2_image_train'. The config entry read here carries its own
# `name` field, and the hard-coded class silently ignored it.
# NOTE(review): the train variant presumably applies training-time
# augmentation, which would be unwanted for inference — confirm.
# Registered processors: ['blip2_image_eval', 'blip2_image_train', 'blip_caption']
vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

# Switch the model to evaluation mode before running inference.
model = model.eval()

# CONV_VISION = Conversation(
#     system="",
#     roles=(r"<s>[INST] ", r" [/INST]"),
#     messages=[],
#     offset=2,
#     sep_style=SeparatorStyle.SINGLE,
#     sep="",
# )

In [60]:
# List the processor names currently registered in the registry.
registry.list_processors()

['blip2_image_eval', 'blip2_image_train', 'blip_caption']

In [61]:
# Show the processor config read from cfg.datasets_cfg.cc_sbu_align.vis_processor.train.
vis_processor_cfg

{'name': 'blip2_image_eval', 'image_size': 448}

In [62]:
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [33]:
# Inspect the full cc_sbu_align dataset config (data type, build info, processors).
cfg.datasets_cfg.cc_sbu_align

{'data_type': 'images', 'build_info': {'storage': '/work/pi_donghyunkim_umass_edu/hochul/emnlp/kthk/MiniGPT-4/minigpt4/configs/datasets/cc_sbu_align/'}, 'vis_processor': {'train': {'name': 'blip2_image_eval', 'image_size': 448}}, 'text_processor': {'train': {'name': 'blip_caption'}}}

In [64]:
# Display the module tree of the loaded model.
# NOTE(review): this repr is very long; consider clearing the output before sharing.
model

MiniGPTv2(
  (llama_model): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128256, 4096, padding_idx=0)
          (layers): ModuleList(
            (0-31): 32 x LlamaDecoderLayer(
              (self_attn): LlamaAttention(
                (q_proj): Linear8bitLt(
                  in_features=4096, out_features=4096, bias=False
                  (lora_dropout): Dropout(p=0.05, inplace=False)
                  (lora_A): Linear(in_features=4096, out_features=64, bias=False)
                  (lora_B): Linear(in_features=64, out_features=4096, bias=False)
                )
                (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (v_proj): Linear8bitLt(
                  in_features=4096, out_features=4096, bias=False
                  (lora_dropout): Dropout(p=0.05, inplace=False)
                  (lora_A): Linear(in_features=4096, out_featur

In [65]:
# Image used for this run. The earlier dataset path and the placeholder
# question were dead stores (overwritten on the very next line) and were removed.
image_path = './examples_v2/float.png'

# `image` is replaced by the output of vis_processor; `image2` keeps an
# unprocessed RGB copy of the same file.
image = Image.open(image_path).convert('RGB')
image2 = Image.open(image_path).convert('RGB')
image = vis_processor(image)

# Plain string: the former f-string had no placeholders.
question = "Describe the scene"

In [53]:
# Check the processed image tensor's shape.
image.shape

torch.Size([3, 448, 448])

In [66]:
# Add the leading batch dimension exactly once. Without the guard, re-running
# this cell would keep stacking extra leading dimensions onto `image`.
if image.dim() == 3:
    image = image.unsqueeze(0)

In [67]:
def prepare_texts(texts, conv_temp):
    """Build one prompt string per input text from a conversation template.

    For every text, a fresh copy of ``conv_temp`` receives two messages: the
    first role with ``'<Img><ImageHere></Img> '`` followed by the text, then
    the second role with ``None`` as its message. The prompt is whatever the
    copy's ``get_prompt()`` returns.

    Args:
        texts: Iterable of question/instruction strings. Any iterable works,
            including generators; it is consumed once.
        conv_temp: Template object providing ``copy()``; each copy must expose
            ``roles``, ``append_message(role, message)`` and ``get_prompt()``.
            The template itself is only copied, never appended to.

    Returns:
        list: One prompt per input text, in input order (empty for no texts).
    """
    prompts = []
    for text in texts:
        conv = conv_temp.copy()
        conv.append_message(conv.roles[0], '<Img><ImageHere></Img> {}'.format(text))
        # None as the second role's message — presumably leaves that turn open
        # for the model to fill in; confirm against the Conversation class.
        conv.append_message(conv.roles[1], None)
        prompts.append(conv.get_prompt())
    return prompts
# Work on a copy of the imported template — presumably so CONV_VISION_minigptv2
# itself is left untouched by later changes; confirm copy() semantics.
conv_temp = CONV_VISION_minigptv2.copy()
# conv_temp.system = ""

In [68]:
# prepare_texts takes a list of texts and returns a list of prompts, so `text`
# is a one-element list holding the prompt for `question`.
text = prepare_texts([question], conv_temp)

In [78]:
# Hand-written VQA-style prompt (image placeholder followed by a "[vqa]" tag).
# Fixed: a stray apostrophe before the closing quote was part of the prompt
# text, and the f-prefix was dropped because there are no placeholders.
# NOTE(review): this variable is `test`, while the generate cell passes `text` —
# confirm which one is intended.
test = "<Img><ImageHere></Img> [vqa] Based on the image, respond to this question with a short answer: Are there any cars nearby?"

In [8]:
# Decoding options gathered in one place, then passed through unchanged.
generation_kwargs = {
    "max_new_tokens": 100,
    "do_sample": False,
    "temperature": 1,
    "top_p": 0.9,
}
model.generate(image, text, **generation_kwargs)