In [1]:
import torch
import gradio as gr
from gradio.themes.utils import colors, fonts, sizes

from semeval.experiments.kosenko.ask_anything.video_chat2.conversation import Chat

# videochat
from semeval.experiments.kosenko.ask_anything.video_chat2.utils.config import Config
from semeval.experiments.kosenko.ask_anything.video_chat2.utils.easydict import EasyDict
from semeval.experiments.kosenko.ask_anything.video_chat2.models.videochat2_it import (
    VideoChat2_it,
)
from peft import get_peft_model, LoraConfig, TaskType

  from .autonotebook import tqdm as notebook_tqdm
Matplotlib created a temporary cache directory at /tmp/matplotlib-vwmhkh0b because the default path (/home/user-name-goes-here/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def init_model():
    print("Initializing VideoChat")
    config_file = (
        "/code/semeval/experiments/kosenko/ask_anything/video_chat2/configs/config.json"
    )
    cfg = Config.from_file(config_file)
    cfg.model.vision_encoder.num_frames = 4
    # cfg.model.videochat2_model_path = ""
    # cfg.model.debug = True
    model = VideoChat2_it(config=cfg.model)
    model = model.to(torch.device(cfg.device))

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.0,
    )
    model.llama_model = get_peft_model(model.llama_model, peft_config)
    # state_dict = torch.load("your_model_path/videochat2_7b_stage3.pth", "cpu")
    videochat2_model_path = "/code/semeval/experiments/kosenko/ask_anything/video_chat2/videochat2_7b_stage3.pth"
    state_dict = torch.load(videochat2_model_path, "cpu")
    if "model" in state_dict.keys():
        msg = model.load_state_dict(state_dict["model"], strict=False)
    else:
        msg = model.load_state_dict(state_dict, strict=False)
    print(msg)
    model = model.eval()

    chat = Chat(model)
    print("Initialization Finished")
    return chat


chat = init_model()

Initializing VideoChat


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.59s/it]


_IncompatibleKeys(missing_keys=['llama_model.base_model.model.model.embed_tokens.weight', 'llama_model.base_model.model.model.layers.0.self_attn.q_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.k_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.v_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.o_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.gate_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.down_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.up_proj.weight', 'llama_model.base_model.model.model.layers.0.input_layernorm.weight', 'llama_model.base_model.model.model.layers.0.post_attention_layernorm.weight', 'llama_model.base_model.model.model.layers.1.self_attn.q_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.k_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.v_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.o_proj.

In [3]:
with torch.no_grad():
    image = "semeval/experiments/kosenko/ask_anything/video_chat2/example/yoga.mp4"
    conv = EasyDict(
        {"system": "", "roles": ["Human", "Assistant"], "messages": [], "sep": "###"}
    )
    img_list = []
    num_segments = 8
    llm_message, img_list, chat_state = chat.upload_video(
        image=image,
        conv=conv,
        img_list=img_list,
        num_segments=8,
    )


conv = EasyDict(
    {
        "system": "",
        "roles": ["Human", "Assistant"],
        "messages": [
            ["Human", "<Video><VideoHere></Video>\n"],
            ["Human", "Describe the following image in details.\n"],
            # ["Assistant", None],
        ],
        "sep": "###",
    }
)
max_new_tokens = 200
num_beams = 1
min_length = 1
top_p = 0.9
repetition_penalty = 1.0
length_penalty = 1
temperature = 1.0

llm_message, llm_message_token, chat_state = chat.answer(
    conv=conv,
    img_list=img_list,
    max_new_tokens=1000,
    num_beams=1,
    temperature=1.0,
)

llm_message = llm_message.replace("<s>", "")  # handle <s>
print(f"Answer: {llm_message}")

Input video shape: torch.Size([24, 224, 224])
n_position: 1568
pre_n_position: 784
Pretraining uses 4 frames, but current frame is 8
Interpolate the position embedding


  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


Answer: The image shows a lady doing yoga on the rooftop of a house, surrounded by mountains and the ocean. She is wearing a black tank top and performing different yoga postures, including forward bending and warrior pose. The lady seems to be enjoying the beautiful view while practicing yoga in the fresh air.
