In [1]:
import torch
import gradio as gr
from gradio.themes.utils import colors, fonts, sizes

from semeval.experiments.kosenko.ask_anything.video_chat2.conversation import Chat

# videochat
from semeval.experiments.kosenko.ask_anything.video_chat2.utils.config import Config
from semeval.experiments.kosenko.ask_anything.video_chat2.utils.easydict import EasyDict
from semeval.experiments.kosenko.ask_anything.video_chat2.models.videochat2_it import (
    VideoChat2_it,
)
from peft import get_peft_model, LoraConfig, TaskType

  from .autonotebook import tqdm as notebook_tqdm
Matplotlib created a temporary cache directory at /tmp/matplotlib-f1_af4h3 because the default path (/home/user-name-goes-here/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def init_model(
    config_file="/code/semeval/experiments/kosenko/ask_anything/video_chat2/configs/config.json",
    videochat2_model_path="/code/semeval/experiments/kosenko/ask_anything/video_chat2/videochat2_7b_stage3.pth",
    num_frames=4,
):
    """Build the VideoChat2 model, wrap its LLM with LoRA, load weights, return a Chat.

    Args:
        config_file: Path of the JSON config read with ``Config.from_file``.
        videochat2_model_path: Path of the checkpoint passed to ``torch.load``.
        num_frames: Value written to ``cfg.model.vision_encoder.num_frames``
            before the model is constructed.

    Returns:
        A ``Chat`` instance wrapping the model in eval mode.

    All defaults reproduce the values that used to be hard-coded in the body,
    so ``init_model()`` behaves as before.
    """
    print("Initializing VideoChat")
    cfg = Config.from_file(config_file)
    cfg.model.vision_encoder.num_frames = num_frames
    model = VideoChat2_it(config=cfg.model)
    model = model.to(torch.device(cfg.device))

    # The LLM is wrapped with LoRA *before* the checkpoint is loaded, so the
    # state dict is matched against the PEFT-wrapped parameter names
    # (presumably the checkpoint stores its LoRA weights under those names;
    # TODO confirm against the checkpoint contents).
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.0,
    )
    model.llama_model = get_peft_model(model.llama_model, peft_config)

    # Load onto CPU first; the weights are copied into the already-placed model.
    state_dict = torch.load(videochat2_model_path, map_location="cpu")
    # Checkpoints may nest the weights under a "model" key or be a bare state dict.
    weights = state_dict["model"] if "model" in state_dict else state_dict
    # strict=False: keys absent from the checkpoint are reported, not raised.
    msg = model.load_state_dict(weights, strict=False)
    print(msg)
    model = model.eval()

    chat = Chat(model)
    print("Initialization Finished")
    return chat


chat = init_model()

Initializing VideoChat


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.29s/it]


_IncompatibleKeys(missing_keys=['llama_model.base_model.model.model.embed_tokens.weight', 'llama_model.base_model.model.model.layers.0.self_attn.q_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.k_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.v_proj.weight', 'llama_model.base_model.model.model.layers.0.self_attn.o_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.gate_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.down_proj.weight', 'llama_model.base_model.model.model.layers.0.mlp.up_proj.weight', 'llama_model.base_model.model.model.layers.0.input_layernorm.weight', 'llama_model.base_model.model.model.layers.0.post_attention_layernorm.weight', 'llama_model.base_model.model.model.layers.1.self_attn.q_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.k_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.v_proj.weight', 'llama_model.base_model.model.model.layers.1.self_attn.o_proj.

In [5]:
import json

# Read the Subtask 2 training annotations. The context manager closes the file
# handle deterministically (the previous `open(...).read()` left it open).
with open(
    "./SemEval-2024_Task3/training_data/Subtask_2_train.json", encoding="utf-8"
) as dataset_file:
    dataset = json.load(dataset_file)

# One clip path per utterance of the conversation stored at index 10.
base_video_path = "/code/SemEval-2024_Task3/training_data/train"
video_paths = [
    f"{base_video_path}/{item['video_name']}" for item in dataset[10]["conversation"]
]

# Single-clip alternative input; assigned but not passed to upload_video below.
# Another sample: "semeval/experiments/kosenko/ask_anything/video_chat2/example/yoga.mp4"
image = "/code/semeval/experiments/kosenko/ask_anything/example/hitting_baseball.mp4"

# Empty conversation state and visual-feature list handed to the upload call.
conv = EasyDict(
    dict(system="", roles=["Human", "Assistant"], messages=[], sep="###")
)
img_list = []
num_segments = 32

# Gradients are disabled for the upload call. The clips of the selected
# conversation (video_paths) are passed instead of the single `image` file.
with torch.no_grad():
    llm_message, img_list, chat_state = chat.upload_video(
        image=video_paths,
        conv=conv,
        img_list=img_list,
        num_segments=num_segments,
        video_prompt="Watch the video and predict emotion of last speaker.",
    )

Input video shape: torch.Size([96, 224, 224])
n_position: 6272
pre_n_position: 784
Pretraining uses 4 frames, but current frame is 32
Interpolate the position embedding


In [4]:
# Inspect the raw record at index 10 (the conversation used to build video_paths).
dataset[10]

{'conversation_ID': 11,
 'conversation': [{'utterance_ID': 1,
   'text': 'I mean , why should I let them meet him ? I mean , I bring a guy home , and within five minutes they are all over him . I mean , they are like ... coyotes , picking off the weak members of the herd .',
   'speaker': 'Monica',
   'emotion': 'disgust',
   'video_name': 'dia11utt1.mp4'},
  {'utterance_ID': 2,
   'text': 'Listen . As someone who seen more than her fair share of bad beef , I will tell you : that is not such a terrible thing .',
   'speaker': 'Paula',
   'emotion': 'neutral',
   'video_name': 'dia11utt2.mp4'},
  {'utterance_ID': 3,
   'text': 'Come on , they are your friends , they are just looking out after you .',
   'speaker': 'Paula',
   'emotion': 'neutral',
   'video_name': 'dia11utt3.mp4'},
  {'utterance_ID': 4,
   'text': 'I know . I just wish that once , I would bring a guy home that they actually liked .',
   'speaker': 'Monica',
   'emotion': 'sadness',
   'video_name': 'dia11utt4.mp4'},
  {

In [17]:
# Render each utterance of the conversation at index 11 as three labelled
# lines, then glue all utterances (the last one included) with newlines.
utterance_template = "Speaker: {speaker}\nText: {text}\nEmotion: {emotion}"
dialog_example = "\n".join(
    utterance_template.format(
        speaker=utterance["speaker"],
        text=utterance["text"],
        emotion=utterance["emotion"],
    )
    for utterance in dataset[11]["conversation"]
)
print(dialog_example)

Speaker: Joey
Text: Let it go , Ross .
Emotion: neutral
Speaker: Ross
Text: Yeah , well , you did not know Chi Chi .
Emotion: anger
Speaker: Monica
Text: Do you all promise ?
Emotion: neutral
Speaker: All
Text: Yeah ! We promise ! We will be good !
Emotion: neutral
Speaker: Monica
Text: Chandler ? Do you promise to be good ?
Emotion: neutral
Speaker: Joey
Text: You can come in , but your filter ... tipped little buddy has to stay outside !
Emotion: joy
Speaker: Ross
Text: Hey , Pheebs .
Emotion: neutral
Speaker: Phoebe
Text: Dear Ms . Buffay . Thank you for calling attention to our error . We have credited your account with five hundred dollars .
Emotion: neutral
Speaker: Phoebe
Text: We are sorry for the inconvenience , and hope you will accept this football phone ... as our free gift . Do you believe this ? ! Now I have a thousand dollars , and a football phone !
Emotion: surprise


In [25]:
# Shape of the first entry of img_list as returned by chat.upload_video.
img_list[0].shape

torch.Size([1, 96, 4096])

In [28]:
# Conversation state handed to chat.answer: a video placeholder message
# followed by a single Human prompt.
# The prompt text is laid out as: a labelled dialogue (Speaker/Text/Emotion),
# a "---" separator, a short block ending in "CAUSE: 1 2", and finally the
# instruction with the list of allowed emotion labels.
# NOTE(review): the second utterance carries an extra "Position: 2" line and
# the third utterance ("Do you all promise ?") has no "Speaker:" line, unlike
# the other entries — confirm these irregularities are intentional.
conv = EasyDict(
    {
        "system": "",
        "roles": ["Human", "Assistant"],
        "messages": [
            # Placeholder message for the uploaded video.
            ["Human", "<Video><VideoHere></Video>\n"],
            [
                "Human",
                # The final Joey utterance has no "Emotion:" line; it is the
                # one the instruction asks the model to label.
                """
Speaker: Joey
Text: Let it go , Ross .
Emotion: neutral
Speaker: Ross
Text: Yeah , well , you did not know Chi Chi .
Position: 2
Emotion: anger
Text: Do you all promise ?
Emotion: neutral
Speaker: All
Text: Yeah ! We promise ! We will be good !
Emotion: neutral
Speaker: Monica
Text: Chandler ? Do you promise to be good ?
Emotion: neutral
Speaker: Joey
Text: You can come in , but your filter ... tipped little buddy has to stay outside !
---
Speaker: Joey
Text: Let it go , Ross .
Emotion: neutral
CAUSE: 1 2
Predict last emotion based on last "Text" of speaker "Joey". 
Select from this list: surprise, fear, sadness, neutral, joy, anger, disgust. 
""",
                # Label of the held-out utterance in the dataset printout:
                # Emotion: joy
                # Disabled prompt suffix:
                # Let's think step by step.
            ],
            # No explicit Assistant turn is added in this cell:
            # ["Assistant", None],
        ],
        "sep": "###",
    }
)
# Generation settings. These variables are now the single source of truth for
# the call below; previously the call hard-coded max_new_tokens=1000 while the
# variable said 200. The values match what the call actually used, so the
# generation behaviour is unchanged.
max_new_tokens = 1000
num_beams = 1
temperature = 1.0

# Declared for reference only: they are not forwarded to chat.answer here.
min_length = 1
top_p = 0.9
repetition_penalty = 1.0
length_penalty = 1

llm_message, llm_message_token, chat_state = chat.answer(
    conv=conv,
    img_list=img_list,
    max_new_tokens=max_new_tokens,
    num_beams=num_beams,
    temperature=temperature,
)

# Strip the "<s>" marker from the decoded text before showing it.
llm_message = llm_message.replace("<s>", "")
print(f"Answer: {llm_message}")

Answer: The last "Text" of the speaker "Joey" is "You can come in, but your filter ... tipped little buddy has to stay outside !". Based on this, the last emotion can be disgust.


In [14]:
# Show the answer text currently held in llm_message.
llm_message

'The next emotion predicted based on the last text is disgust.'

### Simple forward pass with frozen image tokens

In [None]:
# Conversation with the video placeholder and one instruction from the Human.
conv = EasyDict(
    dict(
        system="",
        roles=["Human", "Assistant"],
        messages=[
            ["Human", "<Video><VideoHere></Video>\n"],
            ["Human", "Describe the following image in details and say hello.\n"],
        ],
        sep="###",
    )
)

# Append a turn for the second role ("Assistant") with no content yet.
assistant_role = conv.roles[1]
conv.messages.append([assistant_role, None])

# Build the input embeddings from the conversation and the img_list features,
# then run one forward pass of the LLM directly on those embeddings.
embs = chat.get_context_emb(conv, img_list)
outputs = chat.model.llama_model(inputs_embeds=embs)

In [29]:
# Shape of the embeddings produced by chat.get_context_emb.
embs.shape

torch.Size([1, 130, 4096])

In [8]:
# List the fields returned by the llama_model forward call.
outputs.keys()

odict_keys(['logits', 'past_key_values'])

In [30]:
# Scratch arithmetic. NOTE(review): presumably the last dimension (4096) over
# the middle dimension (96) of img_list[0] — confirm what this ratio checks.
4096 / 96

42.666666666666664

### Test with Friends

In [18]:
from datasets import load_dataset
from torchvision.io import read_video
import json
import torch
import os

Speaker: Monica
Text: I mean , why should I let them meet him ? I mean , I bring a guy home , and within five minutes they are all over him . I mean , they are like ... coyotes , picking off the weak members of the herd .
Emotion: disgust
Speaker: Paula
Text: Listen . As someone who seen more than her fair share of bad beef , I will tell you : that is not such a terrible thing .
Emotion: neutral
Speaker: Paula
Text: Come on , they are your friends , they are just looking out after you .
Emotion: neutral
Speaker: Monica
Text: I know . I just wish that once , I would bring a guy home that they actually liked .
Emotion: sadness
Speaker: Paula
Text: Well , you do realise the odds of that happening are a little slimmer if they never get to meet the guy ...
Emotion: neutral


In [22]:
from PIL import Image

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

from enum import auto, Enum

import numpy as np
from decord import VideoReader, cpu
import torchvision.transforms as T
from semeval.experiments.kosenko.ask_anything.video_chat2.dataset.video_transforms import (
    GroupNormalize,
    GroupScale,
    GroupCenterCrop,
    Stack,
    ToTorchFormatTensor,
)
from torchvision.transforms.functional import InterpolationMode
from torchvision.io import read_video, write_video

# Pick the CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# def concat_videos(video_paths):
#     all_videos = []
#     avg_fps_total = []
#     for video_path in video_paths:
#         vr = VideoReader(video_path, ctx=cpu(0))
#         all_video = vr.get_batch(list(range(len(vr)))).numpy()
#         all_videos.append(all_video)
#         avg_fps_total.append(vr.get_avg_fps())

#     all_videos = np.concatenate(all_videos)
#     avg_fps_total = np.mean(avg_fps_total)
#     write_video(
#         "./test.mp4",
#         torch.tensor(all_videos),
#         fps=avg_fps_total,
#     )


# item = concat_videos(
#     video_paths=video_paths,
# )
# Display the clip paths collected for the selected conversation.
video_paths

['/code/SemEval-2024_Task3/training_data/train/dia1utt1.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt2.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt3.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt4.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt5.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt6.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt7.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt8.mp4']

['/code/SemEval-2024_Task3/training_data/train/dia1utt1.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt2.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt3.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt4.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt5.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt6.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt7.mp4',
 '/code/SemEval-2024_Task3/training_data/train/dia1utt8.mp4']

In [None]:
def build_task_prompt(input_text):
    """Return the emotion/cause instruction prompt for one conversation.

    The prompt is the fixed instruction (with a worked example) followed by
    ``input_text`` and the trailing ``" Output:"`` cue.

    Args:
        input_text: The conversation text to append after ``"Input: "``.

    Returns:
        The complete prompt string.
    """
    instruction = (
        "Your task is an emotion classification and cause recognition. "
        "Predict the emotion of the speaker in each utterance and the ids of "
        "utterances that caused the speaker's emotion in each utterance, then "
        "place the target utterances and its emotions as sorted keys and a sorted list of ids of "
        "causal utterances as values in an output JSON payload. Below is an example: "
        'Input: Rachel: "Mom, would you relax.", Rachel: "That was 10 blocks from here and, '
        'the, the woman was walking alone at night, I would never do that.", Rachel: "Mom, c mon, '
        'stop worrying.". Output: {"1_neutral": [], "2_sadness": [3], "3_sadness": [3]}. Now, complete the task. Input: '
    )
    # Explicit concatenation: a bare name placed between adjacent string
    # literals (as the cell had it) is a SyntaxError, not implicit joining.
    return instruction + input_text + " Output:"


# NOTE(review): `input_text` is not defined in the cells visible here; it must
# be assigned before this cell runs.
task = build_task_prompt(input_text)