In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import torch
from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)
from semeval.experiments.kosenko.language_bind.languagebind_classification_video_text import (
    CauseVideoTextClassif,
)


all_emotions = [
    "surprise",
    "fear",
    "sadness",
    "neutral",
    "joy",
    "anger",
    "disgust",
]

emotions2labels = {em: i for i, em in enumerate(all_emotions)}

labels2emotions = {i: em for i, em in enumerate(all_emotions)}

clip_type = {
    "video": "LanguageBind_Video_FT",
}

model = CauseVideoTextClassif(
    labels=len(all_emotions),
    clip_type=clip_type,
)


model.load_state_dict(
    torch.load(
        "semeval/experiments/kosenko/language_bind/train_results/exp_9_checkpoint-6703/pytorch_model.bin"
    )
)

pretrained_ckpt = f"LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt, cache_dir="/code/cache_dir/tokenizer_cache_dir"
)
modality_transform = {
    c: transform_dict[c](model.model.modality_config[c]) for c in clip_type.keys()
}

device = "cuda:0"
device = torch.device(device)
model = model.to(device)
model = model.half()

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [3]:
video = [
    "semeval/experiments/kosenko/language_bind/LanguageBind/assets/video/0.mp4",
]
language = [
    "Two pandas are eating bamboo.",
]

custom_inputs = {
    "initial_video": to_device(
        modality_transform["video"](video),
        device,
    ),
    "cause_video": to_device(
        modality_transform["video"](video),
        device,
    ),
    "initial_language": to_device(
        tokenizer(
            language,
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ),
        device,
    ),
    "cause_language": to_device(
        tokenizer(
            language,
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ),
        device,
    ),
}
model.eval()
with torch.no_grad():
    with torch.autocast(device_type="cuda"):
        result = model(custom_inputs)
result[0].argmax(-1).item(), result[1].argmax(-1).item()

(3, 0)

### predict from test

In [4]:
from datasets import load_dataset

dataset = load_dataset("dim/semeval_subtask2_conversations")
dataset = dataset["test"]

In [5]:
dataset[0]

{'conversation_ID': 1231,
 'conversation': [{'emotion': 'neutral',
   'speaker': 'Phoebe',
   'text': 'No .',
   'utterance_ID': 1,
   'video_name': 'dia1231utt1.mp4'},
  {'emotion': 'anger',
   'speaker': 'Phoebe',
   'text': 'No !',
   'utterance_ID': 2,
   'video_name': 'dia1231utt2.mp4'},
  {'emotion': 'joy',
   'speaker': 'Phoebe',
   'text': 'Oh , would you look at that Monica ?',
   'utterance_ID': 3,
   'video_name': 'dia1231utt3.mp4'},
  {'emotion': 'joy',
   'speaker': 'Phoebe',
   'text': 'I just knocked off all of your top scores , how sad .',
   'utterance_ID': 4,
   'video_name': 'dia1231utt4.mp4'},
  {'emotion': 'anger',
   'speaker': 'Monica',
   'text': 'Okay , I am next . Do not ! Do not start another game ! I said I am next ! Phoebe !',
   'utterance_ID': 5,
   'video_name': 'dia1231utt5.mp4'},
  {'emotion': 'joy',
   'speaker': 'Phoebe',
   'text': 'Oh , I am sorry . I did not hear you over all the winning .',
   'utterance_ID': 6,
   'video_name': 'dia1231utt6.mp4'

In [6]:
for conv in dataset:
    conversation = conv["conversation"]
    for i in range(len(conversation)):
        item_i = conversation[i]
        for j in range(len(conversation)):
            item_j = conversation[j]
            base_path = "/code/SemEval-2024_Task3/training_data/train"
            initial_video = [
                f'{base_path}/{item_i["video_name"]}',
            ]
            cause_video = [
                f'{base_path}/{item_j["video_name"]}',
            ]
            initial_language = [
                item_i["text"],
            ]
            cause_language = [
                item_j["text"],
            ]

            custom_inputs = {
                "initial_video": to_device(
                    modality_transform["video"](video),
                    device,
                ),
                "cause_video": to_device(
                    modality_transform["video"](video),
                    device,
                ),
                "initial_language": to_device(
                    tokenizer(
                        initial_language,
                        max_length=77,
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt",
                    ),
                    device,
                ),
                "cause_language": to_device(
                    tokenizer(
                        cause_language,
                        max_length=77,
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt",
                    ),
                    device,
                ),
            }
            model.eval()
            with torch.no_grad():
                with torch.autocast(device_type="cuda"):
                    result = model(custom_inputs)
            emotion = result[0].argmax(-1).item()
            emotion = labels2emotions[emotion]
            if emotion != "neutral":
                cause_or_not = result[1].argmax(-1).item()
                if cause_or_not == 1:
                    print(i + 1, emotion, j + 1, cause_or_not)

    break

1 sadness 6 1
1 sadness 7 1
2 anger 1 1
2 anger 3 1
2 anger 4 1
2 anger 5 1
2 anger 6 1
2 anger 7 1
3 surprise 1 1
3 surprise 2 1
3 surprise 3 1
3 surprise 4 1
3 surprise 5 1
3 surprise 6 1
3 surprise 7 1
5 anger 1 1
5 anger 3 1
5 anger 4 1
5 anger 5 1
5 anger 6 1
5 anger 7 1
6 anger 1 1
6 anger 2 1
6 anger 3 1
6 anger 4 1
6 anger 5 1
6 anger 6 1
6 anger 7 1
7 anger 3 1
7 anger 5 1
7 anger 6 1
7 anger 7 1


In [None]:
['3_joy', '4'],
['4_joy', '4'],
['5_anger', '5'],
['6_joy', '4'],
['6_joy', '6'],
['7_anger', '7']]