In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import torch
from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)


# Emotion classes; the integer label of a class is its position in this list.
all_emotions = ["surprise", "fear", "sadness", "neutral", "joy", "anger", "disgust"]

# Reverse (id -> name) and forward (name -> id) lookup tables.
labels2emotions = dict(enumerate(all_emotions))

emotions2labels = {name: idx for idx, name in labels2emotions.items()}

# Modalities to load, mapped to the LanguageBind variant requested for each.
clip_type = {"video": "LanguageBind_Video_FT"}

model = LanguageBind(clip_type=clip_type, cache_dir="/code/cache_dir")

pretrained_ckpt = "LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt,
    cache_dir="/code/cache_dir/tokenizer_cache_dir",
)

# One preprocessing transform per requested modality, built from the model's
# own per-modality config.
modality_transform = {
    modality: transform_dict[modality](model.modality_config[modality])
    for modality in clip_type
}

# Move the model to the first visible GPU and convert its weights to half precision.
device = torch.device("cuda:0")
model = model.to(device).half()

video = ["semeval/experiments/kosenko/language_bind/LanguageBind/assets/video/0.mp4"]

# Candidate captions to rank against the clip.
language = [
    "A lion climbing a tree to catch a monkey.",
    "Training a parrot to climb up a ladder.",
    "Cute parrot is sitting on the floor.",
    "Parrot climbs the small stairs.",
    "Two pandas are eating bamboo.",
]

# Preprocess both modalities and move them to the model's device.
inputs = {
    "video": to_device(modality_transform["video"](video), device),
    "language": to_device(
        tokenizer(
            language,
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ),
        device,
    ),
}

# Inference only: no gradients, mixed precision on the GPU.
with torch.no_grad(), torch.autocast(device_type="cuda"):
    embeddings = model(inputs)

# Softmax over the video-to-text similarities, then rank all captions.
similarity = embeddings["video"] @ embeddings["language"].T
probs = torch.softmax(similarity, dim=-1)
sorted_indices = probs.topk(k=len(language)).indices.squeeze().tolist()
predicted_texts = "\n--\n".join(language[pos] for pos in sorted_indices)

print("Video x Text: \n", probs)
print(f"Most similar text is: \n{predicted_texts}")

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


Video x Text: 
 tensor([[1.3242e-07, 6.8035e-01, 1.0337e-03, 3.1862e-01, 3.9172e-09]],
       device='cuda:0')
Most similar text is: 
Training a parrot to climb up a ladder.
--
Parrot climbs the small stairs.
--
Cute parrot is sitting on the floor.
--
A lion climbing a tree to catch a monkey.
--
Two pandas are eating bamboo.


Add language embeddings to BERT

In [4]:
# from semeval.experiments.kosenko.language_bind.custom_bert import BertModel
# from transformers import BertTokenizer, AutoConfig


# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# config = AutoConfig.from_pretrained("bert-base-uncased")
# model = BertModel._from_config(config)
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors="pt")
# output = model(**encoded_input)
# # output

In [2]:
# from semeval.experiments.kosenko.language_bind.custom_bert import (
#     BertForCauseAnswering,
# )
# from transformers import BertTokenizer, AutoConfig

# # model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
# bert_model = BertForCauseAnswering._from_config(
#     AutoConfig.from_pretrained("bert-base-uncased")
# )
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

[BOS, U_1, U_2, U_3, ..., U_d, SEP, U_t, EOS]

- BOS - вектор начала строки (в нашем случае он не имеет никакого отношения к оригинальному берт, инициализируется рандомно)
- U_1 - вектор первой реплики
- d - общее количество реплик (вроде как максимум тут 35)
- SEP - разделительный вектор
- U_t - вектор интересующей нас реплики для которой мы хотим предсказать причину
- EOS - вектор конца строки

Далее каждый вектор пропускается через линейный слой 768х2, и выполняется бинарная классификация каждого токена: 0 означает, что данная реплика не является причиной, 1 — что является.

### Бейзлайн, который будет использовать архитектуру bert

[BOS, U_1, U_2, U_3, ..., U_d, SEP, U_t, SEP, U_c, EOS]

- BOS - вектор начала строки (в нашем случае он не имеет никакого отношения к оригинальному берт, инициализируется рандомно)
- U_1 - вектор первой реплики
- d - общее количество реплик (вроде как максимум тут 35)
- SEP - разделительный вектор
- U_t - вектор интересующей нас реплики для которой мы хотим предсказать причины
- U_c - реплика которая является причиной. Для отрицательных примеров вставляем ту, которая не является. 
- EOS - вектор конца строки

Далее каждый вектор пропускается через линейный слой 768х1, и затем, как в задаче SQuAD, мы предсказываем позицию причины. В SQuAD берётся линейный слой 768х2, его выход разрезается на два вектора (логиты начала и конца), и позиция (класс) в строке предсказывается при помощи кросс-энтропии.

Подобно тому, как это сделано в `BertForQuestionAnswering`:
```python
outputs = self.bert(
	input_ids,
	attention_mask=attention_mask,
	token_type_ids=token_type_ids,
	position_ids=position_ids,
	head_mask=head_mask,
	inputs_embeds=inputs_embeds,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
)

sequence_output = outputs[0]
# self.qa_outputs = nn.Linear(768, 2)
logits = self.qa_outputs(sequence_output)
# logits.shape = [batch, seq_len, 2]
start_logits, end_logits = logits.split(1, dim=-1)
# start_logits.shape = [batch, seq_len, 1]
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
```

In [22]:
import random
from datasets import load_dataset

# Load the `dim/semeval_subtask2_conversations` dataset with `datasets.load_dataset`.
dataset = load_dataset("dim/semeval_subtask2_conversations")


# dataset = dataset["train"]
# dataset
def get_bert_cause_dataset(dataset, rng=None):
    """Flatten conversations into (emotion utterance, candidate cause) examples.

    For every conversation, one positive example is emitted per annotated
    emotion-cause pair, followed by randomly sampled negative examples
    (utterance pairs that are not annotated as emotion/cause). The number of
    negatives is capped at the number of distinct positive pairs.

    Args:
        dataset: iterable of items with a ``"conversation"`` list (each
            utterance is a dict with at least an ``"emotion"`` key) and an
            ``"emotion-cause_pairs"`` list. Each pair is indexed as
            ``pair[0] == "<1-based utterance id>_<emotion>"`` and
            ``pair[1] == "<1-based cause utterance id>"``.
        rng: optional ``random.Random`` instance used to sample negatives.
            Pass a seeded instance for reproducible datasets. When ``None``
            (default) the global ``random`` module is used, exactly as before.

    Returns:
        A list of dicts with keys ``"conversation"``, ``"emotion_pos"``,
        ``"cause_pos"`` (both 0-based), ``"emotion_label"`` and ``"label"``.
        ``"label"`` equals ``cause_pos`` for positives and ``-1`` for negatives.
    """
    sampler = random if rng is None else rng
    new_dataset = []

    for item in dataset:
        conversation = item["conversation"]
        positive_pairs = set()

        for cause in item["emotion-cause_pairs"]:
            emotion_parts = cause[0].split("_")
            # Annotation ids are 1-based; convert to 0-based list positions.
            emotion_pos = int(emotion_parts[0]) - 1
            emotion = emotion_parts[1]
            cause_pos = int(cause[1]) - 1
            new_dataset.append(
                {
                    "conversation": conversation,
                    "emotion_pos": emotion_pos,
                    "cause_pos": cause_pos,
                    "emotion_label": emotion,
                    "label": cause_pos,
                }
            )
            positive_pairs.add((emotion_pos, cause_pos))

        # Every ordered utterance pair that is not annotated is a negative candidate.
        negative_pairs = [
            (pos_i, pos_j)
            for pos_i in range(len(conversation))
            for pos_j in range(len(conversation))
            if (pos_i, pos_j) not in positive_pairs
        ]

        # Keep the classes balanced: at most one negative per distinct positive.
        if len(negative_pairs) > len(positive_pairs):
            negative_pairs = sampler.sample(negative_pairs, len(positive_pairs))

        for emotion_pos, cause_pos in negative_pairs:
            new_dataset.append(
                {
                    "conversation": conversation,
                    "emotion_pos": emotion_pos,
                    "cause_pos": cause_pos,
                    "emotion_label": conversation[emotion_pos]["emotion"],
                    "label": -1,
                }
            )

    return new_dataset


# Build the flattened cause-prediction examples for both splits. Negatives are
# drawn with `random.sample` inside the helper, so the result changes between
# runs unless the `random` module is seeded first.
dataset_train = get_bert_cause_dataset(dataset=dataset["train"])
dataset_test = get_bert_cause_dataset(dataset=dataset["test"])

# First training example, used by the following cells as a worked example.
example = dataset_train[0]

In [23]:
base_video_path = "/code/SemEval-2024_Task3/training_data/train"

# Clip path and transcript for every utterance of the example conversation.
video = [
    f"{base_video_path}/{utterance['video_name']}"
    for utterance in example["conversation"]
]
language = [utterance["text"] for utterance in example["conversation"]]

video_inputs = to_device(modality_transform["video"](video), device)
language_inputs = to_device(
    tokenizer(
        language,
        max_length=77,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ),
    device,
)
inputs = {"video": video_inputs, "language": language_inputs}

# Inference only: no gradients, mixed precision on the GPU.
with torch.no_grad(), torch.autocast(device_type="cuda"):
    embeddings = model(inputs)

embeddings["language"].shape,

(torch.Size([7, 768]),)

In [4]:
embeddings["language"].device

device(type='cuda', index=0)

In [5]:
example

{'conversation': [{'emotion': 'sadness',
   'speaker': 'Monica',
   'text': 'Mr . Heckles .',
   'utterance_ID': 1,
   'video_name': 'dia187utt1.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Rachel',
   'text': 'How did this happen ?',
   'utterance_ID': 2,
   'video_name': 'dia187utt2.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr. Treeger',
   'text': 'He musta been sweeping . They found a broom in his hand .',
   'utterance_ID': 3,
   'video_name': 'dia187utt3.mp4'},
  {'emotion': 'sadness',
   'speaker': 'Monica',
   'text': 'That is terrible .',
   'utterance_ID': 4,
   'video_name': 'dia187utt4.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr. Treeger',
   'text': 'I know . I was sweeping yesterday . It coulda been me .',
   'utterance_ID': 5,
   'video_name': 'dia187utt5.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Ross',
   'text': 'Sure , you coulda . You never know .',
   'utterance_ID': 6,
   'video_name': 'dia187utt6.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr.

In [17]:
import torch

# Row indices of the special vectors stored in `special_embeddings`.
PAD = 0
CLS = 1
SEP = 2
BOS = 3

special_embeddings = torch.nn.Embedding(4, 768, padding_idx=0)
special_embeddings.cuda()


def special_vector(index):
    """Return the special embedding at `index` as a one-row tensor on the GPU."""
    return special_embeddings(torch.tensor([index], device="cuda"))


# Sequence layout: CLS, (video, text) for every utterance, SEP,
# (video, text) of the emotion utterance, SEP, (video, text) of the cause
# candidate, BOS.
new_vector = [special_vector(CLS)]

for i in range(embeddings["video"].shape[0]):
    new_vector.extend((embeddings["video"][i], embeddings["language"][i]))

new_vector.append(special_vector(SEP))
new_vector.extend(
    (
        embeddings["video"][example["emotion_pos"]],
        embeddings["language"][example["emotion_pos"]],
    )
)

new_vector.append(special_vector(SEP))
new_vector.extend(
    (
        embeddings["video"][example["cause_pos"]],
        embeddings["language"][example["cause_pos"]],
    )
)

new_vector.append(special_vector(BOS))

# Stack the rows into one matrix and add a leading batch axis.
new_vector = torch.vstack(new_vector).unsqueeze(0)
new_vector.shape
#

torch.Size([1, 22, 768])

In [8]:
new_vector.device

device(type='cuda', index=0)

In [9]:
from semeval.experiments.kosenko.language_bind.custom_bert import (
    BertForCauseAnswering,
)
from transformers import BertTokenizer, AutoConfig

# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
# Build the model from the `bert-base-uncased` configuration only; no
# `from_pretrained` weights are loaded for the model here.
bert_config = AutoConfig.from_pretrained("bert-base-uncased")
bert_model = BertForCauseAnswering._from_config(bert_config)
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [16]:
new_vector.shape

torch.Size([22, 768])

In [18]:
torch.cat([new_vector, new_vector], dim=0).shape

torch.Size([2, 22, 768])

In [20]:
bert_model.cuda()

# Duplicate the single example along the batch axis to exercise a batch of two.
batch_embeds = torch.cat([new_vector, new_vector], dim=0)
output = bert_model(inputs_embeds=batch_embeds)

output.start_logits.shape

torch.Size([2, 22])

In [None]:
import torch


class CausePredictor(torch.nn.Module):
    """Emotion-cause predictor: LanguageBind utterance embeddings fed to a BERT head.

    A dataset item is turned into a sequence of 768-d vectors laid out as::

        CLS, (video, text) for every utterance, SEP,
        (video, text) of the emotion utterance, SEP,
        (video, text) of the candidate cause utterance, BOS

    The four special vectors are rows of a learned ``special_embeddings`` table.

    NOTE(review): the closing special token is ``BOS`` (as in the prototype
    cell of this notebook) while the design notes describe an ``EOS`` token —
    confirm which one is intended.

    ``get_embeddings_elem`` relies on the notebook-level ``modality_transform``
    and ``tokenizer`` objects being defined before it is called.
    """

    def __init__(self, clip_type):
        """Create the BERT head, the LanguageBind encoder and the special tokens.

        Args:
            clip_type: modality -> LanguageBind variant mapping, passed through
                to ``LanguageBind``.
        """
        super().__init__()
        self.bert_model = BertForCauseAnswering._from_config(
            AutoConfig.from_pretrained("bert-base-uncased")
        )
        self.languagebind = LanguageBind(
            clip_type=clip_type,
            cache_dir="/code/cache_dir",
        )

        # Row indices inside `special_embeddings`.
        self.PAD = 0
        self.CLS = 1
        self.SEP = 2
        self.BOS = 3

        self.special_embeddings = torch.nn.Embedding(
            4,
            768,
            padding_idx=0,
        )

    def _module_device(self):
        """Return the device holding the special-embedding weights."""
        # nn.Embedding has no `.device` attribute; its weight tensor does.
        return self.special_embeddings.weight.device

    def _special(self, token_id):
        """Look up one special embedding as a one-row tensor."""
        return self.special_embeddings(
            torch.tensor([token_id], device=self._module_device())
        )

    def get_embeddings_elem(self, dataset_item):
        """Encode every utterance of ``dataset_item`` with LanguageBind.

        Args:
            dataset_item: dict with a ``"conversation"`` list (utterances carry
                ``"video_name"`` and ``"text"``) and a ``"video_base_path"``
                directory in which the clips are looked up.

        Returns:
            The LanguageBind output, indexed by ``"video"`` and ``"language"``,
            computed without gradients.
        """
        conversation = dataset_item["conversation"]
        base_path = dataset_item["video_base_path"]
        video = [f"{base_path}/{utterance['video_name']}" for utterance in conversation]
        language = [utterance["text"] for utterance in conversation]

        device = self._module_device()
        inputs = {
            "video": to_device(
                modality_transform["video"](video),
                device,
            ),
            "language": to_device(
                tokenizer(
                    language,
                    max_length=77,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                ),
                device,
            ),
        }

        # The encoder is used as a feature extractor here: no gradients.
        with torch.no_grad(), torch.autocast(device_type=device.type):
            embeddings = self.languagebind(inputs)

        return embeddings

    def preprocess_elem(self, dataset_item, embeddings=None):
        """Assemble the BERT input matrix for one dataset item.

        Args:
            dataset_item: dict with 0-based ``"emotion_pos"`` and
                ``"cause_pos"`` utterance indices (plus whatever
                ``get_embeddings_elem`` needs when ``embeddings`` is omitted).
            embeddings: optional precomputed mapping with ``"video"`` and
                ``"language"`` tensors holding one row per utterance. When
                ``None`` they are computed with ``get_embeddings_elem``.

        Returns:
            A 2-D tensor with ``2 * n_utterances + 8`` rows (no batch axis).
        """
        if embeddings is None:
            embeddings = self.get_embeddings_elem(dataset_item)

        video_emb = embeddings["video"]
        lang_emb = embeddings["language"]
        emotion_pos = dataset_item["emotion_pos"]
        cause_pos = dataset_item["cause_pos"]

        new_vector = [self._special(self.CLS)]

        # Whole conversation, video and text vectors interleaved.
        for i in range(video_emb.shape[0]):
            new_vector.append(video_emb[i])
            new_vector.append(lang_emb[i])

        new_vector.append(self._special(self.SEP))
        new_vector.append(video_emb[emotion_pos])
        new_vector.append(lang_emb[emotion_pos])

        new_vector.append(self._special(self.SEP))
        new_vector.append(video_emb[cause_pos])
        new_vector.append(lang_emb[cause_pos])

        new_vector.append(self._special(self.BOS))

        return torch.vstack(new_vector)

    def tokenize(self, batch):
        """Not implemented yet; currently returns ``None``. TODO: batch/pad items."""
        pass

    def forward(self, batch):
        """Not implemented yet; currently returns ``None``. TODO: run the BERT head."""
        pass

In [27]:
from torch.utils.data import Dataset


class BERTCauseConversationsDataset(Dataset):
    """Torch dataset over flattened emotion-cause examples.

    Each item is the stored example dict extended with a ``"video_base_path"``
    key telling consumers where the utterance clips live.

    Args:
        conversations: indexable collection of example dicts.
        base_video_path: directory added to every returned item.
    """

    def __init__(
        self,
        conversations,
        base_video_path="/code/SemEval-2024_Task3/training_data/train",
    ):
        self.conversations = conversations
        self.base_video_path = base_video_path

    def __len__(self):
        """Number of stored examples."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Return a copy of example ``idx`` with ``"video_base_path"`` added."""
        # Shallow-copy so that reading an item never mutates the underlying
        # example list (which may be shared with other datasets or cells).
        turn = dict(self.conversations[idx])
        turn["video_base_path"] = self.base_video_path
        return turn


# Wrap the flattened training examples in the torch dataset class.
train_dataset = BERTCauseConversationsDataset(conversations=dataset_train)

# Display the first item as a sanity check.
train_dataset[0]

{'conversation': [{'emotion': 'sadness',
   'speaker': 'Monica',
   'text': 'Mr . Heckles .',
   'utterance_ID': 1,
   'video_name': 'dia187utt1.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Rachel',
   'text': 'How did this happen ?',
   'utterance_ID': 2,
   'video_name': 'dia187utt2.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr. Treeger',
   'text': 'He musta been sweeping . They found a broom in his hand .',
   'utterance_ID': 3,
   'video_name': 'dia187utt3.mp4'},
  {'emotion': 'sadness',
   'speaker': 'Monica',
   'text': 'That is terrible .',
   'utterance_ID': 4,
   'video_name': 'dia187utt4.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr. Treeger',
   'text': 'I know . I was sweeping yesterday . It coulda been me .',
   'utterance_ID': 5,
   'video_name': 'dia187utt5.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Ross',
   'text': 'Sure , you coulda . You never know .',
   'utterance_ID': 6,
   'video_name': 'dia187utt6.mp4'},
  {'emotion': 'neutral',
   'speaker': 'Mr.