## Sandbox with model

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch


from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [2]:
# All tensors and modules in this sandbox live on this device.
device = torch.device("cuda:0")

# Only the video modality is requested; the other options are kept for reference.
clip_type = {
    "video": "LanguageBind_Video_FT",  # also LanguageBind_Video
    # "audio": "LanguageBind_Audio_FT",  # also LanguageBind_Audio
    # "thermal": "LanguageBind_Thermal",
    # "image": "LanguageBind_Image",
    # "depth": "LanguageBind_Depth",
}

languagebind_model = LanguageBind(
    clip_type=clip_type,
    cache_dir="/code/cache_dir",
).to(device)

# The text side is tokenized with the tokenizer shipped under the image checkpoint name.
pretrained_ckpt = "LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt,
    cache_dir="/code/cache_dir/tokenizer_cache_dir",
)

# One preprocessing transform per enabled modality, built from that modality's config.
modality_transform = {}
for modality in clip_type:
    modality_transform[modality] = transform_dict[modality](
        languagebind_model.modality_config[modality]
    )

### Example of a classifier

In [3]:
# The same demo clip and caption are repeated twice, giving a batch of two samples.
demo_clip = "semeval/experiments/kosenko/language_bind/LanguageBind/assets/video/0.mp4"
demo_caption = "Two pandas are eating bamboo."
video = [demo_clip, demo_clip]
language = [demo_caption, demo_caption]

video_inputs = to_device(modality_transform["video"](video), device)
tokenized_language = tokenizer(
    language,
    max_length=77,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# The model consumes a dict keyed by modality name.
inputs = {
    "video": video_inputs,
    "language": to_device(tokenized_language, device),
}


class VideoTextClassif(torch.nn.Module):
    """Two-class head on top of the already-instantiated global LanguageBind model.

    The video and language outputs of the backbone are concatenated along the
    last dimension and passed through a single bias-free linear layer.
    """

    def __init__(self):
        super().__init__()
        # Reuses the module-level backbone instead of loading a second copy.
        self.model = languagebind_model
        # Input width 768 * 2: the layer takes the two concatenated embeddings.
        self.linear = torch.nn.Linear(768 * 2, 2, bias=False)

    def forward(self, x):
        """Return the linear layer's output for a modality-keyed input dict ``x``."""
        embeddings = self.model(x)
        joint = torch.cat(
            (embeddings["video"], embeddings["language"]),
            dim=-1,
        )
        return self.linear(joint)


# Smoke test: one forward and backward pass with arbitrary targets for the two demo samples.
text_video_classif = VideoTextClassif().to(device)
loss_func = torch.nn.CrossEntropyLoss()
dummy_targets = torch.tensor([1, 0], device=device)

output = text_video_classif(inputs)
loss = loss_func(output, dummy_targets)
print(loss)
loss.backward()

tensor(0.7536, device='cuda:0', grad_fn=<NllLossBackward0>)


### Эксперимент 1

Классификатор на основе текста и видео. На вход подаётся отдельная реплика диалога и соответствующее ей видео. Последующий контекст диалога не используется.

На основе этого нужно предсказать эмоцию, то есть класс.

In [1]:
import numpy as np
import random


def random_seed(seed=42, rank=0):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed + rank``."""
    effective_seed = seed + rank
    random.seed(effective_seed)
    np.random.seed(effective_seed)
    torch.manual_seed(effective_seed)


from datasets import load_dataset
import datasets
from torchvision.io import read_video
import json
import torch
import os
from torch.utils.data import Dataset, DataLoader
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


dataset_path = "./SemEval-2024_Task3/training_data/Subtask_2_train.json"

# Read the annotation file through a context manager so the handle is closed
# right after parsing, and decode it explicitly as UTF-8 rather than with the
# locale default.
with open(dataset_path, encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)
print(len(dataset))

# Flatten the dialogues: every entry of each item's "conversation" list
# becomes one independent sample.
all_conversations = []
for item in dataset:
    all_conversations.extend(item["conversation"])
print(len(all_conversations))

# The emotion inventory is written out by hand (instead of being collected
# into a set from the data), so the emotion -> id mapping has a fixed order.
all_emotions = [
    "surprise",
    "fear",
    "sadness",
    "neutral",
    "joy",
    "anger",
    "disgust",
]

emotions2labels = {em: i for i, em in enumerate(all_emotions)}
labels2emotions = {i: em for i, em in enumerate(all_emotions)}

print(emotions2labels)
print(labels2emotions)

# Seeded 92/8 split of the flattened samples into train and held-out parts.
all_data = datasets.Dataset.from_list(all_conversations)
all_data = all_data.train_test_split(
    test_size=0.08,
    seed=42,
)
training_data, test_data = all_data["train"], all_data["test"]


class ConversationsDataset(Dataset):
    """Map-style dataset over individual dialogue turns.

    Each returned item is a copy of the stored turn with two changes:
    ``video_name`` is prefixed with ``base_video_path`` and an integer
    ``label`` is added for the turn's ``emotion``.

    Args:
        conversations: Indexable collection of turn dicts; each turn must
            provide at least the ``video_name`` and ``emotion`` keys.
        base_video_path: Directory prepended to ``video_name``. When ``None``
            the name is returned unchanged.
        label_map: Optional emotion -> integer id mapping. When ``None`` the
            module-level ``emotions2labels`` mapping is used.
    """

    def __init__(
        self,
        conversations,
        base_video_path=None,
        label_map=None,
    ):
        self.conversations = conversations
        self.base_video_path = base_video_path
        self.label_map = label_map

    def __len__(self):
        """Number of turns in the dataset."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Return the turn at ``idx`` with a full video path and a ``label``.

        Raises:
            KeyError: If the turn's emotion is missing from the label mapping.
        """
        # Work on a shallow copy: when `conversations` is a plain list of
        # dicts, editing the stored row in place would prepend the base path
        # again on every repeated access (e.g. on the second epoch).
        turn = dict(self.conversations[idx])

        if self.base_video_path is not None:
            turn["video_name"] = f"{self.base_video_path}/{turn['video_name']}"

        label_map = emotions2labels if self.label_map is None else self.label_map
        turn["label"] = label_map[turn["emotion"]]

        return turn


# Both splits come from the same annotation file, so they share one video directory.
train_video_dir = "/code/SemEval-2024_Task3/training_data/train"

training_data = ConversationsDataset(
    conversations=training_data,
    base_video_path=train_video_dir,
)
test_data = ConversationsDataset(
    conversations=test_data,
    base_video_path=train_video_dir,
)

# Only the training loader shuffles; the held-out loader keeps a fixed order.
loader_kwargs = {"batch_size": 2}
train_dataloader = DataLoader(training_data, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_kwargs)

# Display the first held-out batch to inspect the collated structure.
next(iter(test_dataloader))
# all_data.push_to_hub(
#     "dim/SemEval_training_data_emotions",
#     token=open("./hf_token").read(),
# )

  from .autonotebook import tqdm as notebook_tqdm


1374
13619
{'surprise': 0, 'fear': 1, 'sadness': 2, 'neutral': 3, 'joy': 4, 'anger': 5, 'disgust': 6}
{0: 'surprise', 1: 'fear', 2: 'sadness', 3: 'neutral', 4: 'joy', 5: 'anger', 6: 'disgust'}


{'utterance_ID': tensor([ 1, 10]),
 'text': ['Th ... th ... that is all it is , a third nipple .', 'What ? !'],
 'speaker': ['Ross', 'Phoebe'],
 'emotion': ['neutral', 'surprise'],
 'video_name': ['/code/SemEval-2024_Task3/training_data/train/dia424utt1.mp4',
  '/code/SemEval-2024_Task3/training_data/train/dia1255utt10.mp4'],
 'label': tensor([3, 0])}

In [8]:
# Sizes of the full sample list and of the two splits, in that order.
tuple(len(split) for split in (all_conversations, training_data, test_data))

(13619, 13074, 545)

In [33]:
# Field names available in one collated training batch.
first_train_batch = next(iter(train_dataloader))
list(first_train_batch.keys())

['utterance_ID', 'text', 'speaker', 'emotion', 'video_name', 'label']

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from transformers.modeling_outputs import TokenClassifierOutput

from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)


class VideoTextClassif(torch.nn.Module):
    """LanguageBind backbone followed by a bias-free linear classification layer.

    Args:
        labels: Number of output classes of the linear layer.
        clip_type: Modality -> checkpoint-name mapping forwarded to ``LanguageBind``.
    """

    def __init__(self, labels=2, clip_type=None):
        super().__init__()
        self.model = LanguageBind(clip_type=clip_type, cache_dir="/code/cache_dir")
        # Input width 768 * 2: the layer takes the concatenated video and language outputs.
        self.linear = torch.nn.Linear(768 * 2, labels, bias=False)

    def forward(self, x):
        """Return the linear layer's output for a modality-keyed input dict ``x``."""
        embeddings = self.model(x)
        joint = torch.cat(
            (embeddings["video"], embeddings["language"]),
            dim=-1,
        )
        return self.linear(joint)


device = torch.device("cuda:0")

# Only the video modality is enabled for this experiment.
clip_type = {"video": "LanguageBind_Video_FT"}

# One output per emotion class.
text_video_classif = VideoTextClassif(
    labels=len(all_emotions),
    clip_type=clip_type,
).to(device)

pretrained_ckpt = "LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt,
    cache_dir="/code/cache_dir/tokenizer_cache_dir",
)

# One preprocessing transform per enabled modality, built from the backbone's config.
modality_transform = {}
for modality in clip_type:
    modality_transform[modality] = transform_dict[modality](
        text_video_classif.model.modality_config[modality]
    )

  torchaudio.set_audio_backend("soundfile")


In [3]:
# Sanity check of the untrained classifier on a single training batch.
# NOTE: this leaves the model in eval mode; call text_video_classif.train()
# before running any optimisation steps afterwards.
text_video_classif.eval()
batch = next(iter(train_dataloader))
inputs = {
    "video": to_device(
        modality_transform["video"](batch["video_name"]),
        device,
    ),
    "language": to_device(
        tokenizer(
            batch["text"],
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ),
        device,
    ),
}

# Nothing is back-propagated here, so skip autograd bookkeeping for the
# forward pass.
with torch.no_grad():
    result = text_video_classif(inputs)
predicted_labels = result.argmax(-1).cpu().numpy()
test_f1_score = f1_score(
    batch["label"].numpy(),
    predicted_labels,
    average="macro",
)
print(test_f1_score)

0.0


In [4]:
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(text_video_classif.parameters(), lr=1e-5)

# Quick check of the loss function on hand-written two-class logits.
toy_logits = torch.tensor(
    [
        [0.45, 0.23],
        [0.55, 0.33],
    ],
    device=device,
)
toy_targets = torch.tensor([1, 0], device=device)
loss = loss_func(toy_logits, toy_targets)
loss

tensor(0.6992, device='cuda:0')

#### super simple train loop

In [None]:
epochs = 1
max_train_steps = 100

# The sanity-check cell above calls text_video_classif.eval(); switch back to
# train mode so that any train-only layers (such as dropout, if the backbone
# has them) behave as intended while optimising.
text_video_classif.train()

for epoch in range(epochs):
    # Passing `total` lets tqdm show a real progress bar; enumerate() alone has no length.
    progress = tqdm.tqdm(
        enumerate(train_dataloader),
        total=min(max_train_steps, len(train_dataloader)),
    )
    for num_step, batch in progress:
        optimizer.zero_grad()
        inputs = {
            "video": to_device(
                modality_transform["video"](batch["video_name"]), device
            ),
            "language": to_device(
                tokenizer(
                    batch["text"],
                    max_length=77,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                ),
                device,
            ),
        }

        result = text_video_classif(inputs)
        label = batch["label"].to(device)
        loss = loss_func(result, label)
        print(num_step, loss.item())
        loss.backward()
        optimizer.step()

        # num_step is zero-based: stop after exactly max_train_steps updates
        # (the former `num_step > max_train_steps` check ran two extra steps).
        if num_step + 1 >= max_train_steps:
            break

    # for
    #     # break

### text_video_classif inference

In [1]:
from semeval.experiments.kosenko.language_bind.languagebind_classification import (
    VideoTextClassif,
)
from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)
import torch

# Emotion inventory; the list order defines the integer class ids.
all_emotions = [
    "surprise",
    "fear",
    "sadness",
    "neutral",
    "joy",
    "anger",
    "disgust",
]

emotions2labels = {em: i for i, em in enumerate(all_emotions)}
labels2emotions = {i: em for i, em in enumerate(all_emotions)}

device = torch.device("cuda:0")
clip_type = {
    "video": "LanguageBind_Video_FT",
}

# Two identically configured models: `text_video_classif` receives the
# fine-tuned checkpoint below, `text_video_classif_default` keeps the weights
# it was constructed with.
text_video_classif_default = VideoTextClassif(
    labels=len(all_emotions),
    clip_type=clip_type,
)
text_video_classif = VideoTextClassif(
    labels=len(all_emotions),
    clip_type=clip_type,
)
text_video_classif.to(device=device)
text_video_classif_default.to(device=device)

checkpoint_path = "./semeval/experiments/kosenko/language_bind/train_results/checkpoint-2040/pytorch_model.bin"
# map_location places the loaded tensors on `device` instead of the device
# they were saved from, which may not exist (or may be a different GPU index)
# in this process.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
state_dict = torch.load(checkpoint_path, map_location=device)
text_video_classif.load_state_dict(state_dict)

text_video_classif.eval()
text_video_classif_default.eval()

pretrained_ckpt = "LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt, cache_dir="/code/cache_dir/tokenizer_cache_dir"
)
# One preprocessing transform per enabled modality, built from the backbone's config.
modality_transform = {
    c: transform_dict[c](text_video_classif.model.modality_config[c])
    for c in clip_type.keys()
}

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [3]:
import numpy as np
import random


def random_seed(seed=42, rank=0):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed + rank``."""
    effective_seed = seed + rank
    random.seed(effective_seed)
    np.random.seed(effective_seed)
    torch.manual_seed(effective_seed)


# Fix all three generators before the data is split.
random_seed()

import json
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


dataset_path = "./SemEval-2024_Task3/training_data/Subtask_2_train.json"

# Read the annotation file through a context manager so the handle is closed
# right after parsing, and decode it explicitly as UTF-8 rather than with the
# locale default.
with open(dataset_path, encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)
print(len(dataset))

# Flatten the dialogues: every entry of each item's "conversation" list
# becomes one independent sample.
all_conversations = []
for item in dataset:
    all_conversations.extend(item["conversation"])
print(len(all_conversations))

# Seeded 96/4 split; `test_data` is a plain list of turn dicts here.
training_data, test_data = train_test_split(
    all_conversations,
    test_size=0.04,
    random_state=42,
)

1374
13619


In [15]:
# Directory holding the clips referenced by each turn's "video_name".
base_video_path = "/code/SemEval-2024_Task3/training_data/train"

# Run the fine-tuned classifier turn by turn and print prediction vs. annotation.
with torch.no_grad():
    for turn in test_data:
        clip_path = f"{base_video_path}/{turn['video_name']}"
        video_inputs = to_device(modality_transform["video"](clip_path), device)
        text_inputs = to_device(
            tokenizer(
                turn["text"],
                max_length=77,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ),
            device,
        )

        logits = text_video_classif(
            {"video": video_inputs, "language": text_inputs}
        )
        predicted_emotion = labels2emotions[logits.argmax(-1).item()]

        print(f"Text: {turn['text']}")
        print(f'Predicted: {predicted_emotion}\nOriginal: {turn["emotion"]}')
        print("===")

Text: Whoa ho .
Predicted: joy
Original: joy
===
Text: No , I do not .
Predicted: disgust
Original: disgust
===
Text: We are you just ten seconds later !
Predicted: anger
Original: anger
===
Text: And Ross gave me this great book
Predicted: neutral
Original: neutral
===
Text: The movie theatre , you used to come in all the time .
Predicted: neutral
Original: neutral
===
Text: Nope , nope , I would just ah , I would rather talk to you .
Predicted: neutral
Original: neutral
===
Text: No ... no , it is not okay !
Predicted: neutral
Original: sadness
===
Text: Okay sweetie , you can do it . Just open up and put it in your mouth .
Predicted: neutral
Original: neutral
===
Text: What happened to the Disgustingtons ?
Predicted: neutral
Original: neutral
===
Text: Hey !
Predicted: joy
Original: joy
===
Text: But we ... we did not have ... sex ... uh , did we ?
Predicted: fear
Original: fear
===
Text: What do you think you are gonna do , have sex with her right here on my couch ?
Predicted: surp

KeyboardInterrupt: 