## Sandbox with model

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch


from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [2]:
device = "cuda:0"
device = torch.device(device)
clip_type = {
    "video": "LanguageBind_Video_FT",  # also LanguageBind_Video
    # "audio": "LanguageBind_Audio_FT",  # also LanguageBind_Audio
    # "thermal": "LanguageBind_Thermal",
    # "image": "LanguageBind_Image",
    # "depth": "LanguageBind_Depth",
}

languagebind_model = LanguageBind(clip_type=clip_type, cache_dir="/code/cache_dir")
languagebind_model = languagebind_model.to(device)
# model.eval()
pretrained_ckpt = f"LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt, cache_dir="/code/cache_dir/tokenizer_cache_dir"
)
modality_transform = {
    c: transform_dict[c](languagebind_model.modality_config[c])
    for c in clip_type.keys()
}

### example of classifier

In [3]:
video = [
    "semeval/experiments/kosenko/language_bind/LanguageBind/assets/video/0.mp4",
    "semeval/experiments/kosenko/language_bind/LanguageBind/assets/video/0.mp4",
]
language = [
    "Two pandas are eating bamboo.",
    "Two pandas are eating bamboo.",
]

inputs = {
    "video": to_device(modality_transform["video"](video), device),
}
inputs["language"] = to_device(
    tokenizer(
        language,
        max_length=77,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ),
    device,
)


class VideoTextClassif(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = languagebind_model
        self.linear = torch.nn.Linear(768 * 2, 2, bias=False)

    def forward(self, x):
        result = self.model(x)
        # print(result)
        features = torch.cat(
            [
                result["video"],
                result["language"],
            ],
            dim=-1,
        )
        result = self.linear(features)
        return result


text_video_classif = VideoTextClassif()
text_video_classif.to(device)
output = text_video_classif(inputs)
loss_func = torch.nn.CrossEntropyLoss()
loss = loss_func(output, torch.tensor([1, 0], device=device))
print(loss)
loss.backward()

tensor(0.7536, device='cuda:0', grad_fn=<NllLossBackward0>)


### Эксперимент 1

Классификатор на основе текста и видео. На вход подается независимая реплика диалога и соответствующее видео к нему. Никакой следующий контекст диалога не используется.

На основе этого нужно предсказать эмоцию, то есть класс.

In [1]:
import numpy as np
import random


def random_seed(seed=42, rank=0):
    torch.manual_seed(seed + rank)
    np.random.seed(seed + rank)
    random.seed(seed + rank)


from datasets import load_dataset
from torchvision.io import read_video
import json
import torch
import os
from torch.utils.data import Dataset, DataLoader
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


dataset_path = "./SemEval-2024_Task3/training_data/Subtask_2_train.json"


dataset = json.loads(open(dataset_path).read())
print(len(dataset))


# dataset[0]


all_conversations = []


for item in dataset:
    all_conversations.extend(item["conversation"])
print(len(all_conversations))


# all_emotions = set([])


# for item in all_conversations:


#     all_emotions.update([item["emotion"]])
# for item in all_conversations:
#     print(item['video_name'])


# print(all_emotions)


all_emotions = [
    "surprise",
    "fear",
    "sadness",
    "neutral",
    "joy",
    "anger",
    "disgust",
]


emotions2labels = {em: i for i, em in enumerate(all_emotions)}


labels2emotions = {i: em for i, em in enumerate(all_emotions)}


print(emotions2labels)


print(labels2emotions)


training_data, test_data = train_test_split(all_conversations, test_size=0.04)


class ConversationsDataset(Dataset):
    def __init__(
        self,
        conversations,
        base_video_path="/code/SemEval-2024_Task3/training_data/train",
    ):
        self.conversations = conversations

        self.base_video_path = base_video_path

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        turn = self.conversations[idx]

        video_path = turn["video_name"]

        turn["video_name"] = f"{self.base_video_path}/{video_path}"

        turn["label"] = emotions2labels[turn["emotion"]]

        return turn


training_data = ConversationsDataset(conversations=training_data)
test_data = ConversationsDataset(conversations=test_data)


train_dataloader = DataLoader(training_data, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=2, shuffle=False)


next(iter(train_dataloader))

  from .autonotebook import tqdm as notebook_tqdm


1374
13619
{'surprise': 0, 'fear': 1, 'sadness': 2, 'neutral': 3, 'joy': 4, 'anger': 5, 'disgust': 6}
{0: 'surprise', 1: 'fear', 2: 'sadness', 3: 'neutral', 4: 'joy', 5: 'anger', 6: 'disgust'}


{'utterance_ID': tensor([ 2, 13]),
 'text': ['Oh , I can not tell you how great it was to look at the crowd and see your face !',
  'Hey .'],
 'speaker': ['Ross', 'Rachel'],
 'emotion': ['joy', 'neutral'],
 'video_name': ['/code/SemEval-2024_Task3/training_data/train/dia1368utt2.mp4',
  '/code/SemEval-2024_Task3/training_data/train/dia147utt13.mp4'],
 'label': tensor([4, 3])}

In [8]:
len(all_conversations), len(training_data), len(test_data)

(13619, 13074, 545)

In [33]:
list(next(iter(train_dataloader)).keys())

['utterance_ID', 'text', 'speaker', 'emotion', 'video_name', 'label']

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from transformers.modeling_outputs import TokenClassifierOutput

from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
    LanguageBind,
    to_device,
    transform_dict,
    LanguageBindImageTokenizer,
)


class VideoTextClassif(torch.nn.Module):
    def __init__(self, labels=2, clip_type=None):
        super().__init__()
        self.model = LanguageBind(
            clip_type=clip_type,
            cache_dir="/code/cache_dir",
        )
        self.linear = torch.nn.Linear(
            768 * 2,
            labels,
            bias=False,
        )

    def forward(self, x):
        result = self.model(x)
        # print(result)
        features = torch.cat(
            [
                result["video"],
                result["language"],
            ],
            dim=-1,
        )
        result = self.linear(features)
        return result


device = "cuda:0"
device = torch.device(device)
clip_type = {
    "video": "LanguageBind_Video_FT",
}
text_video_classif = VideoTextClassif(
    labels=len(all_emotions),
    clip_type=clip_type,
)
text_video_classif = text_video_classif.to(device)
pretrained_ckpt = f"LanguageBind/LanguageBind_Image"
tokenizer = LanguageBindImageTokenizer.from_pretrained(
    pretrained_ckpt, cache_dir="/code/cache_dir/tokenizer_cache_dir"
)
modality_transform = {
    c: transform_dict[c](text_video_classif.model.modality_config[c])
    for c in clip_type.keys()
}

  torchaudio.set_audio_backend("soundfile")


In [3]:
text_video_classif.eval()
batch = next(iter(train_dataloader))
inputs = {
    "video": to_device(
        modality_transform["video"](batch["video_name"]),
        device,
    ),
    "language": to_device(
        tokenizer(
            batch["text"],
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ),
        device,
    ),
}

result = text_video_classif(inputs)
predicted_labels = result.argmax(-1).cpu().numpy()
test_f1_score = f1_score(
    batch["label"].numpy(),
    predicted_labels,
    average="macro",
)
print(test_f1_score)

0.0


In [4]:
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    text_video_classif.parameters(),
    lr=0.00001,
)
loss = loss_func(
    torch.tensor(
        [
            [0.45, 0.23],
            [0.55, 0.33],
        ],
        device=device,
    ),
    torch.tensor([1, 0], device=device),
)
loss

tensor(0.6992, device='cuda:0')

#### super simple train loop

In [5]:
epochs = 1

max_train_steps = 100

for epoch in range(epochs):
    for num_step, batch in tqdm.tqdm(enumerate(train_dataloader)):
        optimizer.zero_grad()
        # print(batch)
        inputs = {
            "video": to_device(
                modality_transform["video"](batch["video_name"]), device
            ),
            "language": to_device(
                tokenizer(
                    batch["text"],
                    max_length=77,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                ),
                device,
            ),
        }

        result = text_video_classif(inputs)
        label = batch["label"].to(device)
        loss = loss_func(result, label)
        print(num_step, loss.item())
        loss.backward()
        optimizer.step()

        if num_step > max_train_steps:
            break

    # for
    #     # break

0it [00:00, ?it/s]

0 2.4116463661193848


1it [00:01,  1.19s/it]

1 2.118516206741333


2it [00:02,  1.01it/s]

2 1.8778287172317505


3it [00:02,  1.08it/s]

3 1.1899290084838867


4it [00:03,  1.14it/s]

4 2.954799175262451


5it [00:04,  1.20it/s]

5 0.45052316784858704


6it [00:05,  1.24it/s]

6 0.4964609742164612


7it [00:05,  1.26it/s]

7 2.371068000793457


8it [00:06,  1.28it/s]

8 0.45426976680755615


9it [00:07,  1.26it/s]

9 1.9046975374221802


10it [00:08,  1.28it/s]

10 0.7073445916175842


11it [00:09,  1.29it/s]

11 1.973649263381958


12it [00:09,  1.27it/s]

12 0.9785594344139099


13it [00:10,  1.27it/s]

13 2.038552761077881


14it [00:11,  1.29it/s]

14 1.4154369831085205


15it [00:12,  1.24it/s]

15 3.287311315536499


16it [00:13,  1.19it/s]

16 3.294203042984009


17it [00:13,  1.23it/s]

17 3.5777933597564697


18it [00:14,  1.25it/s]

18 0.9182525873184204


19it [00:15,  1.26it/s]

19 1.3086097240447998


20it [00:16,  1.28it/s]

20 2.188847541809082


21it [00:17,  1.29it/s]

21 2.370687484741211


22it [00:17,  1.30it/s]

22 2.256296396255493


23it [00:18,  1.31it/s]

23 1.4581425189971924


24it [00:19,  1.31it/s]

24 1.0644042491912842


25it [00:20,  1.31it/s]

25 1.9248542785644531


26it [00:20,  1.32it/s]

26 1.8852068185806274


27it [00:21,  1.32it/s]

27 1.6872577667236328


28it [00:22,  1.29it/s]

28 0.965337872505188


29it [00:23,  1.30it/s]

29 1.7070598602294922


30it [00:23,  1.30it/s]

30 3.010525941848755


31it [00:24,  1.31it/s]

31 2.3659281730651855


32it [00:25,  1.31it/s]

32 0.45751649141311646


33it [00:26,  1.32it/s]

33 0.5569820404052734


34it [00:26,  1.32it/s]

34 0.28989309072494507


35it [00:27,  1.31it/s]

35 2.116980791091919


36it [00:28,  1.28it/s]

36 2.9667627811431885


37it [00:29,  1.26it/s]

37 2.1935653686523438


38it [00:30,  1.15it/s]

38 3.322813034057617


39it [00:31,  1.19it/s]

39 5.079277992248535


40it [00:31,  1.22it/s]

40 1.2822527885437012


41it [00:32,  1.25it/s]

41 0.32359644770622253


42it [00:33,  1.27it/s]

42 3.404543399810791


43it [00:34,  1.28it/s]

43 2.111872673034668


44it [00:35,  1.16it/s]

44 1.7865755558013916


45it [00:36,  1.20it/s]

45 1.4486266374588013


46it [00:36,  1.19it/s]

46 1.4201889038085938


47it [00:37,  1.22it/s]

47 1.8882529735565186


48it [00:38,  1.25it/s]

48 1.2243683338165283


49it [00:39,  1.25it/s]

49 1.8630074262619019


50it [00:39,  1.27it/s]

50 2.4275169372558594


51it [00:40,  1.28it/s]

51 2.320770740509033


52it [00:41,  1.29it/s]

52 1.6899826526641846


53it [00:42,  1.30it/s]

53 2.2412166595458984


54it [00:43,  1.30it/s]

54 2.807105541229248


55it [00:43,  1.30it/s]

55 2.1958789825439453


56it [00:44,  1.21it/s]

56 2.089587688446045


57it [00:45,  1.24it/s]

57 2.406179189682007


58it [00:46,  1.26it/s]

58 1.3997547626495361


59it [00:47,  1.28it/s]

59 1.6681780815124512


60it [00:47,  1.29it/s]

60 1.758849859237671


61it [00:48,  1.30it/s]

61 2.2648091316223145


62it [00:49,  1.31it/s]

62 1.2803257703781128


63it [00:50,  1.31it/s]

63 1.9710869789123535


64it [00:50,  1.31it/s]

64 3.0484325885772705


65it [00:51,  1.32it/s]

65 2.3213601112365723


66it [00:52,  1.32it/s]

66 2.213437080383301


67it [00:53,  1.28it/s]

67 2.856510639190674


68it [00:53,  1.28it/s]

68 1.6671693325042725


69it [00:54,  1.30it/s]

69 2.405856132507324


70it [00:55,  1.30it/s]

70 3.700026512145996


71it [00:56,  1.31it/s]

71 1.2051950693130493


72it [00:56,  1.30it/s]

72 1.890251636505127


73it [00:57,  1.30it/s]

73 1.8185186386108398


74it [00:58,  1.30it/s]

74 1.4375560283660889


75it [00:59,  1.30it/s]

75 2.1991066932678223


76it [01:00,  1.31it/s]

76 1.668765664100647


77it [01:00,  1.31it/s]

77 1.2664744853973389


78it [01:01,  1.32it/s]

78 1.3477237224578857


79it [01:02,  1.31it/s]

79 1.5215569734573364


80it [01:03,  1.32it/s]

80 2.007652521133423


81it [01:04,  1.22it/s]

81 2.2828333377838135


82it [01:05,  1.13it/s]

82 0.8409034013748169


83it [01:05,  1.19it/s]

83 1.5698870420455933


84it [01:06,  1.21it/s]

84 3.00311279296875


85it [01:07,  1.20it/s]

85 1.9673261642456055


86it [01:08,  1.24it/s]

86 0.7008718252182007


87it [01:08,  1.26it/s]

87 2.5872793197631836


88it [01:09,  1.23it/s]

88 3.4001688957214355


89it [01:10,  1.26it/s]

89 3.4822511672973633


90it [01:11,  1.28it/s]

90 3.230837345123291


91it [01:12,  1.29it/s]

91 1.3050684928894043


92it [01:12,  1.29it/s]

92 1.7404236793518066


93it [01:13,  1.24it/s]

93 2.41038179397583


94it [01:14,  1.26it/s]

94 1.0576368570327759


95it [01:15,  1.28it/s]

95 1.3588182926177979


96it [01:16,  1.29it/s]

96 1.4618091583251953


97it [01:16,  1.30it/s]

97 1.3760355710983276


98it [01:17,  1.31it/s]

98 1.1598066091537476


99it [01:18,  1.31it/s]

99 1.3700892925262451


100it [01:19,  1.31it/s]

100 2.2770862579345703


101it [01:19,  1.31it/s]

101 1.2192869186401367


101it [01:20,  1.25it/s]


### huggingface trainer (падает по памяти)

In [12]:
# import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# import torch
# from transformers.modeling_outputs import TokenClassifierOutput

# from semeval.experiments.kosenko.language_bind.LanguageBind.languagebind import (
#     LanguageBind,
#     to_device,
#     transform_dict,
#     LanguageBindImageTokenizer,
# )
# from typing import Dict, List, Optional
# from torch import nn
# from torch.utils.data import Dataset
# from transformers import Trainer
# from transformers import TrainingArguments
# import datasets

# import numpy as np
# import random


# def random_seed(seed=42, rank=0):
#     torch.manual_seed(seed + rank)
#     np.random.seed(seed + rank)
#     random.seed(seed + rank)


# from datasets import load_dataset
# from torchvision.io import read_video
# import json
# import torch
# import os
# from torch.utils.data import Dataset, DataLoader
# import tqdm
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score


# class CustomTrainer(Trainer):
#     def compute_loss(
#         self,
#         model,
#         inputs,
#         return_outputs=False,
#     ):
#         video_paths = [
#             f"{base_path}/{video_path}"
#             for base_path, video_path in zip(
#                 inputs["video_base_path"], inputs["video_name"]
#             )
#         ]
#         custom_inputs = {
#             "video": to_device(
#                 modality_transform["video"](video_paths),
#                 device,
#             ),
#             "language": to_device(
#                 tokenizer(
#                     inputs["text"],
#                     max_length=77,
#                     padding="max_length",
#                     truncation=True,
#                     return_tensors="pt",
#                 ),
#                 device,
#             ),
#         }

#         # forward pass
#         outputs = model(custom_inputs)
#         label = inputs["label"].to(device)
#         loss_func = torch.nn.CrossEntropyLoss()
#         loss = loss_func(outputs, label)
#         torch.cuda.empty_cache()
#         return (loss, outputs) if return_outputs else loss

#     def get_train_dataloader(self):
#         train_dataset = self.train_dataset
#         return DataLoader(
#             train_dataset,
#             batch_size=self.args.per_device_train_batch_size,
#             shuffle=True,
#         )

#     def get_eval_dataloader(self, eval_dataset):
#         return DataLoader(
#             self.eval_dataset,
#             batch_size=self.args.per_device_eval_batch_size,
#             shuffle=False,
#         )

#     def prediction_step(
#         self,
#         model,
#         inputs,
#         prediction_loss_only,
#         ignore_keys=None,
#     ):
#         # print(inputs)
#         video_paths = [
#             f"{base_path}/{video_path}"
#             for base_path, video_path in zip(
#                 inputs["video_base_path"], inputs["video_name"]
#             )
#         ]
#         custom_inputs = {
#             "video": to_device(
#                 modality_transform["video"](video_paths),
#                 device,
#             ),
#             "language": to_device(
#                 tokenizer(
#                     inputs["text"],
#                     max_length=77,
#                     padding="max_length",
#                     truncation=True,
#                     return_tensors="pt",
#                 ),
#                 device,
#             ),
#         }

#         # forward pass
#         outputs = model(custom_inputs)
#         label = inputs["label"].to(device)
#         loss_func = torch.nn.CrossEntropyLoss()
#         loss = loss_func(outputs, label)
#         torch.cuda.empty_cache()
#         if prediction_loss_only:
#             return (loss, None, None)

#         return (loss, outputs, label)


# def compute_metrics(eval_preds):
#     # metric = evaluate.load("glue", "mrpc")
#     print(eval_preds)
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     # return metric.compute(predictions=predictions, references=labels)
#     return 1.0


# class VideoTextClassif(torch.nn.Module):
#     def __init__(self, labels=2, clip_type=None):
#         super().__init__()
#         self.model = LanguageBind(
#             clip_type=clip_type,
#             cache_dir="/code/cache_dir",
#         )
#         self.linear = torch.nn.Linear(
#             768 * 2,
#             labels,
#             bias=False,
#         )

#     def forward(self, x):
#         result = self.model(x)
#         # print(result)
#         features = torch.cat(
#             [
#                 result["video"],
#                 result["language"],
#             ],
#             dim=-1,
#         )
#         result = self.linear(features)
#         return result


# class ConversationsDataset(Dataset):
#     def __init__(
#         self,
#         conversations,
#         base_video_path="/code/SemEval-2024_Task3/training_data/train",
#     ):
#         self.conversations = conversations

#         self.base_video_path = base_video_path

#     def __len__(self):
#         return len(self.conversations)

#     def __getitem__(self, idx):
#         turn = self.conversations[idx]

#         turn["video_name"] = turn["video_name"]
#         turn["video_base_path"] = self.base_video_path
#         # print(video_path)
#         turn["label"] = emotions2labels[turn["emotion"]]

#         return turn


# if __name__ == "__main__":
#     dataset_path = "./SemEval-2024_Task3/training_data/Subtask_2_train.json"

#     dataset = json.loads(open(dataset_path).read())
#     print(len(dataset))

#     # dataset[0]

#     all_conversations = []

#     for item in dataset:
#         all_conversations.extend(item["conversation"])
#     print(len(all_conversations))

#     all_emotions = [
#         "surprise",
#         "fear",
#         "sadness",
#         "neutral",
#         "joy",
#         "anger",
#         "disgust",
#     ]

#     emotions2labels = {em: i for i, em in enumerate(all_emotions)}

#     labels2emotions = {i: em for i, em in enumerate(all_emotions)}

#     print(emotions2labels)

#     print(labels2emotions)

#     training_data_list, test_data_list = train_test_split(
#         all_conversations, test_size=0.04
#     )
#     training_data_list = training_data_list[:10]
#     test_data_list = test_data_list[:10]
#     training_data = ConversationsDataset(conversations=training_data_list)
#     test_data = ConversationsDataset(conversations=test_data_list)

#     # train_dataloader = DataLoader(training_data, batch_size=2, shuffle=True)
#     # test_dataloader = DataLoader(test_data, batch_size=2, shuffle=False)

#     # next(iter(train_dataloader))
#     device = "cuda:0"
#     device = torch.device(device)
#     clip_type = {
#         "video": "LanguageBind_Video_FT",
#     }
#     text_video_classif = VideoTextClassif(
#         labels=len(all_emotions),
#         clip_type=clip_type,
#     )
#     text_video_classif = text_video_classif.to(device)
#     # text_video_classif.half()
#     pretrained_ckpt = f"LanguageBind/LanguageBind_Image"
#     tokenizer = LanguageBindImageTokenizer.from_pretrained(
#         pretrained_ckpt, cache_dir="/code/cache_dir/tokenizer_cache_dir"
#     )
#     modality_transform = {
#         c: transform_dict[c](text_video_classif.model.modality_config[c])
#         for c in clip_type.keys()
#     }

#     training_args = TrainingArguments(
#         output_dir="semeval/experiments/kosenko/language_bind/train_results/",
#         evaluation_strategy="epoch",
#         eval_steps=1,
#         report_to="none",
#         per_device_train_batch_size=1,
#         per_device_eval_batch_size=1,
#         gradient_accumulation_steps=4,
#         fp16=True,
#         remove_unused_columns=False,
#         label_names=[
#             "utterance_ID",
#             "text",
#             "speaker",
#             "emotion",
#             "video_name",
#             "label",
#         ],
#     )

#     # hf_training_data = datasets.Dataset.from_list([item for item in training_data])
#     # hf_test_data = datasets.Dataset.from_list([item for item in test_data])

#     trainer = CustomTrainer(
#         model=text_video_classif,
#         args=training_args,
#         # train_dataset=hf_training_data,
#         # eval_dataset=hf_test_data,
#         train_dataset=training_data,
#         eval_dataset=test_data,
#         compute_metrics=compute_metrics,
#     )
#     trainer.train()

{'labels': tensor([5, 5], device='cuda:0'), 'utterance_ID': tensor([5, 5], device='cuda:0')}


KeyError: 'video_name'

In [9]:
hf_training_data.column_names

['utterance_ID', 'text', 'speaker', 'emotion', 'video_name', 'label']