In [2]:
pip install decord

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install einops

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install pysrt

Collecting pysrt
  Downloading pysrt-1.1.2.tar.gz (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pysrt
  Building wheel for pysrt (setup.py) ... [?25l[?25hdone
  Created wheel for pysrt: filename=pysrt-1.1.2-py3-none-any.whl size=13443 sha256=614e8989756229fcdf63c129a5d7299d959b90558a834226a96bfd8dde95c913
  Stored in directory: /root/.cache/pip/wheels/30/7f/e8/55de9a9b07302d9e7fe47c27910e3bea0c48536153e74bd7e6
Successfully built pysrt
Installing collected packages: pysrt
Successfully installed pysrt-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch
from torch import nn

import torch.nn.functional as F
import pysrt
import re
import pickle
import math

import cv2
import os
import numpy as np
import pandas as pd

from decord import VideoReader, cpu
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from torchvision import transforms
import torchaudio
import random
import torch.optim as optim

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score

from sklearn.metrics import classification_report

from einops import rearrange, repeat, reduce
from einops.layers.torch import Rearrange

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Load Dataset

In [None]:
def read_data(video_dir, text_dir, audio_dir, output_file):
    data = {}

    for main_subfolder in os.listdir(video_dir):
        video_main_path = os.path.join(video_dir, main_subfolder)
        text_main_path = os.path.join(text_dir, main_subfolder)
        audio_main_path = os.path.join(audio_dir, main_subfolder)

        # Kiểm tra nếu folder chính tồn tại ở cả video, text và audio
        if os.path.isdir(video_main_path) and os.path.isdir(text_main_path) and os.path.isdir(audio_main_path):
            # Duyệt qua các subfolder (subfolderA1, subfolderA2, ...)
            for subfolder in os.listdir(video_main_path):
                video_subfolder_path = os.path.join(video_main_path, subfolder)
                text_subfolder_path = os.path.join(text_main_path, subfolder)
                audio_subfolder_path = os.path.join(audio_main_path, subfolder)

                # Kiểm tra nếu subfolder tồn tại ở cả video, text và audio
                if os.path.isdir(video_subfolder_path) and os.path.isdir(text_subfolder_path) and os.path.isdir(audio_subfolder_path):
                    # Duyệt qua các file .mp4 trong subfolder video
                    for file in os.listdir(video_subfolder_path):
                        if file.endswith('.mp4'):
                            base_name = os.path.splitext(file)[0]  # Tên file không có đuôi
                            video_path = os.path.join(video_subfolder_path, file)
                            text_path = os.path.join(text_subfolder_path, f"{base_name}.srt")
                            audio_path = os.path.join(audio_subfolder_path, f"{base_name}.mp3")

                            # Kiểm tra các file .srt và .mp3 tương ứng
                            if os.path.exists(text_path) and os.path.exists(audio_path):
                                data[base_name] = {"video": video_path, "text": text_path, "audio": audio_path}
                            else:
                                print(f"Warning: Missing text or audio file for {file}")

    # Lưu dữ liệu bằng pickle
    with open(output_file, "wb") as f:
        pickle.dump(data, f)
    print(f"Data saved to {output_file}")

    return data

In [None]:
video_train = "/kaggle/input/full-dataset/clip_train" 
text_train = "/kaggle/input/full-dataset/dialogue_train"
audio_train = "/kaggle/input/full-dataset/audio_train"
train_data = "/kaggle/working/train_full_3f.pkl"

video_test = "/kaggle/input/full-dataset/clip_test"
text_test = "/kaggle/input/full-dataset/dialogue_test"
audio_test = "/kaggle/input/full-dataset/audio_test"
test_data = "/kaggle/working/test_full_3f.pkl"

# Đọc dữ liệu và lưu vào pickle
train_data = read_data(video_train, text_train, audio_train, train_data)
test_data = read_data(video_test, text_test, audio_test, test_data)

In [None]:
class pre_processing(Dataset):
    def __init__(self, data_pickle, label_folder, transform=None, target_size=(128, 128), num_frames=256, label_mapping=None):
        self.data_pickle = data_pickle
        self.label_folder = label_folder
        self.transform = transform
        self.target_size = target_size
        self.num_frames = num_frames

        # Load dữ liệu từ pickle
        self.data = self._load_pickle(data_pickle)

        # Load labels từ CSV
        self.labels = self._load_labels()

        # Lưu danh sách các keys (tên file) để truy cập dữ liệu
        self.keys = list(self.data.keys())
        
        # Default label mapping nếu không có
        self.label_mapping = label_mapping if label_mapping else {
            "asks": 0, "gives to": 1, "talks to": 2, "walks with": 3,
            "watches": 4, "yells at": 5, "no_interaction": 6
        }

    def _load_pickle(self, path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        return data

    def _load_labels(self):
        labels = {}

        # Duyệt qua tất cả thư mục con trong thư mục cha
        for root, dirs, files in os.walk(self.label_folder):
            for file in files:
                # Kiểm tra nếu file là .csv
                if file.endswith('.csv'):
                    label_path = os.path.join(root, file)
                    
                    # Đọc file csv
                    df = pd.read_csv(label_path)
                    for _, row in df.iterrows():
                        # Lưu nhãn tương ứng với clip
                        labels[row['Clip']] = row['Interaction']
        
        return labels

    def _read_video(self, video_path):
        cap = cv2.VideoCapture(video_path)
        frames = []

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = self._resize_frame(frame)  # Resize frame về kích thước cố định
            frames.append(frame)
        cap.release()

        # Lấy mẫu số lượng frame cố định
        frames = self._sample_frames(frames)

        # Chuyển đổi frames thành tensor
        frames_array = np.array(frames, dtype=np.float32)
        frames_array /= 255.0  # Chuẩn hóa giá trị về [0, 1]

        # Chuyển đổi từ (F, H, W, C) sang (C, F, H, W)
        video_tensor = torch.from_numpy(frames_array).permute(3, 0, 1, 2)  # (C, F, H, W)
        return video_tensor

    def _resize_frame(self, frame):
        return cv2.resize(frame, self.target_size, interpolation=cv2.INTER_AREA)

    def _sample_frames(self, frames):
        num_total_frames = len(frames)
    
        if num_total_frames >= self.num_frames:
            # Phân bổ đều các nhóm frame
            indices = np.linspace(0, num_total_frames - 1, self.num_frames, dtype=int)
            sampled_frames = [frames[idx] for idx in indices]
        else:
            # Nếu số frame ít hơn, lấy ngẫu nhiên các frame để đủ số lượng
            extra_frames = np.random.choice(num_total_frames, self.num_frames - num_total_frames, replace=True)
            sampled_frames = frames + [frames[idx] for idx in extra_frames]
        return sampled_frames

    def _clean_text(self, text):
        text = re.sub(r"<.*?>", "", text)  # Xóa thẻ HTML
        text = re.sub(r"\([^\)]+\)", "", text)  # Xóa văn bản trong dấu ()
        text = re.sub(r"\[[^\]]+\]", "", text)  # Xóa văn bản trong dấu []
        text = re.sub(r"\s+", " ", text).strip()  # Xóa khoảng trắng dư thừa
        return text

    def _read_srt(self, srt_path):
        subs = pysrt.open(srt_path, encoding='utf-8')
        dialogues = []
        for sub in subs:
            dialogue = self._clean_text(sub.text)
            dialogues.append(dialogue)
        return dialogues

    def pad_texts(self, texts, pad_token="<PAD>"):
        max_length = max(len(text) for text in texts)  # Độ dài lớn nhất
        padded_texts = []
        for text in texts:
            # Thêm padding token nếu câu ngắn hơn max_length
            padded = text + [pad_token] * (max_length - len(text))
            padded_texts.append(padded[:max_length])  # Cắt nếu vượt quá max_length
        return padded_texts


    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        key = self.keys[idx]
    
        video_path = self.data[key]["video"]
        text_path = self.data[key]["text"]
        audio_path = self.data[key]["audio"]
    
        video_tensor = self._read_video(video_path)
        
        text = self._read_srt(text_path) 
        text = self.pad_texts([text])[0]
    
        # Lấy nhãn tương ứng
        label = self.labels.get(key, None)
    
        label_idx = self.label_mapping.get(label, -1)  # Lấy chỉ số nhãn từ label_mapping
    
        # Áp dụng transform (nếu có)
        if self.transform:
            video_tensor = self.transform(video_tensor)
    
        return (video_tensor, label_idx, video_path), (text_path, text), audio_path

In [None]:
def custom_collate_fn(batch):
    """
    Hàm collate cho DataLoader để kết hợp các mẫu vào batch.
    """
    # Tách các phần tử từ batch
    video_tensors = []
    labels = []
    video_paths = []
    text_paths = []
    texts = []
    audio_paths = []
    
    for item in batch:
        (video_tensor, label, video_path), (text_path, text), audio_path = item
        video_tensors.append(video_tensor)
        labels.append(label)
        video_paths.append(video_path)
        text_paths.append(text_path)
        texts.append(text)
        audio_paths.append(audio_path)

    # Kết hợp video tensor thành batch
    video_tensors = torch.stack(video_tensors)  # (batch_size, C, F, H, W)

    max_text_len = max(len(t) for t in texts)
    padded_texts = [
        t + ["<PAD>"] * (max_text_len - len(t)) for t in texts
    ]

    # Trả về dictionary
    return {
        "videos": video_tensors,
        "labels": labels,
        "video_paths": video_paths,
        "text_paths": text_paths,
        "texts": padded_texts,
        "audio_paths": audio_paths,
    }

In [None]:
# Tạo dataset từ pickle
train_dataset = pre_processing(
    data_pickle='/kaggle/input/checkpoint/full_v_t_a/train_full_data.pkl',
    label_folder='/kaggle/input/full-dataset/train_labels',
    transform=None
)

test_dataset = pre_processing(
    data_pickle='/kaggle/input/checkpoint/full_v_t_a/test_full_data.pkl',
    label_folder='/kaggle/input/full-dataset/test_labels',
    transform=None
)

In [None]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    collate_fn=custom_collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=custom_collate_fn
)

In [None]:
dataset_size = len(train_loader)
print(dataset_size)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def count_labels(label_folder):
    labels = []

    for root, dirs, files in os.walk(label_folder):
        for file in files:
            # Check if the file is a .csv
            if file.endswith('.csv'):
                label_path = os.path.join(root, file)
                
                # Read the CSV file
                df = pd.read_csv(label_path)
                labels.extend(df['Interaction'].tolist())
    
    return labels

# Specify the label folder
label_folder = '/kaggle/input/full-dataset/train_labels'
all_labels = count_labels(label_folder)
label_counts = Counter(all_labels)

# Define the fixed order of the classes (e.g., sorted alphabetically)
fixed_order = ["asks", "gives to", "talks to", "walks with", "watches", "yells at", "no_interaction"]

# Sort the label_counts keys based on the fixed order
sorted_labels = sorted(label_counts.keys(), key=lambda x: fixed_order.index(x) if x in fixed_order else len(fixed_order))

# Sort the counts based on the sorted labels
sorted_counts = [label_counts[label] for label in sorted_labels]

# Plot the bar chart
plt.figure(figsize=(8, 6))
bars = plt.bar(sorted_labels, sorted_counts, color='darkcyan')
plt.xlabel('Classes')
plt.ylabel('Number of Samples')
plt.title('Distribution of Samples in TrainSet')
plt.xticks(rotation=45)

# Add label on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 1, int(yval), ha='center', va='bottom')

plt.tight_layout()
plt.show()

## ViViT

In [None]:
# helpers
def exists(val):
    return val is not None

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

In [None]:
class FeedForward(nn.Module):
    
    def __init__(self, dim, hidden_dim, dropout = 0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)

In [None]:
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.3):
        super().__init__()
        inner_dim = dim_head *  heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.norm = nn.LayerNorm(dim)
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        x = self.norm(x)
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

In [None]:
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.3):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
                FeedForward(dim, mlp_dim, dropout = dropout)]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)

In [None]:
class FactorizedTransformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.2):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))

    def forward(self, x):
        b, f, n, _ = x.shape
        for spatial_attn, temporal_attn, ff in self.layers:
            x = rearrange(x, 'b f n d -> (b f) n d')
            x = spatial_attn(x) + x
            x = rearrange(x, '(b f) n d -> (b n) f d', b=b, f=f)
            x = temporal_attn(x) + x
            x = ff(x) + x
            x = rearrange(x, '(b n) f d -> b f n d', b=b, n=n)

        return self.norm(x)

In [None]:
class TubeletEmbedding(nn.Module):
    def __init__(self, channels, tubelet_size, patch_size, dim):
        super().__init__()
        self.tubelet_size = tubelet_size
        self.patch_size = patch_size
        patch_dim = channels * tubelet_size * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            # Điều chỉnh Rearrange để có đủ số chiều
            Rearrange('b c (f tf) (h ph) (w pw) -> b f (h w) (tf ph pw c)',
                      tf=tubelet_size, ph=patch_size, pw=patch_size),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim)
        )

    def forward(self, video):
        return self.to_patch_embedding(video)

In [None]:
class Video_ViT(nn.Module):
    def __init__(
        self,
        *,
        image_size,
        image_patch_size,
        frames,
        frame_patch_size,
        num_classes,
        dim,
        spatial_depth,
        temporal_depth,
        heads,
        mlp_dim,
        pool = 'cls',
        channels = 3,
        dim_head = 64,
        dropout = 0.3,
        emb_dropout = 0.3,
        variant = 'factorized_self_attention',):
        
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(image_patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
        assert frames % frame_patch_size == 0, 'Frames must be divisible by frame patch size'
        assert variant in ('factorized_encoder', 'factorized_self_attention'), f'variant = {variant} is not implemented'

        num_image_patches = (image_height // patch_height) * (image_width // patch_width)
        num_frame_patches = (frames // frame_patch_size)

        #patch_dim = channels * patch_height * patch_width * frame_patch_size

        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.global_average_pool = pool == 'mean'
        
        #Tubelet or Uniform sampling
        
        self.to_patch_embedding = TubeletEmbedding(
            channels=channels,
            tubelet_size=frame_patch_size,
            patch_size=patch_height,
            dim=dim)

        self.pos_embedding = nn.Parameter(torch.randn(1, num_frame_patches, num_image_patches, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.spatial_cls_token = nn.Parameter(torch.randn(1, 1, dim)) if not self.global_average_pool else None

        if variant == 'factorized_encoder':
            self.temporal_cls_token = nn.Parameter(torch.randn(1, 1, dim)) if not self.global_average_pool else None
            self.spatial_transformer = Transformer(dim, spatial_depth, heads, dim_head, mlp_dim, dropout)
            self.temporal_transformer = Transformer(dim, temporal_depth, heads, dim_head, mlp_dim, dropout)
        elif variant == 'factorized_self_attention':
            assert spatial_depth == temporal_depth, 'Spatial and temporal depth must be the same for factorized self-attention'
            self.factorized_transformer = FactorizedTransformer(dim, spatial_depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        #self.mlp_head = nn.Linear(dim, num_classes)
        self.variant = variant
             
    def extract_features(self, video):
        x = self.to_patch_embedding(video)
        b, f, n, _ = x.shape
        x = x + self.pos_embedding[:, :f, :n]

        if exists(self.spatial_cls_token):
            spatial_cls_tokens = repeat(self.spatial_cls_token, '1 1 d -> b f 1 d', b=b, f=f)
            x = torch.cat((spatial_cls_tokens, x), dim=2)

        x = self.dropout(x)

        if self.variant == 'factorized_encoder':
            x = rearrange(x, 'b f n d -> (b f) n d')
            x = self.spatial_transformer(x)
            x = rearrange(x, '(b f) n d -> b f n d', b=b)
            x = x[:, :, 0] if not self.global_average_pool else reduce(x, 'b f n d -> b f d', 'mean')
            if exists(self.temporal_cls_token):
                temporal_cls_tokens = repeat(self.temporal_cls_token, '1 1 d-> b 1 d', b=b)
                x = torch.cat((temporal_cls_tokens, x), dim=1)
            x = self.temporal_transformer(x)
            x = x[:, 0] if not self.global_average_pool else reduce(x, 'b f d -> b d', 'mean')

        elif self.variant == 'factorized_self_attention':
            x = self.factorized_transformer(x)
            x = x[:, 0, 0] if not self.global_average_pool else reduce(x, 'b f n d -> b d', 'mean')

        return x

    def forward(self, video):
        x = self.extract_features(video)
        return x  # Trả về đặc trưng video mà không qua lớp phân loại

## Visual features

In [None]:
ViViT_model = Video_ViT(
    image_size = 128,          # image size
    frames = 256,               # number of frames
    image_patch_size = 16,     # image patch size
    frame_patch_size = 2,      # frame patch size
    num_classes = 7,
    dim = 1024,
    spatial_depth = 6,         # depth of the spatial transformer
    temporal_depth = 6,        # depth of the temporal transformer
    heads = 8,
    mlp_dim = 2048,
    variant = 'factorized_self_attention', # or 'factorized_encoder'
)

In [None]:
ViViT_model.to(device)

## Bert:

In [None]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces = True)  # Load tokenizer của BERT
bert_model = AutoModel.from_pretrained("bert-base-uncased")  # Load mô hình BERT

bert_model.eval()

In [None]:
def process_texts(texts, tokenizer, device, max_length=512):
    # Chuẩn hóa văn bản: loại bỏ ký tự đặc biệt
    texts = [re.sub(r'[^\w\s.,!?-]', '', str(text)) for text in texts]
    
    # Token hóa văn bản
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    
    return input_ids, attention_mask

## Audio feature

In [None]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
AST_model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

AST_model.eval()

In [None]:
def extract_audio_features(audio_paths):
    features = []
    for audio_path in audio_paths:
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert stereo to mono nếu cần

        # Resample về 16kHz nếu không đúng
        if sample_rate != 16000:
            resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resample_transform(waveform)

        # Chuyển đổi waveform thành input cho AST model
        inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
        
        # Trích xuất đặc trưng mà không cần thực hiện phân loại
        with torch.no_grad():
            outputs = AST_model.audio_spectrogram_transformer(**inputs)
            feature = outputs.last_hidden_state

        pooled_features = torch.mean(feature, dim=1)
        features.append(pooled_features.squeeze(0)) 
    return torch.stack(features)

## Extract feature

In [None]:
def extract_features(data_loader, visual_model, text_model, tokenizer, device, output_path):
    visual_features_list = []
    text_features_list = []
    audio_features_list = []
    
    labels_list = []
    
    video_paths_list = []
    text_paths_list = [] 
    audio_paths_list = [] 

    visual_model.to(device)
    text_model.to(device)
    
    for i, batch in enumerate(data_loader):
        videos = batch['videos'].float().to(device)
        texts = batch['texts']
        labels = batch['labels']
        
        video_paths = batch['video_paths']
        text_paths = batch['text_paths']
        audio_paths = batch['audio_paths']


        with torch.no_grad():
            visual_features = visual_model.extract_features(videos)

        
        input_ids, attention_mask = process_texts(texts, tokenizer, device)
        with torch.no_grad():
            outputs = text_model(input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Đặc trưng từ CLS token

        
        audio_embeddings = extract_audio_features(audio_paths)
    
        # Lưu các đặc trưng và labels vào danh sách
        visual_features_list.extend(visual_features.tolist())
        text_features_list.extend(cls_embeddings.tolist())
        audio_features_list.extend(audio_embeddings.tolist())
        labels_list.extend(labels)
        
        video_paths_list.extend(video_paths)
        text_paths_list.extend(text_paths)
        audio_paths_list.extend(audio_paths)

        # In ra thông báo mỗi 50 batch
        if (i + 1) % 50 == 0:
            print(f"Batch {i + 1}/{len(data_loader)} completed.")

    features_data = {
        'video_paths': video_paths_list,
        'visual_features': visual_features_list,
        'text_paths': text_paths_list,
        'text_features': text_features_list,
        'audio_paths': audio_paths_list,
        'audio_features': audio_features_list,
        'labels': labels_list,
    }

    with open(output_path, "wb") as f:
        pickle.dump(features_data, f)
    
    print(f"Features saved to {output_path}")

In [None]:
train_feats = '/kaggle/working/train_full_v_t_a.pkl'
extract_features(train_loader, ViViT_model, bert_model, tokenizer, device, train_feats)

In [None]:
test_feats = '/kaggle/working/test_full_v_t_a.pkl'
extract_features(test_loader, ViViT_model, bert_model, tokenizer, device, test_feats)

# Load features

In [106]:
def load_features(input_path):
    with open(input_path, "rb") as f:
        features_data = pickle.load(f)

    video_paths = features_data['video_paths']
    visual_features = torch.tensor(features_data['visual_features'])
    
    text_paths = features_data['text_paths']
    text_features = torch.tensor(features_data['text_features'])
    
    audio_paths = features_data['audio_paths']
    audio_features = torch.tensor(features_data['audio_features'])
    
    labels = torch.tensor(features_data['labels']).to(device)

    return video_paths, visual_features, text_paths, text_features, audio_paths, audio_features, labels

In [107]:
def group_features(video_paths, visual_features, text_paths, text_features, audio_paths, audio_features, labels):
    scene_feats = {}

    for i, video_path in enumerate(video_paths):
        # Extract scene name from video path
        scene_name = "-".join(os.path.basename(video_path).split("-")[:2])

        if scene_name not in scene_feats:
            scene_feats[scene_name] = {
                'visual_features': [],
                'text_features': [],
                'audio_features': [],
                
                'labels': [],
                'video_paths': [],
                'text_paths': [],
                'audio_paths': [],
            }

        scene_feats[scene_name]['visual_features'].append(visual_features[i].tolist())
        scene_feats[scene_name]['text_features'].append(text_features[i].tolist())
        scene_feats[scene_name]['audio_features'].append(audio_features[i].tolist())
        
        scene_feats[scene_name]['labels'].append(labels[i].tolist())
        
        scene_feats[scene_name]['video_paths'].append(video_paths[i])
        scene_feats[scene_name]['text_paths'].append(text_paths[i])
        scene_feats[scene_name]['audio_paths'].append(audio_paths[i])

    # Convert lists to tensors
    for scene_name in scene_feats:
        scene_feats[scene_name]['visual_features'] = torch.tensor(scene_feats[scene_name]['visual_features'])
        scene_feats[scene_name]['text_features'] = torch.tensor(scene_feats[scene_name]['text_features'])
        scene_feats[scene_name]['audio_features'] = torch.tensor(scene_feats[scene_name]['audio_features'])

        scene_feats[scene_name]['labels'] = torch.tensor(scene_feats[scene_name]['labels'])

    return scene_feats

In [108]:
features_path = "/kaggle/input/checkpoint/full_v_t_a/train_full_v_t_a.pkl"
video_paths, visual_features, text_paths, text_features, audio_paths, audio_features, labels = load_features(features_path)
scene_feats = group_features(video_paths, visual_features, text_paths, text_features, audio_paths, audio_features, labels)

with open("/kaggle/working/grouped_features.pkl", "wb") as f:
    pickle.dump(scene_feats, f)
print(f"Grouped features saved")

Grouped features saved


In [109]:
for scene_name, scene_data in scene_feats.items():
    scene_data['labels'] = F.one_hot(scene_data['labels'], num_classes=7)

In [110]:
def padding(features, max_length, pad_value=0):
    """
    Hàm padding cho tensor, hỗ trợ cả 1D và 2D tensor.
    """
    if len(features.shape) == 1:
        # Padding cho tensor 1D
        padded_features = torch.nn.functional.pad(
            features, (0, max_length - features.size(0)), value=pad_value
        )
    elif len(features.shape) == 2:
        # Padding cho tensor 2D
        padded_features = torch.nn.functional.pad(
            features, (0, 0, 0, max_length - features.size(0)), value=pad_value
        )
    else:
        raise ValueError(f"Unsupported tensor shape: {features.shape}")
    return padded_features

def pad_scene_level(scene_data):
    """
    Hàm padding tất cả các scene để đảm bảo có cùng độ dài sequence.
    """
    scene_names = list(scene_data.keys())

    # Tính max_length từ visual_features
    max_length = max(scene_data[scene_name]['visual_features'].size(0) for scene_name in scene_names)

    for scene_name in scene_names:
        try:
            visual_features = scene_data[scene_name]['visual_features']
            text_features = scene_data[scene_name]['text_features']
            audio_features = scene_data[scene_name]['audio_features']
            labels = scene_data[scene_name]['labels']

            # Padding cho từng loại dữ liệu
            scene_data[scene_name]['visual_features'] = padding(visual_features, max_length, pad_value=0) 
            scene_data[scene_name]['text_features'] = padding(text_features, max_length, pad_value=0)
            scene_data[scene_name]['audio_features'] = padding(audio_features, max_length, pad_value=0)      

            scene_data[scene_name]['labels'] = padding(labels, max_length, pad_value=0)                 
        except KeyError as e:
            raise KeyError(f"Missing key in scene data: {e}")
    return scene_data

In [111]:
scene_feats_padded = pad_scene_level(scene_feats)

In [112]:
visual_features = []
text_features = []
audio_features = []

labels = []

for scene_name, scene in scene_feats_padded.items():
    visual_features.append(scene['visual_features'])
    text_features.append(scene['text_features'])
    audio_features.append(scene['audio_features'])
    
    labels.append(scene['labels'])

# Chuyển các list thành tensor
visual_features = torch.stack(visual_features)
text_features = torch.stack(text_features)
audio_features = torch.stack(audio_features)
labels = torch.stack(labels)

train_feats = TensorDataset(visual_features, text_features, audio_features, labels)

train_size = int(0.78 * len(train_feats))
val_size = len(train_feats) - train_size
train, val = random_split(train_feats, [train_size, val_size])

In [None]:
# Kiểm tra kết quả padding
'''for scene_name, scene in scene_feats_padded.items():
    print(f"Scene: {scene_name}")
    print(f"Visual features:\n{scene['visual_features'].shape}")
    #print(f"  Text features shape: {scene['text_features'].shape}")
    #print(f"  Audio features shape: {scene['audio_features'].shape}")
    print(f"  Labels shape: {scene['labels'].shape}")
    break'''

In [113]:
train_feats_loader = DataLoader(train, batch_size=16, shuffle=True)
val_feats_loader = DataLoader(val, batch_size=16, shuffle=True)

print(f"Train set: {len(train_feats_loader.dataset)} scenes")
print(f"Validation set: {len(val_feats_loader.dataset)} scenes")

Train set: 397 scenes
Validation set: 112 scenes


## Classifier

In [114]:
class CrossAttention(nn.Module):
    def __init__(self, dim, num_heads=8, dropout=0.4):
        super(CrossAttention, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context, attn_mask=None, key_padding_mask=None):
        attn_output, _ = self.multihead_attn(
            x, context, context, 
            attn_mask=attn_mask, 
            key_padding_mask=key_padding_mask
        )
        attn_output = self.dropout(attn_output)
        return self.norm(attn_output + x)

In [None]:
class MultimodalLSTM(nn.Module):
    def __init__(self, hidden_dim=512, num_labels=7, dropout_rate=0.3):
        super(MultimodalLSTM, self).__init__()

        # LSTM layers (Bidirectional → output size = hidden_dim * 2)
        self.visual_lstm = nn.LSTM(input_size=1024, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.text_lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.audio_lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)

        # Cross-Attention (adjusted to hidden_dim * 2 due to bidirectional LSTM)
        self.cross_attn_vt = CrossAttention(hidden_dim * 2)  
        self.cross_attn_va = CrossAttention(hidden_dim * 2)  
        self.cross_attn_ta = CrossAttention(hidden_dim * 2) 

        # Fusion MLP
        self.fusion_fc1 = nn.Sequential(
            nn.Linear(hidden_dim * 6, 1024),  # hidden_dim * 2 per modality → 6 modalities
            nn.ReLU(),
            nn.LayerNorm(1024),
            nn.Dropout(dropout_rate)
        )

        self.fusion_fc2 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Dropout(dropout_rate)
        )

        self.fusion_fc3 = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Dropout(dropout_rate)
        )

        self.output_layer = nn.Linear(256, num_labels)

    def forward(self, visual_input, text_input, audio_input):

        # LSTM Encoding
        visual_feat, _ = self.visual_lstm(visual_input)  
        text_feat, _ = self.text_lstm(text_input)    
        audio_feat, _ = self.audio_lstm(audio_input)

        # Cross-Attention (Parallel Processing)
        visual_feat = self.cross_attn_vt(visual_feat, text_feat)
        visual_feat = self.cross_attn_va(visual_feat, audio_feat)

        text_feat = self.cross_attn_ta(text_feat, audio_feat)
        text_feat = self.cross_attn_vt(text_feat, visual_feat)

        audio_feat = self.cross_attn_va(audio_feat, visual_feat)
        audio_feat = self.cross_attn_ta(audio_feat, text_feat)

        # Global average pooling (reduce temporal dimension)
        visual_feat = torch.mean(visual_feat, dim=1)  
        text_feat = torch.mean(text_feat, dim=1)  
        audio_feat = torch.mean(audio_feat, dim=1)  

        # Concatenation of all modalities
        fusion_out = torch.cat((visual_feat, text_feat, audio_feat), dim=-1)  # (batch_size, hidden_dim * 6)

        # MLP Fusion
        fusion_out = self.fusion_fc1(fusion_out)
        fusion_out = self.fusion_fc2(fusion_out)
        fusion_out = self.fusion_fc3(fusion_out)

        # Output layer
        output = self.output_layer(fusion_out)  # (batch_size, num_labels)

        return output

## cross-attention base

In [None]:
'''class CrossAttention(nn.Module):
    def __init__(self, dim, num_heads=8, dropout=0.4):
        super(CrossAttention, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context, attn_mask=None, key_padding_mask=None):
        attn_output, _ = self.multihead_attn(
            x, context, context, 
            attn_mask=attn_mask, 
            key_padding_mask=key_padding_mask
        )
        attn_output = self.dropout(attn_output)
        return self.norm(attn_output + x)'''

In [None]:
'''class MultimodalLSTM(nn.Module):
    def __init__(self, hidden_dim=512, num_labels=7, dropout_rate=0.3):
        super(MultimodalLSTM, self).__init__()

        # LSTM layers (Bidirectional → output size = hidden_dim * 2)
        self.visual_lstm = nn.LSTM(input_size=1024, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.text_lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.audio_lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)

        # Cross-Attention
        self.cross_attn_vt = CrossAttention(hidden_dim * 2)
        self.cross_attn_va = CrossAttention(hidden_dim * 2)
        self.cross_attn_ta = CrossAttention(hidden_dim * 2)

        # Fusion MLP
        self.fusion_fc1 = nn.Sequential(
            nn.Linear(hidden_dim * 6, 1024),
            nn.ReLU(),
            nn.LayerNorm(1024),
            nn.Dropout(dropout_rate)
        )

        self.fusion_fc2 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Dropout(dropout_rate)
        )

        self.fusion_fc3 = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Dropout(dropout_rate)
        )

        self.output_layer = nn.Linear(256, num_labels)

    def forward(self, visual_input, text_input, audio_input):
        # Normalize inputs
        text_input = (text_input - text_input.mean(dim=1, keepdim=True)) / (text_input.std(dim=1, keepdim=True) + 1e-6)
        audio_input = (audio_input - audio_input.mean(dim=1, keepdim=True)) / (audio_input.std(dim=1, keepdim=True) + 1e-6)

        # LSTM Encoding
        visual_feat, _ = self.visual_lstm(visual_input)
        text_feat, _ = self.text_lstm(text_input)
        audio_feat, _ = self.audio_lstm(audio_input)

        # Cross-Attention
        visual_feat = self.cross_attn_vt(visual_feat, text_feat)
        visual_feat = self.cross_attn_va(visual_feat, audio_feat)
        text_feat = self.cross_attn_ta(text_feat, audio_feat)

        # Global average pooling
        visual_feat = torch.mean(visual_feat, dim=1)
        text_feat = torch.mean(text_feat, dim=1)
        audio_feat = torch.mean(audio_feat, dim=1)

        # Concatenation of all modalities
        fusion_out = torch.cat((visual_feat, text_feat, audio_feat), dim=-1)

        # MLP Fusion
        fusion_out = self.fusion_fc1(fusion_out)
        fusion_out = self.fusion_fc2(fusion_out)
        fusion_out = self.fusion_fc3(fusion_out)

        # Output layer
        output = self.output_layer(fusion_out)

        return output'''

In [116]:
hidden_dim = 512
num_labels = 7
dropout_rate = 0.3

# Model
classifier_model = MultimodalLSTM(hidden_dim=hidden_dim, num_labels=num_labels, dropout_rate=dropout_rate)
classifier_model.to(device)

MultimodalLSTM(
  (visual_lstm): LSTM(1024, 512, num_layers=2, batch_first=True, bidirectional=True)
  (text_lstm): LSTM(768, 512, num_layers=2, batch_first=True, bidirectional=True)
  (audio_lstm): LSTM(768, 512, num_layers=2, batch_first=True, bidirectional=True)
  (cross_attn_vt): CrossAttention(
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
    )
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (cross_attn_va): CrossAttention(
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
    )
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (cross_attn_ta): CrossAttention(
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1

In [None]:
# Số lượng mẫu trên từng nhãn
class_counts = torch.tensor([928, 109, 2596, 187, 358, 166, 2959])

total_samples = class_counts.sum()

num_negative = total_samples - class_counts
num_positive = class_counts

class_weights = num_negative / num_positive
print(class_weights)

tensor([ 6.8696, 66.0000,  1.8132, 38.0535, 19.3994, 42.9940,  1.4681])


## Training

In [118]:
def get_interaction_scene(labels):
    
    labels_merged = []
    for scene_label in labels:
        
        merged_label = np.any(scene_label, axis=0).astype(int)

        labels_merged.append(merged_label)
    
    return labels_merged

In [119]:
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights).to(device)

# Cấu hình optimizer và scheduler
optimizer = optim.AdamW(classifier_model.parameters(), lr=0.00003, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

In [None]:
num_epochs = 60
save_dir = "/kaggle/working/"

for epoch in range(num_epochs):
    classifier_model.train()
    running_loss = 0.0

    for visual_data, text_data, audio_data, labels in train_feats_loader:
        # Chuyển dữ liệu sang thiết bị
        visual_data, text_data, audio_data, labels = visual_data.to(device), text_data.to(device), audio_data.to(device), labels.to(device)
        
        # Hợp nhất nhãn cho toàn bộ scene
        scene_labels = np.array(get_interaction_scene(labels.cpu().numpy()))  # Chuyển thành numpy array
        labels = torch.from_numpy(scene_labels).float().to(device)  # Chuyển thành tensor và đưa vào device
       
        optimizer.zero_grad()
        outputs = classifier_model(visual_data, text_data, audio_data)
        
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {running_loss / len(train_feats_loader):.4f}")

    # ============================= Validation ==============================
    classifier_model.eval()
    val_loss = 0.0
    val_preds = []
    val_labels = []
    
    with torch.no_grad():
        for visual_data, text_data, audio_data, labels in val_feats_loader:
            # Chuyển dữ liệu sang thiết bị
            visual_data, text_data, audio_data, labels = visual_data.to(device), text_data.to(device), audio_data.to(device), labels.to(device)
            
            # Hợp nhất nhãn
            scene_labels = np.array(get_interaction_scene(labels.cpu().numpy()))
            labels = torch.from_numpy(scene_labels).float().to(device)
           
            outputs = classifier_model(visual_data, text_data, audio_data)

            loss = criterion(outputs, labels)
            val_loss += loss.item()
            outputs = torch.sigmoid(outputs)
            
            # Lưu dự đoán và nhãn
            val_preds.append(outputs.cpu().numpy())  # Store predictions
            val_labels.append(labels.cpu().numpy())  # Store ground truth

    # Convert lists of predictions and labels into numpy arrays
    val_preds = np.concatenate(val_preds, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)
    
    # Convert logits to binary predictions using a threshold of 0.5
    val_preds = (val_preds > 0.6).astype(int)
    val_labels = val_labels.astype(int)

    # Tính các chỉ số đánh giá
    # Macro Precision, Recall, F1
    precision_macro = precision_score(val_labels, val_preds, average='macro', zero_division=0)
    recall_macro = recall_score(val_labels, val_preds, average='macro', zero_division=0)
    f1_macro = f1_score(val_labels, val_preds, average='macro', zero_division=0)
    
    # Micro Precision, Recall, F1
    precision_micro = precision_score(val_labels, val_preds, average='micro', zero_division=0)
    recall_micro = recall_score(val_labels, val_preds, average='micro', zero_division=0)
    f1_micro = f1_score(val_labels, val_preds, average='micro', zero_division=0)
   

    print(f"Macro Precision: {precision_macro:.4f} - Recall: {recall_macro:.4f} - F1-score: {f1_macro:.4f}")
    print(f"Micro Precision: {precision_micro:.4f} - Recall: {recall_micro:.4f} - F1-score: {f1_micro:.4f}")
    print("=============================================================")

torch.save(classifier_model.state_dict(), f"{save_dir}best_model.pth")

Epoch [1/80] - Loss: 2.2283
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [2/80] - Loss: 1.8351
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [3/80] - Loss: 1.8093
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [4/80] - Loss: 1.7913
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [5/80] - Loss: 1.8113
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [6/80] - Loss: 1.7431
Macro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.5789
Micro Precision: 0.4605 - Recall: 1.0000 - F1-score: 0.6306
Epoch [7/80] - Loss: 1.7508
Macro Precision: 0.4623 - Recall: 0.9847 - F1-score: 0.5798
Micro Precision: 0.4668 

## Load model

In [None]:
classifier_model.load_state_dict(torch.load('/kaggle/input/checkpoint/best_model_3f.pth', weights_only=True))
classifier_model.to(device)

# Evaluate in test set in scene level

In [121]:
features_path = "/kaggle/input/checkpoint/full_v_t_a/test_full_v_t_a.pkl"
test_video_paths, test_visual_features, test_text_paths, test_text_features, test_audio_paths, test_audio_features, test_labels = load_features(features_path)
test_scene_feats = group_features(test_video_paths, test_visual_features, test_text_paths, test_text_features, test_audio_paths, test_audio_features, test_labels)

with open("/kaggle/working/grouped_test_features.pkl", "wb") as f:
    pickle.dump(test_scene_feats, f)
print(f"Grouped features saved")

Grouped features saved


In [122]:
for scene_name, scene_data in test_scene_feats.items():
    scene_data['labels'] = F.one_hot(scene_data['labels'], num_classes=7)

In [123]:
test_feats_padded = pad_scene_level(test_scene_feats)

In [124]:
visual_features = []
text_features = []
audio_features = []
labels = []
scene_names = []

for scene_name, scene in test_feats_padded.items():
    scene_names.append(scene_name)
    visual_features.append(scene['visual_features'])
    text_features.append(scene['text_features'])
    audio_features.append(scene['audio_features'])
    labels.append(scene['labels'])

# Chuyển các list thành tensor
visual_features = torch.stack(visual_features)
text_features = torch.stack(text_features)
audio_features = torch.stack(audio_features)
labels = torch.stack(labels)

# Tạo TensorDataset từ các đặc trưng visual, text và labels
test_feats = TensorDataset(visual_features, text_features, audio_features, labels)

In [125]:
test_feats_loader = DataLoader(test_feats, batch_size=16, shuffle=False)
print(f"Test set: {len(test_feats_loader.dataset)} scenes")

Test set: 131 scenes


In [126]:
test_preds = []
test_labels = []

classifier_model.eval()

with torch.no_grad(): 
    #for visual_data, text_data, audio_data, labels in test_feats_loader:
    for idx, (visual_data, text_data, audio_data, labels) in enumerate(test_feats_loader):
        
        visual_data, text_data, audio_data, labels = visual_data.to(device), text_data.to(device), audio_data.to(device), labels.to(device)
        
        scene_labels = np.array(get_interaction_scene(labels.cpu().numpy()))  # Hợp nhất nhãn
        labels = torch.from_numpy(scene_labels).float().to(device)  # Chuyển thành tensor và đưa vào device

        outputs = classifier_model(visual_data, text_data, audio_data)
        outputs = torch.sigmoid(outputs)
        test_preds.append(outputs.cpu().numpy())  
        test_labels.append(labels.cpu().numpy())

# Chuyển các danh sách về numpy arrays
test_preds = np.concatenate(test_preds, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

test_preds_scene = (test_preds > 0.6).astype(int)
test_labels_scene = test_labels.astype(int)

'''for i, scene_name in enumerate(scene_names):
    print(f"Scene Name: {scene_name}")
    print(f"Prediction: {test_preds[i]}")
    print(f"Ground Truth: {test_labels[i]}")
    print("===================================")'''



In [127]:
#Tính các chỉ số đánh giá trên bộ test
precision_macro = precision_score(test_labels_scene, test_preds_scene, average='macro')
recall_macro = recall_score(test_labels_scene, test_preds_scene, average='macro')
f1_macro = f1_score(test_labels_scene, test_preds_scene, average='macro')

precision_micro = precision_score(test_labels_scene, test_preds_scene, average='micro')
recall_micro = recall_score(test_labels_scene, test_preds_scene, average='micro')
f1_micro = f1_score(test_labels_scene, test_preds_scene, average='micro')

# In kết quả
print(f"Test Macro Precision: {precision_macro:.4f} - Recall: {recall_macro:.4f} - F1-score: {f1_macro:.4f}")
print(f"Test Micro Precision: {precision_micro:.4f} - Recall: {recall_micro:.4f} - F1-score: {f1_micro:.4f}")

Test Macro Precision: 0.5542 - Recall: 0.6201 - F1-score: 0.5694
Test Micro Precision: 0.7158 - Recall: 0.8088 - F1-score: 0.7595
