In [6]:
import sejin_tool_copy as tl

import wandb

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

In [58]:
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.optim as optim

import os
import json
from sklearn.preprocessing import LabelEncoder

import numpy as np
import math


def get_word_list(num_folders_start, num_folders_end):
    folder_path = 'morpheme/01'

    # 단어들을 저장할 리스트
    word_list = []
    
    # 파일 이름 얻어오기
    file_names = [f for f in os.listdir(folder_path) if f.endswith('.json') and "F_morpheme" in f]

    # 파일 이름을 번호 순서대로 정렬하기
    file_names.sort(key=lambda x: int(x.split('_')[2][4:]))

    file_names = file_names[:500]
    
    for idx in range(num_folders_start, num_folders_end + 1):
        for filename in file_names:
            file_path = os.path.join(folder_path, filename)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                
                # 'data' 키 안의 요소들 순회
                for item in data['data']:
                    for attribute in item['attributes']:
                        word_list.append(attribute['name'])
                    

    # Label Encoder 초기화 및 학습
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(word_list)
    
    # 인코딩된 라벨과 원래 라벨 매칭하여 출력
    i = 0
    for original, encoded in zip(word_list, encoded_labels):
        print(f"{i}  Original Label: {original}, Encoded Label: {encoded}")
        i += 1
    
    return encoded_labels


def get_sequence_files(num_folders_start, num_folders_end):
    base_folder_path = 'keypoints'

    # 전체 시퀀스를 저장할 리스트
    sequence_files = []

    for idx in range(num_folders_start, num_folders_end + 1):

        folder_path = os.path.join(base_folder_path, f'{idx:02d}')
        
        # 각 폴더의 파일 이름을 저장할 리스트
        # folder_files = []
        
        # 파일 이름 얻어오기
        file_names = [f for f in os.listdir(folder_path) if "F" in f]
        
        # 파일 이름을 번호 순서대로 정렬하기
        file_names.sort(key=lambda x: int(x.split('_')[2][4:]))
        
        file_names = file_names[:500]
        
        for filename in file_names:
            file_path = os.path.join(folder_path, filename)
            
            json_names = [f for f in os.listdir(file_path) if "F" in f]       
            json_names.sort(key=lambda x: int(x.split('_')[5]))
            json_paths = []
            
            for i, jsonname in enumerate(json_names):
                if i % 3 == 0:
                    json_path = os.path.join(file_path, jsonname)
                    json_paths.append(json_path)
            
            sequence_files.append(json_paths)
        
    return sequence_files
                

def extract_keypoints(json_data):
    
    def append_coordinates(keypoints_list, array, dimensions, offset=0):
        step = dimensions + 1  # dimensions + 1 because of the confidence score
        for i in range(0, len(keypoints_list), step):
            idx = i // step + offset
            if dimensions == 2:
                array[idx] = [keypoints_list[i], keypoints_list[i + 1], 0]
            elif dimensions == 3:
                array[idx] = [keypoints_list[i], keypoints_list[i + 1], keypoints_list[i + 2]]
                
                
    keypoint_types_2d = ['face_keypoints_2d', 'pose_keypoints_2d', 'hand_left_keypoints_2d', 'hand_right_keypoints_2d']
    keypoint_types_3d = ['face_keypoints_3d', 'pose_keypoints_3d', 'hand_left_keypoints_3d', 'hand_right_keypoints_3d']
    
    #키포인트 개수 계산
    num_keypoints_2d = sum(len(json_data['people'][key]) // 3 for key in keypoint_types_2d if key in json_data['people'])
    num_keypoints_3d = sum(len(json_data['people'][key]) // 4 for key in keypoint_types_3d if key in json_data['people'])

    if num_keypoints_3d == 0: 
        #3d가 비어있을 때
        #numpy배열 초기화
        keypoints_2d = np.zeros((num_keypoints_2d, 3))  # (x, y, 0)
        
        offset_2d = 0
        
        for key in keypoint_types_2d:
            if key in json_data['people']:
                append_coordinates(json_data['people'][key], keypoints_2d, dimensions=2, offset=offset_2d)
                offset_2d += len(json_data['people'][key]) // 3
                
        keypoints = keypoints_2d
        
    else:
        #3d가 있을 때
        #키포인트 개수 계산

        #numpy배열 초기화
        keypoints_3d = np.zeros((num_keypoints_3d, 3))  # (x, y, z)

        offset_3d = 0

        for key in keypoint_types_3d:
            if key in json_data['people']:
                append_coordinates(json_data['people'][key], keypoints_3d, dimensions=3, offset=offset_3d)
                offset_3d += len(json_data['people'][key]) // 4

        keypoints = keypoints_3d
    
    return keypoints

def collate_fn(batch):    
    # batch는 keypoints와 labels의 튜플로 구성된 리스트
    keypoints, labels, idx = zip(*batch)
        
    # 각 keypoints 리스트에서 face, pose, left_hand, right_hand를 각각 분리
    face_keypoints = [k[0] for k in keypoints]
    pose_keypoints = [k[1] for k in keypoints]
    left_hand_keypoints = [k[2] for k in keypoints]
    right_hand_keypoints = [k[3] for k in keypoints]
    
    # keypoints는 3D 텐서이므로, 텐서 The line `리스트에서 시퀀스 길이(120)를 추출
    face_keypoints_padded = pad_sequence([k.permute(1, 0, 2) for k in face_keypoints], batch_first=True, padding_value=0)
    pose_keypoints_padded = pad_sequence([k.permute(1, 0, 2) for k in pose_keypoints], batch_first=True, padding_value=0)
    left_hand_keypoints_padded = pad_sequence([k.permute(1, 0, 2) for k in left_hand_keypoints], batch_first=True, padding_value=0)
    right_hand_keypoints_padded = pad_sequence([k.permute(1, 0, 2) for k in right_hand_keypoints], batch_first=True, padding_value=0)
    
    # 패딩 후 다시 원래 차원으로 복원
    face_keypoints_padded = face_keypoints_padded.permute(0, 2, 1, 3)
    pose_keypoints_padded = pose_keypoints_padded.permute(0, 2, 1, 3)
    left_hand_keypoints_padded = left_hand_keypoints_padded.permute(0, 2, 1, 3)
    right_hand_keypoints_padded = right_hand_keypoints_padded.permute(0, 2, 1, 3)
    
    # 각 시퀀스의 길이를 계산 (여기서는 모두 120이 동일함)
    lengths = torch.tensor([k.size(1) for k in keypoints[0]])
    
    
    # labels를 tensor로 변환
    labels = torch.tensor(labels)
    
    return (face_keypoints_padded,pose_keypoints_padded,left_hand_keypoints_padded,right_hand_keypoints_padded), labels, lengths, idx


class SignLanguageDataset(Dataset):
    def __init__(self, sequence_files, labels):
        self.data = []
        self.labels = labels
        for files in sequence_files:
            sequence = []
            for file in files:
                with open(file, 'r') as f:
                    json_data = json.load(f)
                    keypoints = extract_keypoints(json_data)
                    sequence.append(keypoints)
            self.data.append(sequence)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sequence = torch.tensor(self.data[idx], dtype=torch.float32)        
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        sequence = sequence.permute(2, 0, 1)
        
        # 데이터 분리: 얼굴, 왼손, 오른손, 포즈 (각각의 키포인트 개수는 데이터에 따라 설정)
        face_sequence = sequence[:, :, :70]  # 얼굴 keypoints
        pose_sequence = sequence[:, :, 70:95]  # 포즈 keypoints
        left_hand_sequence = sequence[:, :, 95:116]  # 왼손 keypoints
        right_hand_sequence = sequence[:, :, 116:137]  # 오른손 keypoints
        
        mean = torch.mean(face_sequence)
        std = torch.std(face_sequence)
        face_sequence = (face_sequence - mean) / std
        
        mean = torch.mean(pose_sequence)
        std = torch.std(pose_sequence)
        pose_sequence = (pose_sequence - mean) / std
        
        mean = torch.mean(left_hand_sequence)
        std = torch.std(left_hand_sequence)
        left_hand_sequence = (left_hand_sequence - mean) / std
        
        mean = torch.mean(right_hand_sequence)
        std = torch.std(right_hand_sequence)
        right_hand_sequence = (right_hand_sequence - mean) / std
        
        
        # 패딩 처리: 길이를 70으로 맞춤
        pose_sequence = F.pad(pose_sequence, (0, 70 - pose_sequence.size(2)), "constant", 0)
        left_hand_sequence = F.pad(left_hand_sequence, (0, 70 - left_hand_sequence.size(2)), "constant", 0)
        right_hand_sequence = F.pad(right_hand_sequence, (0, 70 - right_hand_sequence.size(2)), "constant", 0)

        # 마스크 생성: 패딩된 부분은 True, 나머지는 False
        pose_mask = pose_sequence.sum(dim=1) == 0  # 패딩된 부분은 모든 값이 0이므로 합이 0인 부분이 패딩된 부분
        left_hand_mask = left_hand_sequence.sum(dim=1) == 0
        right_hand_mask = right_hand_sequence.sum(dim=1) == 0
    
        return (face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence), label, idx

class MultiEncoderTransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, num_classes):
        super(MultiEncoderTransformerModel, self).__init__()
        
        # Face Encoder
        self.face_input_fc = nn.Linear(input_dim, model_dim)
        self.face_positional_encoding = PositionalEncoding(dim_model=model_dim, dropout_p=0.1, max_len=500)
        face_encoder_layers = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.face_transformer_encoder = nn.TransformerEncoder(face_encoder_layers, num_layers)
        
        # Pose Encoder
        self.pose_input_fc = nn.Linear(input_dim, model_dim)
        self.pose_positional_encoding = PositionalEncoding(dim_model=model_dim, dropout_p=0.1, max_len=500)
        pose_encoder_layers = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.pose_transformer_encoder = nn.TransformerEncoder(pose_encoder_layers, num_layers)
        
        # Left Hand Encoder
        self.left_hand_input_fc = nn.Linear(input_dim, model_dim)
        self.left_hand_positional_encoding = PositionalEncoding(dim_model=model_dim, dropout_p=0.1, max_len=500)
        left_hand_encoder_layers = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.left_hand_transformer_encoder = nn.TransformerEncoder(left_hand_encoder_layers, num_layers)
        
        # Right Hand Encoder
        self.right_hand_input_fc = nn.Linear(input_dim, model_dim)
        self.right_hand_positional_encoding = PositionalEncoding(dim_model=model_dim, dropout_p=0.1, max_len=500)
        right_hand_encoder_layers = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.right_hand_transformer_encoder = nn.TransformerEncoder(right_hand_encoder_layers, num_layers)
        
        # 최종 결합 후 클래스 분류
        self.fc = nn.Linear(model_dim * 4, num_classes)  # 네 인코더 출력 결합

    def forward(self, face_input, pose_input, left_hand_input, right_hand_input, face_src_key_padding_mask=None, pose_src_key_padding_mask=None, left_src_key_padding_mask=None, right_src_key_padding_mask=None):
        # Face Encoder
        face_x = self.face_input_fc(face_input)
        face_x = self.face_positional_encoding(face_x)
        face_x = self.face_transformer_encoder(face_x, src_key_padding_mask=face_src_key_padding_mask)
        face_x = face_x.mean(dim=1)
        
        # Pose Encoder
        pose_x = self.pose_input_fc(pose_input)
        pose_x = self.pose_positional_encoding(pose_x)
        pose_x = self.pose_transformer_encoder(pose_x, src_key_padding_mask=pose_src_key_padding_mask)
        pose_x = pose_x.mean(dim=1)
        
        # Left Hand Encoder
        left_hand_x = self.left_hand_input_fc(left_hand_input)
        left_hand_x = self.left_hand_positional_encoding(left_hand_x)
        left_hand_x = self.left_hand_transformer_encoder(left_hand_x, src_key_padding_mask=left_src_key_padding_mask)
        left_hand_x = left_hand_x.mean(dim=1)
        
        # Right Hand Encoder
        right_hand_x = self.right_hand_input_fc(right_hand_input)
        right_hand_x = self.right_hand_positional_encoding(right_hand_x)
        right_hand_x = self.right_hand_transformer_encoder(right_hand_x, src_key_padding_mask=right_src_key_padding_mask)
        right_hand_x = right_hand_x.mean(dim=1)
        
        # 네 인코더의 출력 결합
        combined_x = torch.cat((face_x, left_hand_x, right_hand_x, pose_x), dim=-1)
        
        # Fully Connected Layer
        output = self.fc(combined_x)
        
        return output


class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len=5000):
        super().__init__()
        
        self.dropout = nn.Dropout(dropout_p)
        
        # 최대 길이에 대한 Positional Encoding 생성
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, ...
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model)
        
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Positional Encoding을 모델의 버퍼로 등록
        pos_encoding = pos_encoding.unsqueeze(0)
        self.register_buffer("pos_encoding", pos_encoding)
    
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        seq_len = token_embedding.size(1)
        pos_encoding = self.pos_encoding[:, :seq_len, :]
        return self.dropout(token_embedding + pos_encoding)


class EarlyStopping:
    def __init__(self, patience=20, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [64]:
train_encoded_labels = get_word_list(1, 4)
train_sequence_files = get_sequence_files(1, 4)


val_encoded_labels = get_word_list(5, 5)
val_sequence_files = get_sequence_files(5, 5)

train_dataset = SignLanguageDataset(train_sequence_files, train_encoded_labels)
val_dataset = SignLanguageDataset(val_sequence_files, val_encoded_labels)

device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

0  Original Label: 고민, Encoded Label: 53
1  Original Label: 뻔뻔, Encoded Label: 248
2  Original Label: 수어, Encoded Label: 285
3  Original Label: 남아, Encoded Label: 133
4  Original Label: 눈, Encoded Label: 140
5  Original Label: 독신, Encoded Label: 166
6  Original Label: 음료수, Encoded Label: 362
7  Original Label: 발가락, Encoded Label: 211
8  Original Label: 슬프다, Encoded Label: 291
9  Original Label: 자극, Encoded Label: 374
10  Original Label: 안타깝다, Encoded Label: 309
11  Original Label: 어색하다, Encoded Label: 317
12  Original Label: 여아, Encoded Label: 323
13  Original Label: 외국인, Encoded Label: 346
14  Original Label: 영아, Encoded Label: 335
15  Original Label: 신사, Encoded Label: 298
16  Original Label: 뉴질랜드, Encoded Label: 141
17  Original Label: 나사렛대학교, Encoded Label: 125
18  Original Label: 알아서, Encoded Label: 310
19  Original Label: 장애인, Encoded Label: 381
20  Original Label: 열아홉번째, Encoded Label: 334
21  Original Label: 침착, Encoded Label: 428
22  Original Label: 성실, Encoded Label: 269
23  

In [65]:
# 모델 초기화
input_dim = 70 * 3 # 각 키포인트의 2D 좌표(2)와 3D 좌표(3)를 사용
model_dim = 512  # 모델 차원
num_heads = 8  # 멀티헤드 어텐션의 헤드 수
num_layers = 3  # Transformer 레이어 수
num_classes = 2771  # 출력 클래스 수

learning_rate = 0.0001

num_epochs = 20
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,  collate_fn=collate_fn)

model = MultiEncoderTransformerModel(input_dim, model_dim, num_heads, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [66]:
batch = next(iter(train_loader))

sequences, labels, lengths, idx = batch

print(f"Batch Sequence Shapes: {[seq.shape for seq in sequences]}")
print(f"idx: {idx}")
print(f"origin_word: {[train_encoded_labels[i] for i in idx]}")
print(f"Batch Labels: {labels}")
print(f"Batch Lengths: {lengths}")


Batch Sequence Shapes: [torch.Size([64, 3, 66, 70]), torch.Size([64, 3, 66, 70]), torch.Size([64, 3, 66, 70]), torch.Size([64, 3, 66, 70])]
idx: (1464, 22, 137, 1612, 809, 1298, 158, 1952, 1412, 559, 1452, 1646, 1807, 1888, 1311, 1932, 893, 698, 1220, 1603, 1393, 914, 1780, 98, 1257, 1080, 722, 1273, 1038, 328, 635, 992, 477, 383, 652, 1297, 976, 1016, 550, 1043, 1554, 804, 867, 1861, 1406, 1150, 538, 631, 415, 1560, 1427, 526, 1851, 1166, 108, 27, 650, 1438, 1171, 1013, 1582, 307, 824, 1201)
origin_word: [np.int64(148), np.int64(269), np.int64(70), np.int64(405), np.int64(15), np.int64(205), np.int64(347), np.int64(50), np.int64(395), np.int64(372), np.int64(50), np.int64(186), np.int64(404), np.int64(197), np.int64(68), np.int64(104), np.int64(69), np.int64(33), np.int64(184), np.int64(292), np.int64(69), np.int64(84), np.int64(358), np.int64(251), np.int64(420), np.int64(135), np.int64(233), np.int64(436), np.int64(225), np.int64(286), np.int64(45), np.int64(185), np.int64(201), np.

In [67]:
for epoch in range(num_epochs):
    model.train()
    for (face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence), labels, lengths, idx in train_loader:

    
        face_sequence = face_sequence.to(device)
        pose_sequence = pose_sequence.to(device)
        left_hand_sequence = left_hand_sequence.to(device)
        right_hand_sequence = right_hand_sequence.to(device)
        labels = labels.to(device)
        
        # 마스킹 생성
        face_src_key_padding_mask = (face_sequence.sum(dim=-1) == 0)
        face_src_key_padding_mask = face_src_key_padding_mask.any(dim=1).to(device)
        pose_src_key_padding_mask = (pose_sequence.sum(dim=-1) == 0)
        pose_src_key_padding_mask = pose_src_key_padding_mask.any(dim=1).to(device)
        left_src_key_padding_mask = (left_hand_sequence.sum(dim=-1) == 0)
        left_src_key_padding_mask = left_src_key_padding_mask.any(dim=1).to(device)
        right_src_key_padding_mask = (right_hand_sequence.sum(dim=-1) == 0)
        right_src_key_padding_mask = right_src_key_padding_mask.any(dim=1).to(device)

        # 입력 텐서 변환: [batch_size, 3, seq_len, num_joints] -> [batch_size, seq_len, 3 * num_joints]
        batch_size, coord, seq_len, num_joints = face_sequence.size()
        face_sequence = face_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        face_sequence = face_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = pose_sequence.size()
        pose_sequence = pose_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        pose_sequence = pose_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = left_hand_sequence.size()
        left_hand_sequence = left_hand_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        left_hand_sequence = left_hand_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = right_hand_sequence.size()
        right_hand_sequence = right_hand_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        right_hand_sequence = right_hand_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        

        # Forward pass
        outputs = model(face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence, face_src_key_padding_mask, pose_src_key_padding_mask, left_src_key_padding_mask, right_src_key_padding_mask)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
    
    print("epoch:", epoch)
    print(f' Loss: {loss.item():.4f}')
        

epoch: 0
 Loss: 6.7982
epoch: 1
 Loss: 6.4162
epoch: 2
 Loss: 6.4639
epoch: 3
 Loss: 6.4326
epoch: 4
 Loss: 6.4034
epoch: 5
 Loss: 6.0066
epoch: 6
 Loss: 6.4408
epoch: 7
 Loss: 6.1401
epoch: 8
 Loss: 6.0992
epoch: 9
 Loss: 5.6206
epoch: 10
 Loss: 5.3548
epoch: 11
 Loss: 5.0769
epoch: 12
 Loss: 5.1882
epoch: 13
 Loss: 4.8830
epoch: 14
 Loss: 4.8014
epoch: 15
 Loss: 4.6861
epoch: 16
 Loss: 4.4659
epoch: 17
 Loss: 4.3497
epoch: 18
 Loss: 3.7853
epoch: 19
 Loss: 3.6928


: 

In [79]:

# Validation loop
model.eval()
val_loss = 0

def create_padding_mask(sequences, pad_token=0):
    return (sequences == pad_token)

with torch.no_grad():
    for (face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence), labels, lengths in val_loader:
        
        mean = torch.mean(face_sequence)
        std = torch.std(face_sequence)
        face_sequence = (face_sequence - mean) / std
        
        mean = torch.mean(pose_sequence)
        std = torch.std(pose_sequence)
        pose_sequence = (pose_sequence - mean) / std
        
        mean = torch.mean(left_hand_sequence)
        std = torch.std(left_hand_sequence)
        left_hand_sequence = (left_hand_sequence - mean) / std
        
        mean = torch.mean(right_hand_sequence)
        std = torch.std(right_hand_sequence)
        right_hand_sequence = (right_hand_sequence - mean) / std
        
        face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence = face_sequence.to(device), pose_sequence.to(device), left_hand_sequence.to(device), right_hand_sequence.to(device)
        labels = labels.to(device)

        # 입력 텐서 변환: [batch_size, 3, seq_len, num_joints] -> [batch_size, seq_len, 3 * num_joints]
        batch_size, coord, seq_len, num_joints = face_sequence.size()
        face_sequence = face_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        face_sequence = face_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = pose_sequence.size()
        pose_sequence = pose_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        pose_sequence = pose_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = left_hand_sequence.size()
        left_hand_sequence = left_hand_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        left_hand_sequence = left_hand_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        batch_size, coord, seq_len, num_joints = right_hand_sequence.size()
        right_hand_sequence = right_hand_sequence.permute(0, 2, 3, 1).contiguous()  # [batch_size, seq_len, num_joints, coord]
        right_hand_sequence = right_hand_sequence.view(batch_size, seq_len, -1)  # [batch_size, seq_len, num_joints * coord]
        
        # 패딩 마스크 생성
        face_src_key_padding_mask = create_padding_mask(face_sequence[:,:,0])
        face_src_key_padding_mask = face_src_key_padding_mask.to(device)
        pose_src_key_padding_mask = create_padding_mask(pose_sequence[:,:,0])
        pose_src_key_padding_mask = pose_src_key_padding_mask.to(device)
        left_src_key_padding_mask = create_padding_mask(left_hand_sequence[:,:,0])
        left_src_key_padding_mask = left_src_key_padding_mask.to(device)
        right_src_key_padding_mask = create_padding_mask(right_hand_sequence[:,:,0])
        right_src_key_padding_mask = right_src_key_padding_mask.to(device)

        outputs = model(face_sequence, pose_sequence, left_hand_sequence, right_hand_sequence, face_src_key_padding_mask, pose_src_key_padding_mask, left_src_key_padding_mask, right_src_key_padding_mask)
        # outputs = model(sequences, src_key_padding_mask=src_key_padding_mask)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

        val_loss /= len(val_loader)

        # 에폭당 loss 값을 기록합니다.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

Epoch [10/10], Loss: 6.2703, Val Loss: 0.0502
Epoch [10/10], Loss: 6.1798, Val Loss: 0.0498
Epoch [10/10], Loss: 6.2312, Val Loss: 0.0502
Epoch [10/10], Loss: 6.2064, Val Loss: 0.0501
Epoch [10/10], Loss: 6.3671, Val Loss: 0.0513
Epoch [10/10], Loss: 6.2023, Val Loss: 0.0500
Epoch [10/10], Loss: 6.1910, Val Loss: 0.0499
Epoch [10/10], Loss: 5.9558, Val Loss: 0.0480
Epoch [10/10], Loss: 6.1560, Val Loss: 0.0496
Epoch [10/10], Loss: 6.1087, Val Loss: 0.0493
Epoch [10/10], Loss: 5.9922, Val Loss: 0.0483
Epoch [10/10], Loss: 6.4378, Val Loss: 0.0519
Epoch [10/10], Loss: 6.0837, Val Loss: 0.0491
Epoch [10/10], Loss: 6.1323, Val Loss: 0.0495
Epoch [10/10], Loss: 6.5607, Val Loss: 0.0529
Epoch [10/10], Loss: 6.2301, Val Loss: 0.0503
Epoch [10/10], Loss: 6.1727, Val Loss: 0.0498
Epoch [10/10], Loss: 6.1926, Val Loss: 0.0499
Epoch [10/10], Loss: 6.1358, Val Loss: 0.0495
Epoch [10/10], Loss: 6.1682, Val Loss: 0.0497
Epoch [10/10], Loss: 6.4028, Val Loss: 0.0516
Epoch [10/10], Loss: 6.0642, Val L