## 필요한 모듈 선언 ##

In [20]:
# 필요한 모듈 선언
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
import json
from sklearn.preprocessing import LabelEncoder
import math

In [2]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
print('device : ', device)

device :  cuda


In [3]:
folder_path = 'morpheme/01'

# 단어들을 저장할 리스트
word_list = []

# 파일 이름 얻어오기
file_names = [f for f in os.listdir(folder_path) if f.endswith('.json') and "F_morpheme" in f]

# 파일 이름을 번호 순서대로 정렬하기
file_names.sort(key=lambda x: int(x.split('_')[2][4:]))

for filename in file_names:
    file_path = os.path.join(folder_path, filename)
    
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        
        # 'data' 키 안의 요소들 순회
        for item in data['data']:
            for attribute in item['attributes']:
                word_list.append(attribute['name'])

# 결과 출력 
#print(len(word_list))
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(word_list)
# label_mapping 딕셔너리 생성
label_mapping = {f"NIA_SL_WORD{str(i+1).zfill(4)}_REAL01_F": encoded_labels[i] for i,word in enumerate(word_list)}
#print(label_mapping)

In [40]:
len(set(word_list[:100]))

100

In [4]:
print(label_encoder.inverse_transform([2632]))

['하는수없다']


## Dataset 클래스 선언 ##

In [4]:
# 임시!!!!!
class SignLanguageDataset(Dataset):
    def __init__(self, data_dir, folder_to_label):
        self.data_dir = data_dir
        self.folder_to_label = folder_to_label
        self.data, self.labels = self.load_data()

    def load_data(self):
        file_list = []
        labels = []
        for folder_name in os.listdir(self.data_dir):
            if folder_name.endswith("F") and folder_name in self.folder_to_label:  # "F"로 끝나는 폴더만 처리
                label = self.folder_to_label[folder_name]
                label = int(label)
                label_name = label_encoder.inverse_transform([label])
                folder_path = os.path.join(self.data_dir, folder_name)
                if os.path.isdir(folder_path):
                    json_files = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path) if file_name.endswith('.json')]
                    file_list.append(json_files)
                    labels.append(label)
                    print(f"Label: {label_name},label_num: {label} Folder: {folder_name}, Frame count: {len(json_files)}")
        return file_list, labels

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        all_keypoints = []
        
        json_file_list = self.data[index]
        label = self.labels[index]
        for file_path in json_file_list:
            with open(file_path, 'r') as f:
                data = json.load(f)
            frame_data = data['people']
            if frame_data:
                keypoints_2d = np.array(frame_data['face_keypoints_2d'] +
                                        frame_data['pose_keypoints_2d'] +
                                        frame_data['hand_left_keypoints_2d'] +
                                        frame_data['hand_right_keypoints_2d'])
                keypoints_2d = keypoints_2d.reshape(-1, 3)[:, :2].flatten()  # (num_keypoints * 2,)
                
                keypoints_3d = np.array(frame_data['face_keypoints_3d'] +
                                        frame_data['pose_keypoints_3d'] +
                                        frame_data['hand_left_keypoints_3d'] +
                                        frame_data['hand_right_keypoints_3d'])
                keypoints_3d = keypoints_3d.reshape(-1, 4)[:, :3].flatten() 
                
                keypoints = np.concatenate((keypoints_2d, keypoints_3d))
                all_keypoints.append(keypoints)
        
        all_keypoints = np.array(all_keypoints)  # (num_frames, num_keypoints * (2 + 3))
        return torch.tensor(all_keypoints, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [8]:
from torch.nn.utils.rnn import pad_sequence
# Collate 함수 정의
def collate_fn(batch):
    keypoints, labels = zip(*batch)
    keypoints = [torch.tensor(k) for k in keypoints]
    labels = torch.tensor(labels)
    keypoints_padded = pad_sequence(keypoints, batch_first=True, padding_value=0)
    
    lengths = torch.tensor([len(k) for k in keypoints])
    return keypoints_padded, labels, lengths

In [6]:
root_dir = "keypoints/01"

In [6]:
print(os.listdir(root_dir))
label = int((''.join(filter(str.isdigit, 'NIA_SL_WORD0101_REAL01_L')))[0:4])


['NIA_SL_WORD1731_REAL01_L', 'NIA_SL_WORD2775_REAL01_U', 'NIA_SL_WORD1948_REAL01_R', 'NIA_SL_WORD1083_REAL01_R', 'NIA_SL_WORD2918_REAL01_R', 'NIA_SL_WORD1709_REAL01_D', 'NIA_SL_WORD1075_REAL01_U', 'NIA_SL_WORD0223_REAL01_R', 'NIA_SL_WORD2794_REAL01_R', 'NIA_SL_WORD0687_REAL01_R', 'NIA_SL_WORD0019_REAL01_D', 'NIA_SL_WORD0438_REAL01_L', 'NIA_SL_WORD0293_REAL01_L', 'NIA_SL_WORD2787_REAL01_R', 'NIA_SL_WORD2515_REAL01_U', 'NIA_SL_WORD2600_REAL01_D', 'NIA_SL_WORD1123_REAL01_D', 'NIA_SL_WORD1025_REAL01_U', 'NIA_SL_WORD1923_REAL01_F', 'NIA_SL_WORD1533_REAL01_L', 'NIA_SL_WORD1081_REAL01_L', 'NIA_SL_WORD2814_REAL01_L', 'NIA_SL_WORD2298_REAL01_U', 'NIA_SL_WORD2050_REAL01_D', 'NIA_SL_WORD1890_REAL01_L', 'NIA_SL_WORD0141_REAL01_U', 'NIA_SL_WORD0973_REAL01_F', 'NIA_SL_WORD2832_REAL01_F', 'NIA_SL_WORD0164_REAL01_L', 'NIA_SL_WORD0462_REAL01_U', 'NIA_SL_WORD2981_REAL01_D', 'NIA_SL_WORD2439_REAL01_L', 'NIA_SL_WORD0457_REAL01_D', 'NIA_SL_WORD0958_REAL01_F', 'NIA_SL_WORD1274_REAL01_D', 'NIA_SL_WORD2596_RE

In [9]:
batch_size = 1
dataset = SignLanguageDataset(root_dir, label_mapping)
dataloader = DataLoader(dataset, batch_size, collate_fn=collate_fn)  
print(dataloader)


Label: ['하는수없다'],label_num: 2632 Folder: NIA_SL_WORD1923_REAL01_F, Frame count: 116
Label: ['못견디다'],label_num: 786 Folder: NIA_SL_WORD0973_REAL01_F, Frame count: 114
Label: ['이백'],label_num: 1994 Folder: NIA_SL_WORD2832_REAL01_F, Frame count: 95
Label: ['칩거'],label_num: 2496 Folder: NIA_SL_WORD0958_REAL01_F, Frame count: 107
Label: ['임기응변'],label_num: 2089 Folder: NIA_SL_WORD2310_REAL01_F, Frame count: 147
Label: ['남원시청'],label_num: 493 Folder: NIA_SL_WORD1857_REAL01_F, Frame count: 135
Label: ['잘못하다'],label_num: 2122 Folder: NIA_SL_WORD1336_REAL01_F, Frame count: 143
Label: ['소불고기'],label_num: 1383 Folder: NIA_SL_WORD0097_REAL01_F, Frame count: 115
Label: ['백억'],label_num: 955 Folder: NIA_SL_WORD2991_REAL01_F, Frame count: 102
Label: ['흑백'],label_num: 2755 Folder: NIA_SL_WORD0349_REAL01_F, Frame count: 101
Label: ['불알'],label_num: 1119 Folder: NIA_SL_WORD0385_REAL01_F, Frame count: 99
Label: ['구'],label_num: 295 Folder: NIA_SL_WORD2640_REAL01_F, Frame count: 89
Label: ['희미'],label_num

In [10]:
for batch_idx, (inputs, labels, lengths) in enumerate(dataloader):
    print(f"Batch {batch_idx}:")
    print(f"Inputs: {inputs}")
    print(f"Labels: {labels}")
    print(f"Lengths: {lengths}")
    print(f"Inputs shape: {inputs.shape}")
    print(f"Labels shape: {labels.shape}")
    
    # 배치 2개만 출력하고 종료
    if batch_idx == 1:
        break

Batch 0:
Inputs: tensor([[[ 9.0718e+02,  2.3556e+02,  9.0718e+02,  ..., -1.8762e-01,
           5.0495e-01,  2.4338e+00],
         [ 9.1722e+02,  2.3588e+02,  9.1722e+02,  ..., -1.2512e-02,
           1.3704e-01,  2.1267e+00],
         [ 9.0717e+02,  2.3335e+02,  9.0717e+02,  ..., -1.9075e-01,
           5.0099e-01,  2.4050e+00],
         ...,
         [ 9.1302e+02,  2.4398e+02,  9.1382e+02,  ...,  5.0804e-02,
          -1.0943e-01,  2.1172e+00],
         [ 9.0551e+02,  2.3500e+02,  9.0551e+02,  ..., -1.9046e-01,
           5.0617e-01,  2.4234e+00],
         [ 9.1141e+02,  2.3443e+02,  9.1054e+02,  ..., -5.4972e-02,
           2.4577e-01,  2.1373e+00]]])
Labels: tensor([2632])
Lengths: tensor([116])
Inputs shape: torch.Size([1, 116, 685])
Labels shape: torch.Size([1])
Batch 1:
Inputs: tensor([[[ 8.9625e+02,  2.1753e+02,  8.9678e+02,  ..., -5.8807e-03,
           4.9475e-02,  2.3519e+00],
         [ 8.9733e+02,  2.1875e+02,  8.9733e+02,  ..., -9.9854e-02,
           1.0286e-01,  2.1635e

  keypoints = [torch.tensor(k) for k in keypoints]


## 모델 선언 ##

In [13]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len=5000):
        super().__init__()
        
        self.dropout = nn.Dropout(dropout_p)

        # 최대 길이에 대한 Positional Encoding 생성
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, ...
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model)

        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Positional Encoding을 모델의 버퍼로 등록
        pos_encoding = pos_encoding.unsqueeze(0)
        self.register_buffer("pos_encoding", pos_encoding)

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        seq_len = token_embedding.size(1)
        pos_encoding = self.pos_encoding[:, :seq_len, :]
        return self.dropout(token_embedding + pos_encoding)

In [21]:
# 모델 하이퍼파라미터 설정
num_keypoints = 138  # face, pose, hand_left, hand_right keypoints 필요없어 
input_dim = 685 # 각 키포인트의 2D 좌표(2)와 3D 좌표(3)를 사용
model_dim = 512  # 모델 차원
num_heads = 16  # 멀티헤드 어텐션의 헤드 수
num_layers = 8  # Transformer 레이어 수
num_classes = 2771 # 출력 클래스 수

class TransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, num_classes, max_len = 5000):
        super(TransformerModel, self).__init__()
        self.input_fc = nn.Linear(input_dim, model_dim)
        encoder_layers = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads,batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        # Positional Encoding 추가
        self.positional_encoding = PositionalEncoding(dim_model=model_dim, dropout_p=0.1, max_len=max_len)

        self.fc = nn.Linear(model_dim, num_classes)
    
    def forward(self, x, src_key_padding_mask):
        x = self.input_fc(x)
        # Positional Encoding 적용
        x = self.positional_encoding(x)
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)  # transformer 대신 transformer_encoder 사용
        x = x.mean(dim=1)  # 시퀀스 차원 축소
        x = self.fc(x)
        return x
    
    

model = TransformerModel(input_dim, model_dim, num_heads, num_layers, num_classes)

In [15]:
# 손실 함수 및 옵티마이저 설정
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
model.train()
min_loss = 11
num_epochs =3
num_batches_to_train = 3000  # 학습시킬 데이터 수
for epoch in range(num_epochs):  # 1 에포크만 학습
    batch_count = 0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        ll = loss.item()
        if ll < min_loss :
            min_loss = ll
        batch_count += 1
        if batch_count >= num_batches_to_train:
            break
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print(min_loss)

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model.to(device)

min_loss = 100
num_epochs = 10
num_batches_to_train = 100

for epoch in range(num_epochs):
    model.train()
    batch_count = 0
    for inputs, labels, lengths in dataloader:
        
        mean = torch.mean(inputs)
        std = torch.std(inputs)
        inputs = (inputs - mean) / std
        
        inputs, labels = inputs.to(device), labels.to(device)  # 입력 데이터와 라벨을 GPU로 전송
        
        # 마스킹 생성
        src_key_padding_mask = (inputs.sum(dim=-1) == 0).to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs, src_key_padding_mask=src_key_padding_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        ll = loss.item()
        if ll < min_loss:
            min_loss = ll

        batch_count += inputs.size(0)
        if batch_count >= num_batches_to_train:
            break

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f'Minimum Loss: {min_loss:.4f}')

cuda


  keypoints = [torch.tensor(k) for k in keypoints]


Epoch [1/10], Loss: 8.1942
Epoch [2/10], Loss: 8.1991
Epoch [3/10], Loss: 8.2303
Epoch [4/10], Loss: 8.1868
Epoch [5/10], Loss: 8.1658
Epoch [6/10], Loss: 8.2344
Epoch [7/10], Loss: 8.1731
Epoch [8/10], Loss: 8.1985
Epoch [9/10], Loss: 8.2269
Epoch [10/10], Loss: 8.1965
Minimum Loss: 6.6264


In [23]:
def create_padding_mask(sequences, pad_token=0):
    return (sequences == pad_token)

# 모델을 평가 모드로 전환
model.eval()

# 평가 데이터를 로드 (학습 데이터와 동일한 데이터로 테스트하는 경우 예제)
#dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# 평가 지표 초기화
correct = 0
total = 0

# 평가 모드에서 그라디언트 계산 비활성화
with torch.no_grad():
    for inputs, labels, lengths in dataloader:
        # 모델 출력 계산
        inputs, labels = inputs.to(device), labels.to(device)
        
        # 패딩 마스크 생성
        src_key_padding_mask = create_padding_mask(inputs[:,:,0])
        src_key_padding_mask = src_key_padding_mask.to(device)
        
        outputs = model(inputs, src_key_padding_mask=src_key_padding_mask)

        # 소프트맥스 함수로 확률로 변환
        probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # 가장 높은 확률을 가진 클래스의 인덱스 구하기
        _, predicted_classes = torch.max(probabilities, 1)
        
        
        
        # 정확도 계산
        total += labels.size(0)
        correct += (predicted_classes == labels).sum().item()
        
        
        # 예측된 클래스 인덱스를 원래 라벨로 변환
        for predicted_class, label in zip(predicted_classes, labels):
            predicted_label = word_list[predicted_class.item()]
            # 출력
            print(f"Actual Label (encoded): {label.item()}")
            print(f"Predicted Label (encoded): {predicted_class.item()}")
            print(f"Predicted Label: {predicted_label}")
            print('-' * 30)
            
# 전체 정확도 출력
accuracy = 100 * correct / total
print(f'Accuracy of the model on the test data: {accuracy:.2f}%')
        

  keypoints = [torch.tensor(k) for k in keypoints]
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Actual Label (encoded): 2632
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 786
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 1994
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 2496
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 2089
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 493
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 2122
Predicted Label (encoded): 1408
Predicted Label: 을지로3가
------------------------------
Actual Label (encoded): 1383
Predicted Label (encoded): 2382
Predicted Label: 제법
------------------------------
Actual Label (encoded): 955
Predicted Label (encoded): 2382
Predicted Label: 제법
-----------------------

KeyboardInterrupt: 