In [7]:
import cv2
import mediapipe as mp
import numpy as np
import torch
import pickle
import torch.nn as nn

In [8]:
# input : MediaPipe hand keypoints, shape (21, 4)
# output: joint angles in degrees, shape (1, 15)
def cal_angle(joint):
    """Compute the 15 angles between consecutive bones of one hand, in degrees.

    Args:
        joint: array-like of shape (21, 4), one row per hand landmark. Only the
            first three columns are used as coordinates; the fourth is ignored.

    Returns:
        np.ndarray of shape (1, 15) and dtype float32. Each value is the angle
        between two adjacent bones of the same finger (three per finger, five
        fingers). If either bone of a pair has zero length the angle is
        reported as 0 instead of NaN.
    """
    joint = np.asarray(joint, dtype=np.float64)

    # Bone vectors: child landmark minus parent landmark.
    parent_idx = [0, 1, 2, 3, 0, 5, 6, 7, 0, 9, 10, 11, 0, 13, 14, 15, 0, 17, 18, 19]
    child_idx = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    v = joint[child_idx, :3] - joint[parent_idx, :3]  # [20, 3]

    # Normalise each bone to unit length. Zero-length bones stay as zero
    # vectors instead of triggering a division by zero.
    norms = np.linalg.norm(v, axis=1, keepdims=True)  # [20, 1]
    has_length = norms[:, 0] > 0
    v = np.divide(v, norms, out=np.zeros_like(v), where=norms > 0)

    # Angle between each bone and the next bone along the same finger.
    first_bone = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18]
    second_bone = [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19]
    cos_angle = np.einsum('nt,nt->n', v[first_bone, :], v[second_bone, :])  # [15,]

    # Rounding can push the dot product of unit vectors slightly outside
    # [-1, 1], which would make arccos return NaN.
    cos_angle = np.clip(cos_angle, -1.0, 1.0)

    # Radians -> degrees.
    angle = np.degrees(np.arccos(cos_angle))

    # A pair that involves a degenerate bone has no meaningful angle.
    angle[~(has_length[first_bone] & has_length[second_bone])] = 0.0

    return np.array([angle], dtype=np.float32)

In [9]:

def extract_keypoints(results):
    """Flatten both hands' landmarks and joint angles into one feature vector.

    Args:
        results: object exposing `left_hand_landmarks` and
            `right_hand_landmarks`. Each is either falsy (hand not detected)
            or has a `.landmark` iterable whose items carry `x`, `y`, `z`
            and `visibility` attributes.

    Returns:
        1-D np.ndarray laid out as
        [left joints (21*4), right joints (21*4), left angles, right angles].
        With `cal_angle` returning 15 angles per hand this is 198 values.
        A hand that was not detected contributes zeros.
    """
    hand_joints = []
    hand_angles = []

    # Left hand first, then right: the order fixes the feature layout.
    for attr_name in ("left_hand_landmarks", "right_hand_landmarks"):
        hand = getattr(results, attr_name)
        coords = np.zeros((21, 4))

        if hand:
            # Copy every landmark into its row, then derive the bend angles.
            for row, point in enumerate(hand.landmark):
                coords[row] = [point.x, point.y, point.z, point.visibility]
            bends = cal_angle(coords)
        else:
            # Hand missing in this frame: zero joints and zero angles.
            bends = [np.zeros((15,))]

        hand_joints.append(coords)
        hand_angles.append(bends)

    frame_angle = np.concatenate(hand_angles)

    pieces = [hand_joints[0].flatten(), hand_joints[1].flatten(), frame_angle.flatten()]
    return np.concatenate(pieces)


In [10]:
# Model definition (copied from train_transformer.ipynb)

class Transformer(nn.Module):
    """Sequence classifier: linear embedding + Transformer encoder + mean pooling.

    Args:
        num_angles: size of the per-frame feature vector fed to the embedding.
        num_classes: number of output classes.
        seq_len: length of the learned positional parameter; the input's
            time dimension must be broadcast-compatible with it.
        d_model: embedding / encoder width.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dropout: dropout rate inside each encoder layer.
    """

    def __init__(self, num_angles, num_classes, seq_len=60, d_model=128, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        # Projects each frame's features to d_model dimensions.
        self.embedding = nn.Linear(in_features=num_angles, out_features=d_model)
        # Learned positional term, initialised to zeros.
        self.pos_encoder = nn.Parameter(torch.zeros(1, seq_len, d_model))
        # NOTE(review): batch_first is not passed, so PyTorch's default (False)
        # applies and the encoder treats dim 0 as the sequence axis, while the
        # comments in forward() describe the input as (B, T, d_model).
        # Left as-is to match the trained checkpoint -- confirm against the
        # training notebook.
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        # Classification head applied to the pooled representation.
        self.fc = nn.Linear(in_features=d_model, out_features=num_classes)

    def forward(self, x):
        """Return class scores of shape (B, num_classes) for input x."""
        embedded = self.embedding(x) + self.pos_encoder  # B, T, d_model
        encoded = self.transformer_encoder(embedded)  # B, T, d_model
        pooled = torch.mean(encoded, dim=1)  # average over dim 1 -> (B, d_model)
        return self.fc(pooled)  # (B, num_classes)
    

In [11]:
### Load the label -> class-index mapping
import pickle

# NOTE: pickle.load can execute arbitrary code; only open a mapping file you trust.
with open('../data/label_to_idx.pickle', 'rb') as f:
    label_to_idx = pickle.load(f)
print(label_to_idx)

# Reverse lookup: class index -> label.
idx_to_label = {class_idx: label for label, class_idx in label_to_idx.items()}

# Per-frame feature length: 2 hands * 21 landmarks * 4 values + 2 * 15 angles,
# i.e. the layout produced by extract_keypoints.
num_angles = 198
num_classes = len(label_to_idx)
# Number of frames in one prediction window.
frame = 60

{'0': 0, '1': 1, '10': 2, '100': 3, '1000': 4, '10000': 5, '11': 6, '112': 7, '119': 8, '12': 9, '13': 10, '14': 11, '15': 12, '16': 13, '17': 14, '18': 15, '19': 16, '2': 17, '20': 18, '21': 19, '22': 20, '23': 21, '24': 22, '25': 23, '26': 24, '27': 25, '28': 26, '29': 27, '3': 28, '30': 29, '31': 30, '32': 31, '33': 32, '34': 33, '35': 34, '36': 35, '37': 36, '38': 37, '39': 38, '4': 39, '40': 40, '41': 41, '42': 42, '43': 43, '44': 44, '45': 45, '46': 46, '47': 47, '48': 48, '49': 49, '5': 50, '50': 51, '51': 52, '52': 53, '53': 54, '54': 55, '55': 56, '56': 57, '57': 58, '58': 59, '59': 60, '6': 61, '60': 62, '61': 63, '62': 64, '63': 65, '64': 66, '65': 67, '66': 68, '67': 69, '68': 70, '69': 71, '7': 72, '70': 73, '71': 74, '72': 75, '73': 76, '74': 77, '75': 78, '76': 79, '77': 80, '78': 81, '79': 82, '8': 83, '80': 84, '81': 85, '82': 86, '83': 87, '84': 88, '85': 89, '86': 90, '87': 91, '88': 92, '89': 93, '9': 94, '90': 95, '91': 96, '92': 97, '93': 98, '94': 99, '95': 100, 

In [None]:
# Live webcam inference: extract hand features from every frame, classify the
# most recent `frame` frames with the trained Transformer, and show the
# annotated video. Press 'q' in the video window to stop.

### Load the trained model ###
# Done before the camera is opened so a missing checkpoint does not leave the
# capture device held by the kernel.
graph_args = {"layout": "mediapipe", "strategy": "spatial"}  # not read in this cell
loaded_model = Transformer(num_angles=num_angles, num_classes=num_classes)
loaded_model.load_state_dict(torch.load("../model/transformer_60fps.pth", map_location=torch.device('cpu')))
# Inference mode: without this, dropout stays active and predictions are noisy.
loaded_model.eval()

# MediaPipe Holistic setup
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()
mp_draw = mp.solutions.drawing_utils
mp_draw_styles = mp.solutions.drawing_styles

# Video source
cap = cv2.VideoCapture(0)

# Request a square webcam frame size.
frame_size = 640
cap.set(cv2.CAP_PROP_FRAME_WIDTH, frame_size)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_size)

keypoint_sequence = []

sentence = [" ", ]

try:
    # Capture and process frames. MediaPipe expects RGB, OpenCV delivers BGR.
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            # No frame was delivered; stop instead of handing None to cvtColor.
            break

        imageRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = holistic.process(imageRGB)

        keypoints = extract_keypoints(results)
        keypoint_sequence.append(keypoints)
        # Keep only the last `frame` entries so the buffer cannot grow forever.
        del keypoint_sequence[:-frame]
        sequence = keypoint_sequence  # the window used for prediction

        if len(sequence) == frame:  # a full window is available
            # Stack into one array first (building a tensor from a list of
            # arrays is slow), then add the batch dimension explicitly.
            model_input = torch.as_tensor(np.stack(sequence), dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                output = loaded_model(model_input)  # (batch, class)
            prediction = torch.argmax(output, dim=1)
            prediction_value = prediction.item()

            # `output` holds the raw scores of the final linear layer, so 30 is
            # a threshold on the unnormalised score, not a probability.
            if output[0, prediction_value] > 30:
                # Only record a label when it differs from the previous one.
                if idx_to_label[prediction_value] != sentence[-1]:
                    sentence.append(idx_to_label[prediction_value])

                    print('prediction ', prediction_value, ':', idx_to_label[prediction_value])
                    print('acc ', output[0, prediction_value]) # (batch, class)

        # Draw the detected hand landmarks on a copy of the frame.
        annotated_image = image.copy()
        mp_draw.draw_landmarks(annotated_image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_draw.draw_landmarks(annotated_image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        cv2.imshow('output', annotated_image)
        # waitKey also services the GUI event loop; 'q' ends the session.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Runs on normal exit, on errors, and on a kernel interrupt, so the camera
    # and the MediaPipe graph are always released.
    cap.release()
    holistic.close()
    cv2.destroyAllWindows()



I0000 00:00:1742795460.282089 5948793 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2 Max
W0000 00:00:1742795460.355451 5952195 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742795460.366601 5952195 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742795460.368907 5952192 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742795460.368950 5952198 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742795460.369132 5952200 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling supp

KeyboardInterrupt: 

: 