# 사진 지문자 인식 모델 (Full Body)

**목표**: 사진/실시간 카메라로 지문자(ㄱ,ㄴ,ㄷ,...,ㅏ,ㅓ,ㅗ,...) 인식

**키포인트 구조 (411차원 - 기존 영상 모델과 동일)**:
- Pose: 25 landmarks * 3 = 75
- Face: 70 landmarks * 3 = 210
- Left Hand: 21 landmarks * 3 = 63
- Right Hand: 21 landmarks * 3 = 63
- **Total: 411 차원**

**모델 구조**:
1. MediaPipe 키포인트 추출 (Pose + Face + Hands) + MLP 분류
2. CNN 이미지 분류 (EfficientNet)
3. 앙상블 (두 모델 결합)
4. 실시간 카메라 인식

**데이터**: `/Users/garyeong/project-1/사진_지문자/` (파일명 = 라벨)

In [None]:
# Cell 1: 라이브러리 설치 및 임포트
# !pip install mediapipe opencv-python torch torchvision timm pillow

import os
import cv2
import numpy as np
import mediapipe as mp
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
from collections import Counter
import glob
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"사용 디바이스: {device}")

# Input image folder and output folder for trained weights.
DATA_DIR = "/Users/garyeong/project-1/사진_지문자"
MODEL_DIR = "/Users/garyeong/project-1/morpheme/photo_model"

print(f"데이터 경로: {DATA_DIR}")
print(f"모델 저장 경로: {MODEL_DIR}")

# Short aliases for the MediaPipe solution modules used by later cells.
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose

# Flattened feature sizes: each landmark contributes three values.
HAND_DIM = 63      # 21 landmarks * 3 (per hand)
FACE_DIM = 210     # 70 landmarks * 3
POSE_DIM = 75      # 25 landmarks * 3
TOTAL_DIM = POSE_DIM + FACE_DIM + 2 * HAND_DIM  # 411

print(f"\n키포인트 차원: {TOTAL_DIM} (Pose:{POSE_DIM} + Face:{FACE_DIM} + LHand:{HAND_DIM} + RHand:{HAND_DIM})")

In [None]:
# Cell 2: 데이터 로드 및 확인

# Collect every png/jpg/jpeg directly under DATA_DIR, in sorted order.
image_files = sorted(
    path
    for pattern in ("*.png", "*.jpg", "*.jpeg")
    for path in glob.glob(os.path.join(DATA_DIR, pattern))
)

print(f"발견된 이미지: {len(image_files)}개")

# The label of an image is its file name without the extension.
labels = [os.path.splitext(os.path.basename(path))[0] for path in image_files]

# Label statistics.
unique_labels = sorted(set(labels))
print(f"\n고유 라벨 수: {len(unique_labels)}개")
print(f"라벨 목록: {unique_labels}")

# Bidirectional label <-> class-index maps (index = position in sorted order).
num_classes = len(unique_labels)
idx_to_label = dict(enumerate(unique_labels))
label_to_idx = {name: position for position, name in idx_to_label.items()}

print(f"\n클래스 수: {num_classes}")
print(f"라벨 매핑: {label_to_idx}")

In [None]:
# Cell 3: Full Body 키포인트 추출 함수 (Pose + Face + Hands = 411차원)

def extract_full_keypoints(image, pose, hands, face_mesh):
    """
    Extract a flat 411-value keypoint vector from one BGR image.

    Layout of the returned vector: Pose(75) + Face(210) + LHand(63) + RHand(63).
    Any part that is not detected is filled with zeros.

    Args:
        image: BGR image (converted to RGB before being handed to MediaPipe).
        pose, hands, face_mesh: objects exposing ``process(rgb_image)``;
            the notebook passes MediaPipe Pose / Hands / FaceMesh instances.

    Returns:
        keypoints: float32 array of the concatenated parts.
        detection_info: dict with boolean flags 'pose', 'face', 'left_hand',
            'right_hand' telling which parts were detected.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Run the three detectors on the same RGB frame.
    pose_results = pose.process(rgb_image)
    hands_results = hands.process(rgb_image)
    face_results = face_mesh.process(rgb_image)

    detection_info = dict.fromkeys(('pose', 'face', 'left_hand', 'right_hand'), False)

    # 1. Pose: first 25 landmarks as (x, y, visibility), zero-padded to 25.
    pose_kps = [0.0] * POSE_DIM
    if pose_results.pose_landmarks:
        detection_info['pose'] = True
        pose_points = pose_results.pose_landmarks.landmark
        pose_kps = []
        for i in range(min(25, len(pose_points))):
            point = pose_points[i]
            pose_kps += [point.x, point.y, point.visibility]
        pose_kps += [0.0] * (25 * 3 - len(pose_kps))

    # 2. Face: first 70 landmarks of the first face as (x, y, z), zero-padded.
    face_kps = [0.0] * FACE_DIM
    if face_results.multi_face_landmarks:
        detection_info['face'] = True
        face_points = face_results.multi_face_landmarks[0].landmark
        face_kps = []
        for i in range(min(70, len(face_points))):
            point = face_points[i]
            face_kps += [point.x, point.y, point.z]
        face_kps += [0.0] * (70 * 3 - len(face_kps))

    # 3. Hands: one slot per handedness label; a later detection with the same
    #    label overwrites an earlier one.
    # NOTE(review): the original comment said mirroring is taken into account,
    # but the label reported by MediaPipe is used as-is (no Left/Right swap).
    # Confirm this matches the convention of the existing video model.
    hand_slots = {"Left": [0.0] * HAND_DIM, "Right": [0.0] * HAND_DIM}
    flag_names = {"Left": 'left_hand', "Right": 'right_hand'}

    for idx, hand_landmarks in enumerate(hands_results.multi_hand_landmarks or ()):
        side = hands_results.multi_handedness[idx].classification[0].label
        flat = []
        for point in hand_landmarks.landmark:
            flat += [point.x, point.y, point.z]
        if side in hand_slots:
            hand_slots[side] = flat
            detection_info[flag_names[side]] = True

    # Concatenate in the fixed order pose, face, left hand, right hand.
    combined_kps = np.asarray(
        pose_kps + face_kps + hand_slots["Left"] + hand_slots["Right"],
        dtype=np.float32,
    )

    return combined_kps, detection_info

# Confirmation banner with the output layout of the extractor.
print(
    "=== Full Body 키포인트 추출 함수 정의 완료 ===",
    f"출력 차원: {TOTAL_DIM} (Pose:{POSE_DIM} + Face:{FACE_DIM} + LHand:{HAND_DIM} + RHand:{HAND_DIM})",
    sep="\n",
)

In [None]:
# Cell 4: extract keypoints from every image

print("=== 이미지에서 키포인트 추출 시작 ===")

all_keypoints = []        # one 411-value vector per successfully loaded image
all_labels_idx = []       # class index for each vector
all_images_for_cnn = []   # matching image path, reused later by the CNN dataset
detection_stats = {'pose': 0, 'face': 0, 'left_hand': 0, 'right_hand': 0}

# The detectors are opened as context managers so they are closed even if
# extraction fails part-way through.
with mp_pose.Pose(
    static_image_mode=True,
    model_complexity=2,
    enable_segmentation=False,
    min_detection_confidence=0.5
) as pose, mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=2,
    model_complexity=1,
    min_detection_confidence=0.5
) as hands, mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
) as face_mesh:

    for idx, img_path in enumerate(image_files):
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            print(f"  [SKIP] 로드 실패: {img_path}")
            continue

        keypoints, detection_info = extract_full_keypoints(image, pose, hands, face_mesh)

        # The file name without extension is the label.
        filename = os.path.basename(img_path)
        label = os.path.splitext(filename)[0]
        label_idx = label_to_idx[label]

        all_keypoints.append(keypoints)
        all_labels_idx.append(label_idx)
        all_images_for_cnn.append(img_path)

        # Count how often each body part was detected.
        for key in detection_stats:
            if detection_info.get(key, False):
                detection_stats[key] += 1

        # Progress line every 10 images and on the last one.
        if (idx + 1) % 10 == 0 or idx == len(image_files) - 1:
            print(f"  진행: {idx + 1}/{len(image_files)}")

# Stack into (num_samples, TOTAL_DIM); the reshape keeps the array 2-D even
# when no image could be loaded.
all_keypoints = np.array(all_keypoints, dtype=np.float32).reshape(-1, TOTAL_DIM)

num_samples = len(all_keypoints)
print(f"\n=== 키포인트 추출 완료 ===")
print(f"총 샘플: {num_samples}개")
print(f"키포인트 shape: {all_keypoints.shape}")
print(f"\n감지 통계:")
for title, key in (("Pose", 'pose'), ("Face", 'face'),
                   ("Left Hand", 'left_hand'), ("Right Hand", 'right_hand')):
    hits = detection_stats[key]
    # Avoid ZeroDivisionError when nothing was loaded.
    percent = 100 * hits / num_samples if num_samples else 0.0
    print(f"  - {title}: {hits}/{num_samples} ({percent:.1f}%)")

In [None]:
# Cell 5: 데이터 증강 및 가중치 적용 (손에 0.8 가중치)

# Per-part multipliers applied to the keypoint vector.
HAND_WEIGHT = 0.8   # hands (most important)
FACE_WEIGHT = 0.1   # face
POSE_WEIGHT = 0.1   # body

print("=== 부위별 가중치 ===")
for caption, weight in (
    ("Pose (몸)", POSE_WEIGHT),
    ("Face (얼굴)", FACE_WEIGHT),
    ("Hands (손)", HAND_WEIGHT),
):
    print(f"  - {caption}: {weight}")

def apply_body_part_weights(keypoints):
    """
    Return a copy of a keypoint vector with each body-part segment scaled.

    Segments follow the Pose / Face / Left hand / Right hand layout and are
    multiplied by POSE_WEIGHT, FACE_WEIGHT, HAND_WEIGHT and HAND_WEIGHT
    respectively. The input array is not modified.
    """
    weighted_kp = keypoints.copy()

    # Segment boundaries along the flat vector.
    face_start = POSE_DIM
    lhand_start = face_start + FACE_DIM
    rhand_start = lhand_start + HAND_DIM

    segments = (
        (0, face_start, POSE_WEIGHT),
        (face_start, lhand_start, FACE_WEIGHT),
        (lhand_start, rhand_start, HAND_WEIGHT),
        (rhand_start, TOTAL_DIM, HAND_WEIGHT),
    )
    for begin, end, weight in segments:
        weighted_kp[begin:end] *= weight

    return weighted_kp

def augment_keypoints_411(keypoints, num_augment=20):
    """
    Build augmented variants of one flat keypoint vector.

    The vector is read as consecutive (x, y, third) triples. Each variant is
    produced by, in this order:
      1. adding Gaussian noise (sigma 0.015) to every value,
      2. scaling x/y by a factor in [0.95, 1.05] about the point (0.5, 0.5),
      3. rotating x/y by an angle in [-10, 10] degrees about (0.5, 0.5).
    The third value of each triple only receives the noise.

    Because the pivot is fixed at (0.5, 0.5), the input is expected to hold
    image-normalised coordinates (that is what the extractor produces).

    Args:
        keypoints: 1-D float array; not modified.
        num_augment: number of random variants to generate.

    Returns:
        float32 array of shape (num_augment + 1, len(keypoints)); row 0 is an
        unmodified copy of the input.
    """
    pivot = 0.5

    # Index the x and y component of every complete (x, y, ...) pair. A
    # trailing lone value (length not a multiple of 3) is left untouched.
    n_pairs = keypoints[1::3].shape[0]
    x_idx = slice(0, 3 * n_pairs, 3)
    y_idx = slice(1, None, 3)

    augmented = [keypoints.copy()]

    for _ in range(num_augment):
        kp = keypoints.copy()

        # 1. Additive noise on every component.
        kp += np.random.normal(0, 0.015, kp.shape)

        # 2 + 3. Draw the scale first, then the angle (keeps the random
        # stream in the same order as the element-wise implementation).
        scale = np.random.uniform(0.95, 1.05)
        angle = np.random.uniform(-10, 10) * np.pi / 180
        cos_a, sin_a = np.cos(angle), np.sin(angle)

        # Offsets from the pivot after scaling, for all landmarks at once.
        dx = (kp[x_idx] - pivot) * scale
        dy = (kp[y_idx] - pivot) * scale

        kp[x_idx] = dx * cos_a - dy * sin_a + pivot
        kp[y_idx] = dx * sin_a + dy * cos_a + pivot

        augmented.append(kp)

    return np.array(augmented, dtype=np.float32)

# Apply augmentation to every extracted sample.
print("\n=== 데이터 증강 시작 (411차원) ===")
NUM_AUGMENT = 30  # variants per sample (the dataset is small)

augmented_keypoints = []
augmented_labels = []
augmented_image_paths = []

for kp, label_idx, img_path in zip(all_keypoints, all_labels_idx, all_images_for_cnn):
    # 1. Augment in the raw, image-normalised coordinate space. The
    #    augmentation scales and rotates about (0.5, 0.5), which is only the
    #    image centre before the body-part weights shrink the coordinates.
    aug_kps = augment_keypoints_411(kp, NUM_AUGMENT)

    for aug_kp in aug_kps:
        # 2. Apply the body-part weights afterwards. Row 0 is then exactly
        #    weights(original), the same transform used at inference time.
        augmented_keypoints.append(apply_body_part_weights(aug_kp))
        augmented_labels.append(label_idx)
        augmented_image_paths.append(img_path)

# NOTE(review): the normalisation cell standardises every feature column
# independently, which cancels any constant per-column multiplier. The weights
# therefore may not change what the classifier sees - confirm the intent.

# Tensor conversion (reshape keeps the matrix 2-D even with zero samples).
X_keypoints_aug = torch.tensor(
    np.array(augmented_keypoints, dtype=np.float32).reshape(-1, TOTAL_DIM),
    dtype=torch.float32,
)
Y_labels_aug = torch.tensor(augmented_labels, dtype=torch.long)

num_originals = len(all_keypoints)
# Guard the ratio so an empty dataset does not raise ZeroDivisionError.
ratio = len(augmented_keypoints) // num_originals if num_originals else 0

print(f"증강 완료!")
print(f"  - 원본: {num_originals}개")
print(f"  - 증강 후: {len(augmented_keypoints)}개")
print(f"  - 증강 배율: {ratio}x")
print(f"\nX_keypoints_aug shape: {X_keypoints_aug.shape}")
print(f"Y_labels_aug shape: {Y_labels_aug.shape}")
print(f"\n손 가중치 0.8 적용됨!")

In [None]:
# Cell 6: 키포인트 정규화

# Standardise every feature column to zero mean / unit standard deviation.
# The small epsilon keeps the division finite for constant columns.
kp_mean = X_keypoints_aug.mean(dim=0, keepdim=True)
kp_std = X_keypoints_aug.std(dim=0, keepdim=True) + 1e-8

X_keypoints_norm = (X_keypoints_aug - kp_mean) / kp_std

print("=== 키포인트 정규화 완료 (411차원) ===")
for caption, tensor in (("정규화 전", X_keypoints_aug), ("정규화 후", X_keypoints_norm)):
    print(f"{caption} - Mean: {tensor.mean():.4f}, Std: {tensor.std():.4f}")

# Keep the statistics: inference must apply the same transform.
norm_params = dict(mean=kp_mean, std=kp_std)

print(f"\n정규화 파라미터 저장됨 (추론 시 사용)")

In [None]:
# Cell 7: 키포인트 분류 모델 (MLP - 411차원 입력)

class KeypointClassifier(nn.Module):
    """
    MLP that classifies finger-alphabet signs from a flat keypoint vector.

    Input:  (batch, input_size) - by default 411 values,
            Pose(75) + Face(210) + LHand(63) + RHand(63)
    Output: (batch, num_classes) logits

    Four hidden stages (512, 256, 128, 64 units), each being
    Linear -> BatchNorm1d -> ReLU -> Dropout, followed by a Linear head.
    """

    def __init__(self, input_size=411, num_classes=28, dropout=0.3):
        super().__init__()

        widths = (input_size, 512, 256, 128, 64)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Classification head: 64 -> num_classes.
        stack.append(nn.Linear(widths[-1], num_classes))

        # Kept as a single Sequential named ``model`` so parameter names
        # (model.0.weight, ...) stay the same for saved checkpoints.
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return class logits for a batch of keypoint vectors."""
        return self.model(x)

    def get_features(self, x):
        """Return the activations fed into the final Linear layer."""
        return self.model[:-1](x)

# Instantiate the keypoint classifier on the selected device.
model_kp = KeypointClassifier(
    input_size=TOTAL_DIM,
    num_classes=num_classes,
    dropout=0.3,
).to(device)

print("=== 키포인트 분류 모델 (411차원 입력) ===")
print(model_kp)
print(f"\n입력 차원: {TOTAL_DIM}")
print(f"출력 클래스: {num_classes}")
# Count only parameters that receive gradients.
print(f"파라미터 수: {sum(w.numel() for w in model_kp.parameters() if w.requires_grad):,}")

In [None]:
# Cell 8: 키포인트 모델 학습
from torch.utils.data import TensorDataset

# Build the dataset from the normalised keypoints.
dataset_kp = TensorDataset(X_keypoints_norm, Y_labels_aug)

# Train/validation split (80/20).
# NOTE(review): the split is over augmented rows, so variants of the same
# source image can land in both subsets; validation accuracy is therefore
# likely optimistic.
train_size = int(0.8 * len(dataset_kp))
val_size = len(dataset_kp) - train_size
train_dataset_kp, val_dataset_kp = random_split(dataset_kp, [train_size, val_size])

# DataLoaders
batch_size = 32
train_loader_kp = DataLoader(train_dataset_kp, batch_size=batch_size, shuffle=True)
val_loader_kp = DataLoader(val_dataset_kp, batch_size=batch_size, shuffle=False)

print(f"=== 데이터셋 분할 ===")
print(f"  - 학습: {len(train_dataset_kp)}개")
print(f"  - 검증: {len(val_dataset_kp)}개")

# Loss, optimiser and LR schedule (halve the LR after 10 epochs without
# improvement of the validation loss).
criterion = nn.CrossEntropyLoss()
optimizer_kp = optim.Adam(model_kp.parameters(), lr=0.001, weight_decay=1e-4)
scheduler_kp = optim.lr_scheduler.ReduceLROnPlateau(optimizer_kp, mode='min', factor=0.5, patience=10)

# The checkpoint directory is not created anywhere else; torch.save fails
# if it does not exist.
os.makedirs(MODEL_DIR, exist_ok=True)

# Training
num_epochs = 100
best_val_acc = 0

print(f"\n=== 키포인트 모델 학습 시작 (411차원) ===")
print(f"Epochs: {num_epochs}")

for epoch in range(num_epochs):
    # --- training pass ---
    model_kp.train()
    train_loss = 0
    train_correct = 0

    for batch_x, batch_y in train_loader_kp:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer_kp.zero_grad()
        outputs = model_kp(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer_kp.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == batch_y).sum().item()

    train_loss /= len(train_loader_kp)
    train_acc = 100 * train_correct / len(train_dataset_kp)

    # --- validation pass (no gradients, eval-mode BatchNorm/Dropout) ---
    model_kp.eval()
    val_loss = 0
    val_correct = 0

    with torch.no_grad():
        for batch_x, batch_y in val_loader_kp:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model_kp(batch_x)
            loss = criterion(outputs, batch_y)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == batch_y).sum().item()

    val_loss /= len(val_loader_kp)
    val_acc = 100 * val_correct / len(val_dataset_kp)

    scheduler_kp.step(val_loss)

    # Log every 10th epoch and whenever validation accuracy improves; a new
    # best also overwrites the checkpoint.
    if (epoch + 1) % 10 == 0 or val_acc > best_val_acc:
        print(f"Epoch {epoch+1:3d}/{num_epochs} | "
              f"Loss: {train_loss:.4f}/{val_loss:.4f} | "
              f"Acc: {train_acc:.1f}%/{val_acc:.1f}%", end="")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model_kp.state_dict(), os.path.join(MODEL_DIR, 'model_keypoint_411_best.pth'))
            print(f" <- BEST")
        else:
            print()

print(f"\n=== 학습 완료 ===")
print(f"Best Validation Accuracy: {best_val_acc:.2f}%")

In [None]:
# Cell 9: CNN 이미지 분류 모델 (EfficientNet)
import timm

class ImageClassifier(nn.Module):
    """
    Image-based finger-alphabet classifier.

    An 'efficientnet_b0' backbone created through timm with ``num_classes=0``
    feeds a small head: Linear(1280, 256) -> ReLU -> Dropout(0.3) ->
    Linear(256, num_classes).
    """

    def __init__(self, num_classes=28, pretrained=True):
        super().__init__()

        # Feature extractor (the head expects 1280 features from it).
        self.backbone = timm.create_model(
            'efficientnet_b0',
            pretrained=pretrained,
            num_classes=0,
        )

        # Classification head.
        head_layers = [
            nn.Linear(1280, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for a batch of images."""
        return self.classifier(self.backbone(x))

    def get_features(self, x):
        """Return the backbone features (input of the classification head)."""
        return self.backbone(x)

# Instantiate the CNN classifier (pretrained backbone) on the selected device.
model_cnn = ImageClassifier(num_classes=num_classes, pretrained=True).to(device)

print("=== CNN 이미지 분류 모델 ===")
print(f"백본: EfficientNet-B0")
# Count only parameters that receive gradients.
print(f"파라미터 수: {sum(w.numel() for w in model_cnn.parameters() if w.requires_grad):,}")

In [None]:
# Cell 10: CNN 데이터셋 정의 및 학습

class FingerAlphabetImageDataset(Dataset):
    """
    Finger-alphabet image dataset for the CNN.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of labels, aligned with ``image_paths``.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``."""
        # Open through a context manager so the underlying file handle is
        # released deterministically; convert() returns an independent image.
        with Image.open(self.image_paths[idx]) as raw:
            image = raw.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, self.labels[idx]

# Image preprocessing: random augmentation for training ...
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ... and a deterministic pipeline for validation.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Only the original images are used (no keypoint-style augmentation).
# Two views over the same files: the validation subset must not go through
# the random training augmentation, otherwise val loss/accuracy are noisy.
dataset_cnn = FingerAlphabetImageDataset(all_images_for_cnn, all_labels_idx, transform=train_transform)
dataset_cnn_eval = FingerAlphabetImageDataset(all_images_for_cnn, all_labels_idx, transform=val_transform)

# Train/validation split (80/20) via one shared random permutation.
# NOTE(review): labels come from file names, so a class that has a single
# image ends up in only one of the two subsets.
train_size = int(0.8 * len(dataset_cnn))
val_size = len(dataset_cnn) - train_size
split_order = torch.randperm(len(dataset_cnn)).tolist()
train_dataset_cnn = torch.utils.data.Subset(dataset_cnn, split_order[:train_size])
val_dataset_cnn = torch.utils.data.Subset(dataset_cnn_eval, split_order[train_size:])

# DataLoaders
train_loader_cnn = DataLoader(train_dataset_cnn, batch_size=8, shuffle=True)
val_loader_cnn = DataLoader(val_dataset_cnn, batch_size=8, shuffle=False)

print(f"=== CNN 데이터셋 ===")
print(f"  - 전체: {len(dataset_cnn)}개")
print(f"  - 학습: {len(train_dataset_cnn)}개")
print(f"  - 검증: {len(val_dataset_cnn)}개")

# Loss, optimiser and LR schedule (halve the LR after 5 epochs without
# improvement of the validation loss).
criterion_cnn = nn.CrossEntropyLoss()
optimizer_cnn = optim.Adam(model_cnn.parameters(), lr=0.0001, weight_decay=1e-4)
scheduler_cnn = optim.lr_scheduler.ReduceLROnPlateau(optimizer_cnn, mode='min', factor=0.5, patience=5)

# The checkpoint directory is not created anywhere else; torch.save fails
# if it does not exist.
os.makedirs(MODEL_DIR, exist_ok=True)

# Training
num_epochs_cnn = 50
best_val_acc_cnn = 0

print(f"\n=== CNN 모델 학습 시작 ===")
print(f"Epochs: {num_epochs_cnn}")

for epoch in range(num_epochs_cnn):
    # --- training pass ---
    model_cnn.train()
    train_loss = 0
    train_correct = 0

    for batch_x, batch_y in train_loader_cnn:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer_cnn.zero_grad()
        outputs = model_cnn(batch_x)
        loss = criterion_cnn(outputs, batch_y)
        loss.backward()
        optimizer_cnn.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == batch_y).sum().item()

    train_loss /= len(train_loader_cnn)
    train_acc = 100 * train_correct / len(train_dataset_cnn)

    # --- validation pass ---
    model_cnn.eval()
    val_loss = 0
    val_correct = 0

    with torch.no_grad():
        for batch_x, batch_y in val_loader_cnn:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model_cnn(batch_x)
            loss = criterion_cnn(outputs, batch_y)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == batch_y).sum().item()

    val_loss /= len(val_loader_cnn)
    val_acc = 100 * val_correct / len(val_dataset_cnn)

    scheduler_cnn.step(val_loss)

    # Log every 5th epoch and whenever validation accuracy improves; a new
    # best also overwrites the checkpoint.
    if (epoch + 1) % 5 == 0 or val_acc > best_val_acc_cnn:
        print(f"Epoch {epoch+1:3d}/{num_epochs_cnn} | "
              f"Loss: {train_loss:.4f}/{val_loss:.4f} | "
              f"Acc: {train_acc:.1f}%/{val_acc:.1f}%", end="")

        if val_acc > best_val_acc_cnn:
            best_val_acc_cnn = val_acc
            torch.save(model_cnn.state_dict(), os.path.join(MODEL_DIR, 'model_cnn_best.pth'))
            print(f" <- BEST")
        else:
            print()

print(f"\n=== CNN 학습 완료 ===")
print(f"Best Validation Accuracy: {best_val_acc_cnn:.2f}%")

In [None]:
# Cell 11: 앙상블 모델 정의

class EnsembleClassifier(nn.Module):
    """
    Weighted-average ensemble of the keypoint model and the CNN.

    Both sub-models produce logits; each is turned into probabilities with a
    softmax over dim 1 and the two distributions are mixed with fixed weights.
    ``num_classes`` is accepted for interface compatibility and is not used.
    """

    def __init__(self, model_kp, model_cnn, num_classes, kp_weight=0.4, cnn_weight=0.6):
        super().__init__()
        self.model_kp, self.model_cnn = model_kp, model_cnn
        self.kp_weight, self.cnn_weight = kp_weight, cnn_weight

    def forward(self, keypoints, image):
        """
        Args:
            keypoints: input batch for the keypoint model
                (normalised 411-value vectors in this notebook).
            image: input batch for the CNN
                (normalised 3x224x224 images in this notebook).
        Returns:
            (batch, num_classes) mixed class probabilities.
        """
        kp_share = self.kp_weight * self.model_kp(keypoints).softmax(dim=1)
        cnn_share = self.cnn_weight * self.model_cnn(image).softmax(dim=1)
        return kp_share + cnn_share

# Restore the best checkpoint of each sub-model.
for _net, _checkpoint in (
    (model_kp, 'model_keypoint_411_best.pth'),
    (model_cnn, 'model_cnn_best.pth'),
):
    _net.load_state_dict(torch.load(os.path.join(MODEL_DIR, _checkpoint), map_location=device))

# Wrap both models in the ensemble and switch it to inference mode.
model_ensemble = EnsembleClassifier(model_kp, model_cnn, num_classes).to(device)
model_ensemble.eval()

print("=== 앙상블 모델 생성 완료 ===")
print(f"키포인트 모델 가중치: {model_ensemble.kp_weight}")
print(f"CNN 모델 가중치: {model_ensemble.cnn_weight}")
print(f"\n키포인트 입력: 411차원 (Pose + Face + Hands)")

In [None]:
# Cell 12: 실시간 인식 함수 정의 (Full Body + 손 가중치 0.8)

class RealTimeFingerAlphabetRecognizer:
    """
    Real-time finger-alphabet recogniser (Pose + Face + Hands = 411 values).

    Keypoints are extracted with MediaPipe, scaled per body part with
    ``apply_body_part_weights``, normalised with ``norm_params`` and fed to
    either the ensemble or the keypoint-only model. Predictions are smoothed
    with a majority vote over the most recent frames.

    The instance owns three MediaPipe graphs; call ``release()`` when done or
    use the instance as a context manager.
    """

    def __init__(self, model_kp, model_cnn, model_ensemble, norm_params, label_to_idx, idx_to_label, device):
        self.model_kp = model_kp
        self.model_cnn = model_cnn
        self.model_ensemble = model_ensemble
        self.norm_params = norm_params
        self.label_to_idx = label_to_idx
        self.idx_to_label = idx_to_label
        self.device = device

        # This class only runs inference with batch size 1: BatchNorm and
        # Dropout must be in eval mode, otherwise BatchNorm1d rejects the
        # single-sample batch and Dropout makes predictions random.
        for module in (model_kp, model_cnn, model_ensemble):
            if module is not None:
                module.eval()

        # MediaPipe detectors in tracking (video) mode.
        self.pose = mp_pose.Pose(
            static_image_mode=False,
            model_complexity=1,
            enable_segmentation=False,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.hands = mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            model_complexity=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.face_mesh = mp_face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        # Image preprocessing for the CNN branch (same resize/normalise
        # steps as the validation transform).
        self.image_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # Recent raw predictions used to stabilise the reported label.
        self.prediction_history = []
        self.history_size = 5

    def __enter__(self):
        """Allow ``with RealTimeFingerAlphabetRecognizer(...) as r:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the MediaPipe graphs; exceptions are not suppressed."""
        self.release()
        return False

    @staticmethod
    def _flatten_landmarks(landmarks, count, third_attr):
        """Flatten the first ``count`` landmarks to [x, y, third] * count, zero-padded."""
        flat = []
        for i in range(min(count, len(landmarks))):
            point = landmarks[i]
            flat.extend([point.x, point.y, getattr(point, third_attr)])
        flat.extend([0.0] * (count * 3 - len(flat)))
        return flat

    def extract_keypoints(self, frame):
        """
        Extract the weighted 411-value keypoint vector from a BGR frame.

        Returns:
            weighted_kps: float32 vector after ``apply_body_part_weights``.
            landmarks_info: dict with the raw MediaPipe results under
                'pose', 'hands', 'handedness' and 'face' (each may be None).
        """
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        pose_results = self.pose.process(rgb_frame)
        hands_results = self.hands.process(rgb_frame)
        face_results = self.face_mesh.process(rgb_frame)

        landmarks_info = {
            'pose': pose_results.pose_landmarks,
            'hands': hands_results.multi_hand_landmarks,
            'handedness': hands_results.multi_handedness,
            'face': face_results.multi_face_landmarks
        }

        # 1. Pose: first 25 landmarks as (x, y, visibility).
        if pose_results.pose_landmarks:
            pose_kps = self._flatten_landmarks(
                pose_results.pose_landmarks.landmark, 25, 'visibility')
        else:
            pose_kps = [0.0] * POSE_DIM

        # 2. Face: first 70 landmarks of the first face as (x, y, z).
        if face_results.multi_face_landmarks:
            face_kps = self._flatten_landmarks(
                face_results.multi_face_landmarks[0].landmark, 70, 'z')
        else:
            face_kps = [0.0] * FACE_DIM

        # 3. Hands: slots are filled by the handedness label reported by
        #    MediaPipe (no mirroring swap, same as the training extractor).
        left_hand_kps = [0.0] * HAND_DIM
        right_hand_kps = [0.0] * HAND_DIM

        if hands_results.multi_hand_landmarks:
            for idx, hand_landmarks in enumerate(hands_results.multi_hand_landmarks):
                handedness = hands_results.multi_handedness[idx].classification[0].label
                hand_kps = []
                for lm in hand_landmarks.landmark:
                    hand_kps.extend([lm.x, lm.y, lm.z])

                if handedness == "Left":
                    left_hand_kps = hand_kps
                elif handedness == "Right":
                    right_hand_kps = hand_kps

        # Concatenate in the fixed order pose, face, left hand, right hand.
        combined_kps = np.concatenate([
            np.array(pose_kps),
            np.array(face_kps),
            np.array(left_hand_kps),
            np.array(right_hand_kps)
        ]).astype(np.float32)

        # Same per-part scaling that was applied to the training data.
        weighted_kps = apply_body_part_weights(combined_kps)

        return weighted_kps, landmarks_info

    def predict(self, frame, use_ensemble=True):
        """
        Predict the finger-alphabet label for a single BGR frame.

        Args:
            frame: BGR image.
            use_ensemble: use keypoints + CNN when True, keypoints only
                when False.

        Returns:
            (label, confidence, landmarks_info). ``label`` is None and
            ``confidence`` 0.0 when no hand was detected. Once at least three
            predictions are in the history the label is the majority vote,
            and ``confidence`` is the current frame's probability of that
            reported label.
        """
        keypoints, landmarks_info = self.extract_keypoints(frame)

        # Without a detected hand there is nothing to classify.
        if landmarks_info['hands'] is None:
            return None, 0.0, landmarks_info

        # Normalise with the training statistics.
        kp_tensor = torch.tensor(keypoints, dtype=torch.float32).unsqueeze(0)
        kp_norm = (kp_tensor - self.norm_params['mean']) / self.norm_params['std']
        kp_norm = kp_norm.to(self.device)

        if use_ensemble:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_tensor = self.image_transform(rgb_frame).unsqueeze(0).to(self.device)

            with torch.no_grad():
                probs = self.model_ensemble(kp_norm, image_tensor)
        else:
            with torch.no_grad():
                logits = self.model_kp(kp_norm)
                probs = torch.softmax(logits, dim=1)

        # Raw prediction for this frame.
        confidence, predicted_idx = torch.max(probs, dim=1)
        predicted_label = self.idx_to_label[predicted_idx.item()]
        confidence = confidence.item()

        # History-based smoothing over the last ``history_size`` frames.
        self.prediction_history.append(predicted_label)
        if len(self.prediction_history) > self.history_size:
            self.prediction_history.pop(0)

        if len(self.prediction_history) >= 3:
            smoothed_label = Counter(self.prediction_history).most_common(1)[0][0]
            if smoothed_label != predicted_label:
                # Keep label and confidence consistent: report the probability
                # of the label that is actually returned, not of the raw
                # arg-max it replaced.
                predicted_label = smoothed_label
                confidence = probs[0, self.label_to_idx[smoothed_label]].item()

        return predicted_label, confidence, landmarks_info

    def draw_results(self, frame, predicted_label, confidence, landmarks_info):
        """Draw pose, face contours, hands and the prediction text onto ``frame`` and return it."""
        # Pose skeleton.
        if landmarks_info['pose']:
            mp_drawing.draw_landmarks(
                frame, landmarks_info['pose'], mp_pose.POSE_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(0, 255, 255), thickness=2, circle_radius=2),
                mp_drawing.DrawingSpec(color=(255, 255, 0), thickness=2)
            )

        # Face (contours only, no individual landmark dots).
        if landmarks_info['face']:
            for face_landmarks in landmarks_info['face']:
                mp_drawing.draw_landmarks(
                    frame, face_landmarks,
                    mp_face_mesh.FACEMESH_CONTOURS,
                    landmark_drawing_spec=None,
                    connection_drawing_spec=mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=1)
                )

        # Hands.
        if landmarks_info['hands']:
            for hand_landmarks in landmarks_info['hands']:
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2, circle_radius=3),
                    mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
                )

        # Prediction text.
        # NOTE(review): cv2.putText uses Hershey fonts; Hangul labels will
        # presumably not render correctly - verify and switch to a
        # TTF-based renderer if needed.
        if predicted_label:
            text = f"{predicted_label} ({confidence*100:.1f}%)"
            cv2.putText(frame, text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                       1.5, (0, 255, 0), 3)
        else:
            cv2.putText(frame, "No hand detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                       1, (0, 0, 255), 2)

        return frame

    def release(self):
        """Close the three MediaPipe graphs."""
        self.pose.close()
        self.hands.close()
        self.face_mesh.close()

# Confirmation banner with the active per-part weights.
print(
    "=== 실시간 인식 클래스 정의 완료 (Full Body) ===",
    f"키포인트: 411차원 (Pose + Face + Hands)",
    f"손 가중치: {HAND_WEIGHT} (Pose/Face: {POSE_WEIGHT}/{FACE_WEIGHT})",
    sep="\n",
)

In [None]:
# Cell 14: 단일 이미지 테스트

def predict_single_image(image_path, use_ensemble=True):
    """
    Predict the finger-alphabet label of one image file and print a report.

    The expected label is taken from the file name (without extension) and
    compared with the prediction.

    Args:
        image_path: path of the image to classify.
        use_ensemble: use keypoints + CNN when True, keypoints only otherwise.

    Returns:
        ``(predicted_label, confidence)``, or ``None`` when the image cannot
        be loaded. ``predicted_label`` is None when no hand was detected.
    """
    # cv2.imread returns None instead of raising on unreadable files.
    frame = cv2.imread(image_path)
    if frame is None:
        print(f"이미지를 로드할 수 없습니다: {image_path}")
        return None

    # The recogniser is built from the notebook's globally trained objects.
    recognizer = RealTimeFingerAlphabetRecognizer(
        model_kp=model_kp,
        model_cnn=model_cnn,
        model_ensemble=model_ensemble,
        norm_params=norm_params,
        label_to_idx=label_to_idx,
        idx_to_label=idx_to_label,
        device=device
    )

    try:
        predicted_label, confidence, landmarks_info = recognizer.predict(frame, use_ensemble)
    finally:
        # Always free the MediaPipe graphs, even if prediction raised.
        recognizer.release()

    # Report.
    true_label = os.path.splitext(os.path.basename(image_path))[0]
    print(f"\n=== 예측 결과 (411차원) ===")
    print(f"파일: {os.path.basename(image_path)}")
    print(f"정답: {true_label}")
    print(f"예측: {predicted_label}")
    print(f"신뢰도: {confidence*100:.1f}%")
    print(f"결과: {'정답' if true_label == predicted_label else '오답'}")
    print(f"\n감지 정보:")
    print(f"  - Pose: {'O' if landmarks_info['pose'] else 'X'}")
    print(f"  - Face: {'O' if landmarks_info['face'] else 'X'}")
    print(f"  - Hands: {'O' if landmarks_info['hands'] else 'X'}")

    return predicted_label, confidence

# Smoke test on the first discovered image, if there is one.
test_image = next(iter(image_files), None)
if test_image:
    predict_single_image(test_image, use_ensemble=True)

In [None]:
# Cell 15: 모델 저장 (최종)

# Save both models together with everything needed to run inference.
save_data = {
    # Model weights
    'model_kp_state': model_kp.state_dict(),
    'model_cnn_state': model_cnn.state_dict(),

    # Normalisation statistics
    'norm_params': norm_params,

    # Label mapping
    'label_to_idx': label_to_idx,
    'idx_to_label': idx_to_label,
    'num_classes': num_classes,

    # Model configuration
    'kp_input_size': TOTAL_DIM,  # 411
    # Read the weights from the ensemble that was actually built, so the
    # bundle cannot drift from the configuration in use.
    'ensemble_weights': {
        'kp': model_ensemble.kp_weight,
        'cnn': model_ensemble.cnn_weight,
    },

    # Keypoint layout
    'keypoint_structure': {
        'pose_dim': POSE_DIM,
        'face_dim': FACE_DIM,
        'hand_dim': HAND_DIM,
        'total_dim': TOTAL_DIM
    }
}

# The output directory is not created anywhere else; torch.save fails if it
# does not exist.
os.makedirs(MODEL_DIR, exist_ok=True)
torch.save(save_data, os.path.join(MODEL_DIR, 'photo_finger_alphabet_411_model.pt'))

print("=== 모델 저장 완료 ===")
print(f"저장 위치: {MODEL_DIR}/photo_finger_alphabet_411_model.pt")
print(f"\n포함 내용:")
print(f"  - 키포인트 분류 모델 (MLP, 411차원 입력)")
print(f"  - CNN 이미지 분류 모델 (EfficientNet-B0)")
print(f"  - 정규화 파라미터")
print(f"  - 라벨 매핑 ({num_classes}개 클래스)")
print(f"\n키포인트 구조:")
print(f"  - Pose: {POSE_DIM} (25 landmarks * 3)")
print(f"  - Face: {FACE_DIM} (70 landmarks * 3)")
print(f"  - Left Hand: {HAND_DIM} (21 landmarks * 3)")
print(f"  - Right Hand: {HAND_DIM} (21 landmarks * 3)")
print(f"  - Total: {TOTAL_DIM}차원")