YOLO로 사람/총/칼 인식 + Arcface로 얼굴 인식 + MediaPipe로 포즈 인식

In [1]:
#1. 얼굴 학습시키는 부분
import cv2
import numpy as np
import os
from insightface.app import FaceAnalysis

def initialize_arcface():
    app = FaceAnalysis(name="buffalo_l")  # ArcFace 모델 (buffalo_l은 기본 권장)
    app.prepare(ctx_id=-1, det_size=(640, 640))  # GPU: ctx_id=0, CPU: -1
    return app

def get_face_embedding(app, image_bgr):
    # ArcFace의 app.get()은 BGR 형식으로 이미지를 받기도 합니다.
    # 만약 RGB가 필요하면 cvtColor로 변환하세요.
    faces = app.get(image_bgr)
    if len(faces) > 0:
        return faces[0].embedding  # 첫 번째 얼굴의 임베딩
    else:
        return None

def generate_average_embedding(app, folder_path):
    embeddings = []
    for file in os.listdir(folder_path):
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            img_path = os.path.join(folder_path, file)
            image = cv2.imread(img_path)
            if image is None:
                print(f"이미지 로드 실패: {img_path}")
                continue
            
            embedding = get_face_embedding(app, image)
            if embedding is not None:
                embeddings.append(embedding)
            else:
                print(f"얼굴 검출 실패: {img_path}")
    
    if len(embeddings) == 0:
        raise ValueError("임베딩을 하나도 생성하지 못했습니다.")
    
    avg_embedding = np.mean(embeddings, axis=0)
    return avg_embedding

if __name__ == "__main__":
    app = initialize_arcface()
    # 내 얼굴 사진 폴더
    my_face_folder = "C:/Users/idea0/EE101/Jongsul/myface"  
    my_face_embedding = generate_average_embedding(app, my_face_folder)
    np.save("my_face_embedding.npy", my_face_embedding)  # 필요 시 저장
    print("내 얼굴 평균 임베딩 생성 완료.")

  check_for_updates()


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


In [12]:
#pose 감지
#왼팔 들기/오른팔 들기/양팔 들기/그외 로 나누어서 감지한다
import cv2
import mediapipe as mp
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as display

from ultralytics import YOLO
from sklearn.metrics.pairwise import cosine_similarity
from insightface.app import FaceAnalysis

#######################
# 1) YOLO모델로드 및 ArcFace함수 초기화
#######################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLOv8 모델 로드
model = YOLO("yolov8n.pt").to(device)
#model = YOLO("C:/Users/idea0/EE101/Jongsul/Yolomodels/epoch11.pt")

# ArcFace 초기화
arc_app = FaceAnalysis(name="buffalo_l")
arc_app.prepare(ctx_id=-1, det_size=(640,640))  # CPU 사용 (ctx_id=-1)

# 내 얼굴 임베딩 로드 (Numpy 파일로부터)
my_face_embedding = np.load("my_face_embedding.npy")

def get_face_embedding(arc_app, face_img_bgr):
    faces = arc_app.get(face_img_bgr)
    if len(faces) == 0:
        return None
    return faces[0].embedding

def is_my_face(face_embedding, my_embedding, threshold=0.4):
    sim = cosine_similarity([face_embedding], [my_embedding])[0][0]
    return (sim > threshold), sim

#######################
# 2) MediaPipe Pose 초기화
#######################
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    static_image_mode=False,
    model_complexity=1,
    enable_segmentation=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
mp_drawing = mp.solutions.drawing_utils

# 랜드마크 인덱스 (MediaPipe Pose)
LEFT_SHOULDER = 11
RIGHT_SHOULDER = 12
LEFT_WRIST = 15
RIGHT_WRIST = 16

def is_arm_raised(shoulder_y, wrist_y, threshold=0.05):
    """
    어깨 y좌표보다 손목 y좌표가 특정 기준만큼 더 위(작은 값)이면 '팔을 들었다'고 판단.
    - Mediapipe Pose의 y값은 화면 상단이 0, 하단이 1. 작을수록 더 위쪽.
    - threshold는 오차 보정용(예: 0.05)
    """
    # 손목이 어깨보다 좀 더 위에 있으면 True
    return wrist_y < (shoulder_y - threshold)

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("카메라를 열 수 없습니다.")
    exit()
    
prev_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        print("프레임을 읽어올 수 없습니다.")
        break

    # BGR -> RGB
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Pose 추론
    results = pose.process(rgb)

    # 동작판별 결과 메세지
    action_text = ""

    if results.pose_landmarks:
        landmarks = results.pose_landmarks.landmark

        # 왼/오른 어깨, 손목의 y좌표
        left_shoulder_y = landmarks[LEFT_SHOULDER].y
        right_shoulder_y = landmarks[RIGHT_SHOULDER].y
        left_wrist_y = landmarks[LEFT_WRIST].y
        right_wrist_y = landmarks[RIGHT_WRIST].y

        # 왼/오른팔 들었는지 판별
        left_arm_up = is_arm_raised(left_shoulder_y, left_wrist_y, threshold=0.05)
        right_arm_up = is_arm_raised(right_shoulder_y, right_wrist_y, threshold=0.05)

        if left_arm_up and right_arm_up:
            action_text = "both arms up"
        elif left_arm_up:
            action_text = "left arm up"
        elif right_arm_up:
            action_text = "right arm up"
        else:
            action_text = "do nothing"

        # 랜드마크 + 연결선 그리기
        mp_drawing.draw_landmarks(
            frame,
            results.pose_landmarks,
            mp_pose.POSE_CONNECTIONS
        )

    # 동작 결과 표시
    cv2.putText(frame, action_text, (30, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)
    
    # YOLOv8 추론 (주의: BGR -> RGB 변환)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = model(rgb_frame)

    # 바운딩 박스 처리
    for box in results[0].boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        class_id = int(box.cls)
        conf = float(box.conf)
        class_name = model.names[class_id]
            
        # 바운딩 박스 표시
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
        label = f"{class_name}: {conf:.2f}"
        cv2.putText(frame, label, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 2)

        # 0 : person
        if class_id == 0:
            person_crop = frame[y1:y2, x1:x2]  # BGR
            if person_crop.size == 0:
                continue

            face_embedding = get_face_embedding(arc_app, person_crop)
            if face_embedding is not None:
                same_person, sim = is_my_face(face_embedding, my_face_embedding, threshold=0.4)

                if same_person:
                    label = f"               Me! (sim={sim:.2f})"
                    color = (0, 255, 0)
                else:
                    label = f"               Not me (sim={sim:.2f})"
                    color = (0, 0, 255)

                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, label, (x1, y1-10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # FPS 계산
    current_time = time.time()
    fps = 1 / (current_time - prev_time)
    prev_time = current_time
    cv2.putText(frame, f"FPS: {fps:.2f}", (10,30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)    
    
    
    

    cv2.imshow("Yolo_Arcface_MediaPipe", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\idea0/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4



0: 480x640 1 person, 100.0ms
Speed: 3.0ms preprocess, 100.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4



0: 480x640 1 person, 95.0ms
Speed: 2.0ms preprocess, 95.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 90.0ms
Speed: 1.0ms preprocess, 90.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4



0: 480x640 1 person, 109.0ms
Speed: 3.0ms preprocess, 109.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 93.0ms
Speed: 1.0ms preprocess, 93.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 78.0ms
Speed: 2.0ms preprocess, 78.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 80.0ms
Speed: 2.0ms preprocess, 80.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4



0: 480x640 1 person, 98.0ms
Speed: 3.0ms preprocess, 98.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 82.0ms
Speed: 1.0ms preprocess, 82.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 84.0ms
Speed: 3.0ms preprocess, 84.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4



0: 480x640 1 person, 84.0ms
Speed: 5.0ms preprocess, 84.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 82.0ms
Speed: 1.0ms preprocess, 82.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 79.3ms
Speed: 1.0ms preprocess, 79.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 93.0ms
Speed: 2.0ms preprocess, 93.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 91.0ms
Speed: 2.0ms preprocess, 91.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 90.0ms
Speed: 2.0ms preprocess, 90.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 94.0ms
Speed: 3.0ms preprocess, 94.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 96.0ms
Speed: 3.0ms preprocess, 96.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 107.0ms
Speed: 1.0ms preprocess, 107.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)



  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0: 480x640 1 person, 95.0ms
Speed: 5.0ms preprocess, 95.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
