In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [2]:
WORKSPACE_PATH = 'Tensorflow/workspace'
SCRIPTS_PATH= 'Tensorflow/scripts'
APIMODEL_PATH = 'Tensorflow/models' 
ANNOTATION_PATH = WORKSPACE_PATH+'/annotations' 
IMAGE_PATH = WORKSPACE_PATH+'/images' 
MODEL_PATH = WORKSPACE_PATH+'/models' 
PRETRAINED_MODEL_PATH = WORKSPACE_PATH+'/pre-trained-models' 
CONFIG_PATH = MODEL_PATH+'/my_ssd_mobnet/pipeline.config' 
CHECKPOINT_PATH = MODEL_PATH+'/my_ssd_mobnet/'

In [3]:
## Configurando Mediapipe Holistic 
# Variable para MPH: 
mp_holistic = mp.solutions.holistic 
# Variable para las drawing utilities de MP 
mp_drawing = mp.solutions.drawing_utils
def mediapipe_detection(image, model): 
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 
    image.flags.writeable = False 
    results = model.process(image)  
    image.flags.writeable = True 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 
    return image, results 

<h2>Código para dibujar en la pantalla</h2>

In [4]:
def draw_hand_bbox(image, hand_landmarks):
    if hand_landmarks is None:
        return  # No hacer nada si no hay landmarks

    h, w, _ = image.shape
    coords = [(int(lm.x * w), int(lm.y * h)) for lm in hand_landmarks.landmark]
    
    x_vals = [pt[0] for pt in coords]
    y_vals = [pt[1] for pt in coords]
    
    min_x, max_x = min(x_vals), max(x_vals)
    min_y, max_y = min(y_vals), max(y_vals)

    # Dibuja el rectángulo
    cv2.rectangle(image, (min_x, min_y), (max_x, max_y), (255, 255, 0), 2)

In [5]:
def draw_changed_landmarks(image, results): 
    # Para dibujar los landmarks de la cara (con sus conexiones) 
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, mp_drawing.DrawingSpec(color=(191,95,0), thickness=1, circle_radius=1), mp_drawing.DrawingSpec(color=(237,178,101), thickness=1, circle_radius=0.5))  
    
    # Para dibujar los landmarks de la postura (con sus conexiones) 
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, mp_drawing.DrawingSpec(color=(3,56,172), thickness=2, circle_radius=3), mp_drawing.DrawingSpec(color=(141,241,244), thickness=2, circle_radius=3))  
    
    # Para dibujar los landmarks de la mano izquierda (con sus conexiones) 
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, mp_drawing.DrawingSpec(color=(3,89,17), thickness=2, circle_radius=3), mp_drawing.DrawingSpec(color=(134,242,196), thickness=3, circle_radius=2)) 

    # Dibujar bounding box
    draw_hand_bbox(image, results.left_hand_landmarks)
    
     # Para dibujar los landmarks de la mano derecha (con sus conexiones) 
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, mp_drawing.DrawingSpec(color=(134,242,196), thickness=2, circle_radius=3), mp_drawing.DrawingSpec(color=(3,89,17), thickness=3, circle_radius=2)) 

    # Dibujar bounding box
    draw_hand_bbox(image, results.right_hand_landmarks)

<h2>Probando camara</h2>

In [6]:
###CAPTURAR LAS IMÁGENES POR CÁMARA
cap = cv2.VideoCapture(0) 
if not cap.isOpened():
    print("Error: No se pudo abrir la cámara.")
    exit()
#Para acceder al modelo mediapipe 
# configurando el modelo mediapipe: 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: 
    while cap.isOpened(): 
# 
        #Leyendo el feed 
        ret, frame = cap.read() #Cuando leemos obtenemos estos dos valores return y frame (la img de la cámara) 
# 
        #Detección (entre feed y renderizado) 
        image, results = mediapipe_detection(frame, holistic) #En vez de 'holistic' sería 'model' en general 
        print(results)
        #Dibujar landmarks (entre la detección y el display) 
        draw_changed_landmarks(image, results) 
        #Presentar en pantalla 
        cv2.imshow('Pantalla OpenCV', image)  
        #Para la "current key", sale del loop 
        if cv2.waitKey(10) & 0xFF == ((ord('q')) or ((ord('Q')))): 
            break 
    cap.release() 
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

<h2>Funcion Normalizacion</h2>

In [7]:
def normalize_hand_landmarks(hand_landmarks, image_shape):
    if hand_landmarks is None:
        return np.zeros(21 * 3)
    
    h, w, _ = image_shape
    coords = np.array([[lm.x * w, lm.y * h, lm.z * w] for lm in hand_landmarks.landmark])

    x_min, y_min = coords[:, 0].min(), coords[:, 1].min()
    x_max, y_max = coords[:, 0].max(), coords[:, 1].max()

    width = x_max - x_min
    height = y_max - y_min

    # Evita división por cero
    if width == 0 or height == 0:
        return np.zeros(21 * 3)

    # Normaliza entre 0 y 1 dentro del bounding box
    coords[:, 0] = (coords[:, 0] - x_min) / width
    coords[:, 1] = (coords[:, 1] - y_min) / height
    coords[:, 2] = coords[:, 2] / w  # mantén z relativa al ancho

    return coords.flatten()

<h2>Función de extraer keypoints (sin normalizar)</h2>

In [8]:
def extract_keypoints(results, image_shape): 
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4) 
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)  
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)  
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3) 

    return np.concatenate([pose, face, lh, rh])

<h2>RUTAS</h2>

In [9]:
# Ruta para los datos exportados, arreglos numpy 
DATA_PATH = os.path.join('Datos_Dataset_3') 
# Acciones (5) a detectar 
actions = np.array(['Hola', 'Gracias', 'Comprendo', 'Como estas', 'De nada'])
# 125 'videos' de datos (por seña) 
no_sequences = 125
# Tamaño/longitud de los videos (en frames) 
sequence_length = 30 

In [10]:
for action in actions: 
    for sequence in range(no_sequences): 
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence))) 
        except: 
            pass

<h2>Función de extraer Key points (normalizando)</h2>

In [11]:
def extract_keypoints_normalized(results, image_shape): 
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4) 
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)  
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)  
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3) 

    
    # Normalizados los datos de las manos
    lh_normalized = normalize_hand_landmarks(results.left_hand_landmarks, image_shape) if results.left_hand_landmarks else np.zeros(21*3)
    rh_normalized = normalize_hand_landmarks(results.right_hand_landmarks, image_shape) if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh_normalized, rh_normalized])

<h2>RUTA normalizada</h2>

In [12]:
# Ruta para los datos exportados, arreglos numpy 
DATA_PATH_NORMALIZED = os.path.join('Datos_Dataset_Normalized_3') 

In [13]:
for action in actions: 
    for sequence in range(no_sequences): 
        try: 
            os.makedirs(os.path.join(DATA_PATH_NORMALIZED, action, str(sequence))) 
        except: 
            pass

<h2>Prediccion</h2>

In [14]:
import tensorflow as tf

ruta_del_modelo = 'Modelo_Trad_Norm_EarlyStop_Aug1.h5'  # Asegúrate de que la ruta sea correcta

try:
    model = tf.keras.models.load_model(ruta_del_modelo)
    print("Modelo cargado exitosamente.")
    model.summary()  # Opcional: muestra la arquitectura del modelo
except Exception as e:
    print(f"Error al cargar el modelo: {e}")



Modelo cargado exitosamente.


In [20]:
sequence = []
sentence = []
predictions = []
threshold = 0.7

In [21]:
# Implementación del modelo de predicción 
cap = cv2.VideoCapture(0) 
#Para acceder al modelo mediapipe

# configurando el modelo mediapipe: 
with mp_holistic.Holistic(min_detection_confidence=0.8, min_tracking_confidence=0.6) as holistic: 
    while cap.isOpened():
        #Leyendo el feed 
        ret, frame = cap.read() #Cuando leemos obtenemos estos dos valores return y frame (la img de la cámara) 
        #
         #Detección 
        image, results = mediapipe_detection(frame, holistic) 
        print(results)      
        #
        #Dibujar landmarks (entre la detección y el display) 
        draw_changed_landmarks(image, results) 
        #    
         #2. Predicción 
        image_shape = image.shape
        keypoints = extract_keypoints_normalized(results, image_shape)
        sequence.append(keypoints) 
        sequence = sequence[-30:] # se toman los últimos 30 frames para hacer la predicción correspondiente 
        #
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0] 
            print(actions[np.argmax(res)])
            print(res[np.argmax(res)])
            print(actions)
            print(np.array(res))
            predictions.append(np.argmax(res)) 
            #
            #3. Lógica de visualización 
            if np.unique(predictions[-8:])[0] == np.argmax(res) and res[np.argmax(res)] > threshold:
                action = actions[np.argmax(res)]
                if len(sentence) == 0 or action != sentence[-1]:
                    sentence.append(action)
        
            # Mantener solo las últimas 3 palabras
            sentence = sentence[-3:]
                
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1) #-1 significa que llena el rectángulo
        cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA) 
        cv2.imshow('Pantalla OpenCV', image) # antes cv2.imshow('OpenCV Feed', frame) porque hacía rendering del frame  
        if cv2.waitKey(10) & 0xFF == ((ord('q')) or ((ord('Q')))): 
            break 
    cap.release() 
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

<h1>Detectando manos<h1>

In [16]:
# Implementación del modelo de predicción 
cap = cv2.VideoCapture(0) 
#Para acceder al modelo mediapipe
# configurando el modelo mediapipe: 
with mp_holistic.Holistic(min_detection_confidence=0.8, min_tracking_confidence=0.6) as holistic: 
    while cap.isOpened():
        #Leyendo el feed 
        ret, frame = cap.read() #Cuando leemos obtenemos estos dos valores return y frame (la img de la cámara) 
        #
         #Detección 
        image, results = mediapipe_detection(frame, holistic) 
        #
        #Dibujar landmarks (entre la detección y el display) 
        draw_changed_landmarks(image, results) 
        #    
        # Verificar si se detectan manos
        hands_detected = (results.left_hand_landmarks is not None or 
                         results.right_hand_landmarks is not None)
        
        if hands_detected:
            #2. Predicción 
            image_shape = image.shape
            keypoints = extract_keypoints_normalized(results, image_shape)
            sequence.append(keypoints) 
            sequence = sequence[-30:] # se toman los últimos 30 frames para hacer la predicción correspondiente 
            #
            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0] 
                
                # Salida formateada
                print(actions[np.argmax(res)])
                print(res[np.argmax(res)])
                print(actions)
                print(res)
                
                predictions.append(np.argmax(res)) 
                #
                #3. Lógica de visualización 
                if np.unique(predictions[-8:])[0] == np.argmax(res) and res[np.argmax(res)] > threshold:
                    action = actions[np.argmax(res)]
                    if len(sentence) == 0 or action != sentence[-1]:
                        sentence.append(action)
            
                # Mantener solo las últimas 3 palabras
                sentence = sentence[-3:]
        else:
            # Limpiar la secuencia cuando no hay manos detectadas
            sequence = []
            
        # Mostrar estado de detección
        status_text = "MANOS DETECTADAS" if hands_detected else "Esperando manos..."
        status_color = (0, 255, 0) if hands_detected else (0, 0, 255)
        cv2.putText(image, status_text, (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7, status_color, 2, cv2.LINE_AA)
                
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1) #-1 significa que llena el rectángulo
        cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA) 
        cv2.imshow('Pantalla OpenCV', image) # antes cv2.imshow('OpenCV Feed', frame) porque hacía rendering del frame  
        if cv2.waitKey(10) & 0xFF == ((ord('q')) or ((ord('Q')))): 
            break 
    cap.release() 
    cv2.destroyAllWindows()