In [68]:
# Core notebook of the project.
# The following blocks contains the live webcam implementation of the classifier built upon the mediapipe hand landmarker model.
# First, the trained classifier is loaded, and then the mediapipe hands model.
# The main function then accesses the local webcam, processes the images frame by frame,
# and outputs the hand gesture being executed by the user, if any. 


import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2


In [69]:
import os
import requests


#downloads the model trained in the other notebook from the github
url = 'https://github.com/leviens/Live-human-gesture-recognition-system/raw/refs/heads/main/model/model_new.h5'

# Download the file and save it locally
response = requests.get(url)
with open('model_final.h5', 'wb') as f:
    f.write(response.content)
print("File downloaded succesfully")

File downloaded succesfully


In [70]:
#Setup the libraries and load the model
import cv2
import tensorflow as tf
import os

#loaded_model = tf.keras.models.load_model('D:\progetto_video\model_new\model_new.h5')

loaded_model = tf.keras.models.load_model('model_final.h5')

# Define the name of the gestures
# Additional info on the gestures on the github documentation

class_names = {
    -1: 'No sign',
    1: 'Thumb up',
    2: 'V sign',
    3: 'Three',
    4: 'Four',
    5: 'Five',
    6: 'Ok',
    7: 'Rock sign',
    8: 'No sign'
}



In [71]:
# Loads the mediapipe hands model and the drawing utils

import mediapipe as mp
import cv2
import numpy as np


mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

def draw_landmarks_on_image(image, hand_landmarks):
    """Draws hand landmarks on the provided image."""
    mp_drawing.draw_landmarks(
        image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    return image


In [72]:
# Main function: run this block to use the real time webcam gesture recognition system.
# NOTE: close the window by pressing the "q" key on your keyboard.
# Classification results are written on the webcam window: in the upper left corner for the left hand, in the upper right corner for the right hand.
# The implementation works simultaneusly on both hands.


def main():
    """Captures video from webcam, performs hand landmark detection,
    and classifies hand gestures in live stream mode."""

    # Open a connection to the webcam:
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    # Set the desired resolution. Works with multiple resolutions, may need to adapt the text writing function at the end. It should work 
    # with HD and full HD with those settings
    width =  1280 #1920 
    height = 720 #1080 
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

    # Resize the display window
    cv2.namedWindow('Webcam Feed', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('Webcam Feed', width, height)

    # main loop
    with mp_hands.Hands(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5) as hands: # mediapipe model
        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # Flip the image horizontally for a later selfie-view display
            frame = cv2.flip(frame, 1)

            # Convert frame to RGB format for MediaPipe
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image.flags.writeable = False # Improve performance
            results = hands.process(image) # Extract hand landmarks
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            # Works on both hands, simultaneusly
            if results.multi_hand_landmarks:
                for hand_landmarks, handedness_info in zip(results.multi_hand_landmarks, results.multi_handedness):
                    handedness = handedness_info.classification[0].label

                    # Extract x, y, and z coordinates of landmarks
                    x_coordinates = [landmark.x for landmark in hand_landmarks.landmark]
                    y_coordinates = [landmark.y for landmark in hand_landmarks.landmark]
                    z_coordinates = [landmark.z for landmark in hand_landmarks.landmark]

                    # Process the raw data: eventual normalization procedure. Not done here, see documentations
                    
                    # Convert handedness to numerical representation
                    handedness_num = 0 if handedness == "Right" else 1 # 1 for right, 0 for left
                    features = [handedness_num] + x_coordinates + y_coordinates + z_coordinates
                    features = np.array(features, dtype=np.float32) # Crucial: specify dtype

                    features = features.reshape(1, -1)

                    # Predict gesture class using the loaded model
                    predicted_class = loaded_model.predict(features)
                
                    # Select the higher probability class
                    prob = predicted_class[0, :]
                    classe = np.argmax(prob)
                    if prob[classe] < 0.85:
                        classe = -2
                    classe += 1

                    # Draw the landmarks (optional)
                    image = draw_landmarks_on_image(image, hand_landmarks)

                    # Display the classification result on the frame: upper left corner for the left hand, upper right corner for the right hand
                    class_name = class_names.get(classe)
                    text_x = 30 if handedness == "Left" else image.shape[1] - 400  # Left or Right corner
                    text_y = 50 # y position of the text
                    
                    cv2.putText(image, f'Sign: {class_name}', (text_x, text_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 255), 3, cv2.LINE_AA)

            cv2.imshow('Webcam Feed', image)

            # Break the loop on 'q' key press
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    # When everything done, release the capture
    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34