In [1]:
#Core notebook of the project: it captures the webcam feed and processes each frame in real time using the
#MediaPipe hand landmarker and a classifier that I trained on its outputs.


import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2

In [2]:
#MediaPipe hand landmark visualization function. Inputs: an RGB image and the results from the hand landmarker object.
#Output: a copy of the image with the hand landmarks drawn on it.

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result):
  """Return a copy of ``rgb_image`` annotated with every detected hand.

  For each entry in ``detection_result.hand_landmarks`` the landmarks and
  their connections are drawn with MediaPipe's default hand styles, and the
  matching ``detection_result.handedness`` category name is written just
  above the top-left corner of the landmarks' bounding box.

  Args:
    rgb_image: Image array with shape (height, width, channels); it is
      copied, never modified.
    detection_result: Object exposing parallel ``hand_landmarks`` and
      ``handedness`` sequences.

  Returns:
    The annotated copy of the input image.
  """
  all_landmarks = detection_result.hand_landmarks
  all_handedness = detection_result.handedness
  canvas = np.copy(rgb_image)

  for hand_idx, points in enumerate(all_landmarks):
    hand_label = all_handedness[hand_idx]

    # Repack the landmarks into the protobuf list that drawing_utils consumes.
    proto_points = [
      landmark_pb2.NormalizedLandmark(x=point.x, y=point.y, z=point.z)
      for point in points
    ]
    landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    landmarks_proto.landmark.extend(proto_points)

    landmark_style = solutions.drawing_styles.get_default_hand_landmarks_style()
    connection_style = solutions.drawing_styles.get_default_hand_connections_style()
    solutions.drawing_utils.draw_landmarks(
      canvas,
      landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      landmark_style,
      connection_style)

    # Landmark coordinates are normalized, so scale the smallest x / y by the
    # image size to get the bounding box's top-left corner in pixels.
    img_height, img_width, _ = canvas.shape
    leftmost = min(point.x for point in points)
    topmost = min(point.y for point in points)
    text_origin = (int(leftmost * img_width), int(topmost * img_height) - MARGIN)

    # Label the hand with its handedness category, MARGIN pixels above the box.
    cv2.putText(canvas, f"{hand_label[0].category_name}",
                text_origin, cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return canvas

In [3]:
#Downloads the MediaPipe hand landmarker model from the internet
import requests

url = "https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task"
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the model file.
response.raise_for_status()

with open("hand_landmarker.task", "wb") as f:
    f.write(response.content)

print("File downloaded successfully.")

File downloaded successfully.


In [4]:
#Downloads the classifier trained in the other notebook from GitHub
url = 'https://github.com/leviens/Live-human-gesture-recognition-system/raw/refs/heads/main/model/model_final.h5'

# Download the file and save it locally.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error (e.g. 404) instead of writing an HTML error page
# into 'model_final.h5', which would only surface later as a confusing load error.
response.raise_for_status()
with open('model_final.h5', 'wb') as f:
    f.write(response.content)
print("File downloaded succesfully")

File downloaded succesfully


In [5]:
#Build the MediaPipe hand landmarker from the downloaded 'hand_landmarker.task' file (configured with num_hands=2)
base_options = python.BaseOptions(
    model_asset_path='hand_landmarker.task',
)
options = vision.HandLandmarkerOptions(
    num_hands=2,
    base_options=base_options,
)
detector = vision.HandLandmarker.create_from_options(options)

In [8]:
#Setup the libraries and model
import cv2
import tensorflow as tf
import os

# Earlier variant, kept for reference: loads 'model_deeper.h5' from a hard-coded local directory.
#loaded_model = tf.keras.models.load_model(os.path.join('D:\progetto_video\model', 'model_deeper.h5'))
# Load the Keras classifier from 'model_final.h5' in the current working directory
# (the file name the download cell writes to).
loaded_model = tf.keras.models.load_model('model_final.h5')

# Human-readable label for each class key used by main(): keys 1..7 are the
# classifier's argmax index plus one, and -1 is the key main() falls back to
# when the best probability is below its confidence threshold.
class_names = dict(
    [(-1, 'No sign')]
    + list(enumerate(
        ['Thumb up', 'V sign', 'Three', 'Four', 'Five', 'Ok', 'No sign'],
        start=1,
    ))
)



In [13]:
#Core part. Run this cell to start the webcam integration system.
#It classifies one hand at a time.
#Press q to close the webcam window.

def _hand_features(detection_result):
    """Build the classifier input from a hand-landmarker result.

    Returns a (1, n_features) float64 array laid out as
    [handedness index, all x, all y, all z] and divided by its own maximum,
    or None when no hand was detected. When several hands are detected only
    the last one in the result is used.
    """
    hands = detection_result.hand_landmarks
    if not hands:
        return None

    last = len(hands) - 1
    landmarks = hands[last]
    handedness = detection_result.handedness[last][0].index

    features = np.array(
        [handedness]
        + [landmark.x for landmark in landmarks]
        + [landmark.y for landmark in landmarks]
        + [landmark.z for landmark in landmarks],
        dtype=np.float64,
    )
    # NOTE(review): scaling by the maximum presumably mirrors the preprocessing
    # used when the classifier was trained - confirm before changing it.
    features /= np.max(features)
    return features.reshape(1, -1)


def main(camera_index=0, width=1280, height=720, min_confidence=0.85):
    """Run live gesture recognition on the webcam feed until 'q' is pressed.

    Each frame is passed through the module-level hand landmarker
    (``detector``); the landmarks of one hand are fed to the module-level
    classifier (``loaded_model``) and the resulting label from ``class_names``
    is drawn on the frame shown in the 'Webcam Feed' window.

    Args:
        camera_index: Device index given to ``cv2.VideoCapture``
            (0 is usually the built-in webcam).
        width: Requested capture and window width in pixels.
        height: Requested capture and window height in pixels.
        min_confidence: Predictions whose best probability is below this
            value are shown using the fallback key -1 of ``class_names``.

    Returns:
        None. Prints an error and returns early if the webcam cannot be opened.
    """
    # Open a connection to the webcam
    cap = cv2.VideoCapture(camera_index)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    # try/finally guarantees the camera and the window are released even if
    # detection or classification raises in the middle of the loop.
    try:
        # Set the desired capture resolution
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

        # Resize the display window
        cv2.namedWindow('Webcam Feed', cv2.WINDOW_NORMAL)
        cv2.resizeWindow('Webcam Feed', width, height)

        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # OpenCV delivers frames in BGR order, while the image is declared
            # to MediaPipe as SRGB, so convert before wrapping it.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            detection_result = detector.detect(image)

            # Draw on the original BGR frame, which is what cv2.imshow expects.
            img = draw_landmarks_on_image(frame, detection_result)

            features = _hand_features(detection_result)
            if features is not None:
                # verbose=0 stops Keras from printing a progress bar on every frame.
                prob = loaded_model.predict(features, verbose=0)[0, :]

                # Keys of class_names are the argmax index + 1; -1 marks a
                # prediction that is not confident enough.
                best = int(np.argmax(prob))
                class_key = -1 if prob[best] < min_confidence else best + 1
                class_name = class_names.get(class_key)

                # Display the classification result on the frame
                cv2.putText(img, f'Sign: {class_name}', (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

            cv2.imshow('Webcam Feed', img)

            # Break the loop on 'q' key press
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # When everything is done (or on error), release the capture
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()