In [1]:
pip install numpy opencv-python mediapipe pycaw comtypes

Collecting mediapipe
  Obtaining dependency information for mediapipe from https://files.pythonhosted.org/packages/c1/0f/4dc0802131756a9fe4d46d2824352014b85a75baca386cb9e43057f39f15/mediapipe-0.10.14-cp311-cp311-win_amd64.whl.metadata
  Using cached mediapipe-0.10.14-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting pycaw
  Obtaining dependency information for pycaw from https://files.pythonhosted.org/packages/c2/e2/89e3e096d8926f19cbcf2991ae86d19e6705ea75ad0212862461cb4b83d8/pycaw-20240210-py3-none-any.whl.metadata
  Downloading pycaw-20240210-py3-none-any.whl.metadata (1.7 kB)
Collecting jax (from mediapipe)
  Obtaining dependency information for jax from https://files.pythonhosted.org/packages/49/48/0e32458ab7e02d75f423fe8c2ab10d7fa1aba9b314391d2659e68891912b/jax-0.4.33-py3-none-any.whl.metadata
  Using cached jax-0.4.33-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib (from mediapipe)
  Obtaining dependency information for jaxlib from https://files.pythonhosted.org/packages

In [3]:
import cv2
import numpy as np
import mediapipe as mp
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
from comtypes import CLSCTX_ALL
from ctypes import cast, POINTER


# Shared hand-landmark detector used by process_frame(). Only the first
# reported hand is ever read there, so limit detection to a single hand
# rather than relying on the library default.
mp_hands = mp.solutions.hands.Hands(max_num_hands=1)


def get_default_audio_device():
    """Return an ``IAudioEndpointVolume`` pointer for the speakers device.

    The device comes from ``AudioUtilities.GetSpeakers()``; its
    ``IAudioEndpointVolume`` interface is activated with ``CLSCTX_ALL`` and
    the resulting object is cast to ``POINTER(IAudioEndpointVolume)``.

    Returns:
        The activated interface, cast to ``POINTER(IAudioEndpointVolume)``.
    """
    speakers = AudioUtilities.GetSpeakers()
    endpoint = speakers.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
    return cast(endpoint, POINTER(IAudioEndpointVolume))

# Endpoint-volume interface shared by the rest of the notebook.
volume = get_default_audio_device()

# GetVolumeRange() is unpacked into three values; only the first two are kept
# (as the lower and upper bound passed to SetMasterVolumeLevel later on).
vol_range = volume.GetVolumeRange()
(min_vol, max_vol, _) = vol_range

def get_fingertip_distance(landmarks, finger_tip_1, finger_tip_2):
    """Return the Euclidean distance between two landmarks in the x/y plane.

    Args:
        landmarks: Indexable collection of points; only the first two
            components of each selected point are used.
        finger_tip_1: Index of the first point in ``landmarks``.
        finger_tip_2: Index of the second point in ``landmarks``.

    Returns:
        ``np.sqrt`` of the summed squared x and y differences.
    """
    ax, ay = landmarks[finger_tip_1][:2]
    bx, by = landmarks[finger_tip_2][:2]
    dx = bx - ax
    dy = by - ay
    return np.sqrt(dx ** 2 + dy ** 2)

def process_frame(image, min_dist=0.02, max_dist=0.15):
    """Set the master volume from the thumb/index pinch distance in a frame.

    The frame is converted from BGR to RGB and handed to the module-level
    ``mp_hands`` detector. When at least one hand is reported, the distance
    between the first hand's thumb tip and index-finger tip (in the units of
    the landmarks' ``x``/``y`` attributes) is mapped linearly from
    ``[min_dist, max_dist]`` onto ``[min_vol, max_vol]`` and applied with
    ``volume.SetMasterVolumeLevel``. When no hand is reported the volume is
    left untouched.

    Args:
        image: Frame in BGR channel order, as accepted by ``cv2.cvtColor``
            with ``cv2.COLOR_BGR2RGB``.
        min_dist: Pinch distance that maps to ``min_vol``. Defaults to 0.02.
        max_dist: Pinch distance that maps to ``max_vol``. Defaults to 0.15.

    Returns:
        The input ``image``, unmodified.
    """
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = mp_hands.process(image_rgb)

    # Nothing to do when no hand was found in this frame.
    if not results.multi_hand_landmarks:
        return image

    # Only the first reported hand drives the volume.
    hand_landmarks = results.multi_hand_landmarks[0]
    landmarks = [(lm.x, lm.y) for lm in hand_landmarks.landmark]

    hand_landmark = mp.solutions.hands.HandLandmark
    distance = get_fingertip_distance(
        landmarks, hand_landmark.THUMB_TIP, hand_landmark.INDEX_FINGER_TIP
    )

    # The clip is kept as a guard so floating-point rounding in the
    # interpolation can never push the level outside the device's range.
    vol = np.interp(distance, [min_dist, max_dist], [min_vol, max_vol])
    vol = np.clip(vol, min_vol, max_vol)

    # Hand a plain Python float to the COM call rather than a NumPy scalar.
    volume.SetMasterVolumeLevel(float(vol), None)

    return image

# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

# The try/finally guarantees the camera and the preview window are released
# even if process_frame() raises or the kernel is interrupted mid-loop;
# otherwise the webcam stays locked until the kernel is restarted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No frame could be read; stop the loop.
            break

        processed_frame = process_frame(frame)

        cv2.imshow('Hand Gesture Volume Control', processed_frame)

        # Press 'q' in the preview window to quit.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
