**Pakistan Sign Language Detection**

**Packages**
- opencv-python
- cvzone
- mediapipe
- numpy
- pyttsx3 (text to speech)
- gtts (Google Translate text-to-speech)
- playsound (plays the MP3 file generated by gtts)

Without Voice

In [None]:
import cv2
from cvzone.HandTrackingModule import HandDetector
from cvzone.ClassificationModule import Classifier
import numpy as np
import math
import pyttsx3

# Real-time Pakistan Sign Language recognition from camera index 0 (no speech).
# Each frame: detect one hand, crop it with some padding, letterbox the crop
# onto a white square, classify the square and draw the result on the frame.
# Press 'q' in the preview window to quit.
cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)
classifier = Classifier("Model/keras_model.h5", "Model/labels.txt")

offset = 20    # padding in pixels added around the detected hand's bounding box
imgSize = 300  # side length in pixels of the square image passed to the classifier
# Display names, looked up with the class index returned by the classifier.
labels = ["Alif", "Bay", "Pay", 'Tey', 'Tay', 'Thay', 'Jeem', 'Chay', 'Hey', 'Khay', 'Daal',
          'Ddaal', 'Dhaal', 'Ray', 'aRay', 'Zay', 'Djay', 'Seen', 'Sheen', 'Suaad', 'Dhwaad',
          "Toay'n", "Zoay'n", 'Ain', 'Ghain', 'Fay', 'Quaaf', 'Kaaf', 'Gaaf', 'Laam', 'Meem', 'Noon',
          'Vao', 'Hay', 'Hamza', 'Chooti yeh', 'Bari yeh']

try:
    while True:
        success, img = cap.read()
        if not success or img is None:
            # No frame available (camera missing, busy or disconnected):
            # stop cleanly instead of crashing on img.copy().
            break
        imgOutput = img.copy()
        hands, img = detector.findHands(img)

        if hands:
            x, y, w, h = hands[0]['bbox']

            # Padded crop, clamped to the frame so the slice never wraps around.
            y1 = max(0, y - offset)
            y2 = min(img.shape[0], y + h + offset)
            x1 = max(0, x - offset)
            x2 = min(img.shape[1], x + w + offset)
            imgCrop = img[y1:y2, x1:x2]

            # A degenerate box would divide by zero or make cv2.resize fail.
            if w > 0 and h > 0 and imgCrop.size > 0:
                # Scale the longer side to imgSize. Multiplying before dividing
                # keeps the shorter side <= imgSize; `imgSize / w * h` could
                # round up to imgSize + 1 and break the paste below.
                if h > w:
                    newW, newH = math.ceil(imgSize * w / h), imgSize
                else:
                    newW, newH = imgSize, math.ceil(imgSize * h / w)
                imgResize = cv2.resize(imgCrop, (newW, newH))

                # Centre the resized crop on a white square canvas.
                wGap = math.ceil((imgSize - newW) / 2)
                hGap = math.ceil((imgSize - newH) / 2)
                imgWhite = np.full((imgSize, imgSize, 3), 255, np.uint8)
                imgWhite[hGap:hGap + newH, wGap:wGap + newW] = imgResize

                prediction, index = classifier.getPrediction(imgWhite, draw=False)

                # Confidence score (percentage)
                confidence = int(prediction[index] * 100)
                # Fall back to the raw class index if the model reports a class
                # that has no entry in `labels`.
                name = labels[index] if index < len(labels) else str(index)

                # Draw label + confidence % on a filled banner above the hand.
                cv2.rectangle(imgOutput, (x - offset, y - offset - 50),
                              (x - offset + 250, y - offset - 50 + 50), (255, 0, 255), cv2.FILLED)
                cv2.putText(imgOutput, f"{name} {confidence}%", (x, y - 26),
                            cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 255, 255), 2)

                # Outline the padded hand region.
                cv2.rectangle(imgOutput, (x - offset, y - offset),
                              (x + w + offset, y + h + offset), (255, 0, 255), 4)

                cv2.imshow("ImageCrop", imgCrop)
                cv2.imshow("ImageWhite", imgWhite)

        cv2.imshow("Pakistan Sign Language Detection", imgOutput)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview windows, even on error.
    cap.release()
    cv2.destroyAllWindows()


Add Voice

In [None]:
import cv2
from cvzone.HandTrackingModule import HandDetector
from cvzone.ClassificationModule import Classifier
import numpy as np
import math
from gtts import gTTS
import os
import playsound
import threading
import pyttsx3
import time




# --- Tunables ---------------------------------------------------------------
offset = 20          # pixels of padding around the hand bounding box
imgSize = 300        # edge length of the square image given to the classifier
speak_delay = 2      # minimum number of seconds between two spoken labels
last_speak_time = 0  # time of the most recent speak() call; 0 = nothing spoken yet

# Display names, looked up with the class index returned by the classifier.
labels = [
    "Alif", "Bay", "Pay", 'Tey', 'Tay', 'Thay', 'Jeem', 'Chay',
    'Hey', 'Khay', 'Daal', 'Ddaal', 'Dhaal', 'Ray', 'aRay', 'Zay',
    'Djay', 'Seen', 'Sheen', 'Suaad', 'Dhwaad', "Toay'n", "Zoay'n", 'Ain',
    'Ghain', 'Fay', 'Quaaf', 'Kaaf', 'Gaaf', 'Laam', 'Meem', 'Noon',
    'Vao', 'Hay', 'Hamza', 'Chooti yeh', 'Bari yeh',
]

# --- Camera, hand detector and classifier -----------------------------------
cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)
classifier = Classifier("Model/keras_model.h5", "Model/labels.txt")

# --- pyttsx3 engine ---------------------------------------------------------
# NOTE(review): this engine is configured here, but the speak() helper in this
# cell synthesises audio through gTTS - confirm the engine is still needed.
engine = pyttsx3.init()
engine.setProperty('rate', 150)
engine.setProperty('volume', 1)

# function for threaded speaking
def speak(text):
    """Speak *text* in Urdu on a background thread so the video loop keeps running.

    The audio is synthesised with gTTS into a temporary MP3 file in the
    current directory, played with playsound and then deleted.

    Args:
        text: The text to speak. Empty text is ignored, since gTTS rejects it.

    Returns:
        None. The call returns immediately; playback happens on the new thread.
    """
    import uuid

    if not text:
        return

    def run():
        # One file per call: with a shared "temp.mp3", overlapping calls would
        # overwrite or delete a file that another thread is still playing.
        filename = f"temp_{uuid.uuid4().hex}.mp3"
        try:
            tts = gTTS(text=text, lang='ur')   # language "Urdu"
            tts.save(filename)
            playsound.playsound(filename)
        finally:
            # Clean up even when synthesis or playback fails.
            try:
                os.remove(filename)
            except FileNotFoundError:
                pass  # save() failed before the file was created

    threading.Thread(target=run).start()

# Main loop: classify the hand in every frame, draw the result and speak the
# label when the prediction is confident. Press 'q' in the preview window to quit.
try:
    while True:
        success, img = cap.read()
        if not success or img is None:
            # No frame available (camera missing, busy or disconnected):
            # stop cleanly instead of crashing on img.copy().
            break
        imgOutput = img.copy()
        hands, img = detector.findHands(img)

        if hands:
            x, y, w, h = hands[0]['bbox']

            # Padded crop, clamped to the frame so the slice never wraps around.
            y1 = max(0, y - offset)
            y2 = min(img.shape[0], y + h + offset)
            x1 = max(0, x - offset)
            x2 = min(img.shape[1], x + w + offset)
            imgCrop = img[y1:y2, x1:x2]

            # A degenerate box would divide by zero or make cv2.resize fail.
            if w > 0 and h > 0 and imgCrop.size > 0:
                # Scale the longer side to imgSize. Multiplying before dividing
                # keeps the shorter side <= imgSize; `imgSize / w * h` could
                # round up to imgSize + 1 and break the paste below.
                if h > w:
                    newW, newH = math.ceil(imgSize * w / h), imgSize
                else:
                    newW, newH = imgSize, math.ceil(imgSize * h / w)
                imgResize = cv2.resize(imgCrop, (newW, newH))

                # Centre the resized crop on a white square canvas.
                wGap = math.ceil((imgSize - newW) / 2)
                hGap = math.ceil((imgSize - newH) / 2)
                imgWhite = np.full((imgSize, imgSize, 3), 255, np.uint8)
                imgWhite[hGap:hGap + newH, wGap:wGap + newW] = imgResize

                prediction, index = classifier.getPrediction(imgWhite, draw=False)

                confidence = int(prediction[index] * 100)
                # Fall back to the raw class index if the model reports a class
                # that has no entry in `labels`.
                label = labels[index] if index < len(labels) else str(index)

                # Draw label + confidence % on a filled banner above the hand.
                cv2.rectangle(imgOutput, (x - offset, y - offset - 50),
                              (x - offset + 250, y - offset - 50 + 50), (255, 0, 255), cv2.FILLED)
                cv2.putText(imgOutput, f"{label} {confidence}%", (x, y - 26),
                            cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 255, 255), 2)

                # Outline the padded hand region.
                cv2.rectangle(imgOutput, (x - offset, y - offset),
                              (x + w + offset, y + h + offset), (255, 0, 255), 4)

                # Speech: only for confident predictions, and at most once per
                # speak_delay seconds so the same sign is not repeated every frame.
                current_time = time.time()
                if confidence > 80 and (current_time - last_speak_time) > speak_delay:
                    speak(label)
                    last_speak_time = current_time

                cv2.imshow("ImageCrop", imgCrop)
                cv2.imshow("ImageWhite", imgWhite)

        cv2.imshow("Pakistan Sign Language Detection", imgOutput)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview windows, even on error.
    cap.release()
    cv2.destroyAllWindows()
