In [16]:
import cv2
import numpy as np
import os
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout

mp_hands = mp.solutions.hands  # ✅ فقط دست‌ها
mp_drawing = mp.solutions.drawing_utils

path_data = ".gitignore\.asl_alphabet_train"
labels = []

  path_data = ".gitignore\.asl_alphabet_train"


In [17]:
class Model:
    def __init__(self):
        self.model = None

    def extract_hand_feature_vector(self, results):
        """
        ورودی:
            results: خروجی mediapipe Hands (hand_landmarks, handedness)
        خروجی:
            feature_vector: آرایه numpy با شکل (134,)
                            شامل مختصات نرمال‌شده و زوایای بین انگشتان هر دو دست
        """

        def normalize_hand(hand_landmarks, hand_label):
            coords = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])

            # --- 1. انتقال مبدأ به مچ ---
            wrist = coords[0].copy()
            coords -= wrist

            # --- 2. ساخت محورهای کف دست ---
            index_mcp = coords[5]
            pinky_mcp = coords[17]
            palm_normal = np.cross(index_mcp, pinky_mcp)
            palm_normal /= np.linalg.norm(palm_normal) + 1e-9

            x_axis = index_mcp / (np.linalg.norm(index_mcp) + 1e-9)
            y_axis = np.cross(palm_normal, x_axis)
            y_axis /= np.linalg.norm(y_axis) + 1e-9
            z_axis = np.cross(x_axis, y_axis)
            z_axis /= np.linalg.norm(z_axis) + 1e-9

            R = np.vstack([x_axis, y_axis, z_axis]).T
            coords = coords @ R

            # --- 3. نرمال‌سازی اندازه ---
            scale = np.linalg.norm(coords[9])  # wrist تا middle_mcp
            coords /= (scale + 1e-9)

            # --- 4. آینه‌سازی برای دست چپ ---
            if hand_label.lower() == "left":
                coords[:, 0] *= -1

            return coords


        def compute_finger_angles(coords):
            """محاسبه زوایای بین انگشتان اصلی"""
            ids = [4, 8, 12, 16, 20]  # انتهای انگشتان
            vecs = [coords[i] - coords[0] for i in ids]
            angles = []
            for i in range(len(vecs) - 1):
                v1, v2 = vecs[i], vecs[i + 1]
                dot = np.dot(v1, v2)
                norm = np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8
                cos_theta = np.clip(dot / norm, -1.0, 1.0)
                angles.append(np.degrees(np.arccos(cos_theta)))
            return np.array(angles, dtype=float)

        # مقادیر پیش‌فرض برای زمانی که دست دیده نشود
        left_coords = np.zeros((21, 3))
        right_coords = np.zeros((21, 3))
        left_angles = np.zeros(4)
        right_angles = np.zeros(4)

        if results and getattr(results, "multi_hand_landmarks", None):
            for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
                # --- گرفتن label دست ---
                hand_label = results.multi_handedness[idx].classification[0].label  # 'Left' یا 'Right'

                # --- نرمال‌سازی ---
                coords = normalize_hand(hand_landmarks, hand_label)

                # --- محاسبه زاویه‌ها ---
                angles = compute_finger_angles(coords)

                # --- جایگذاری در خروجی ---
                if hand_label == "Left":
                    left_coords, left_angles = coords, angles
                else:
                    right_coords, right_angles = coords, angles

        # تخت‌سازی برای مدل
        feature_vector = np.concatenate([
            left_coords.flatten(), left_angles,
            right_coords.flatten(), right_angles
        ])

        return feature_vector
    
    def train(self, dataX_seq, dataY_seq, timesteps=30, features=134):
        if self.model == None:
            self.model = Sequential([
                LSTM(256, return_sequences=True, activation='relu', input_shape=(timesteps, features)),
                LSTM(128, return_sequences=False, activation='relu'),
                Dense(128, activation='relu'),
                Dropout(0.5),
                Dense(64, activation='relu'),
                Dense(len(labels), activation='softmax')
            ])


            self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
                        loss='sparse_categorical_crossentropy',
                        metrics=['accuracy'])

            self.model.fit(dataX_seq, dataY_seq, epochs=20, batch_size=32, verbose=1)
            self.model.save("ASL_letters_model.h5")
        # self.model = load_model("ASL_letters_model.h5")
        return
    def predict(self, input_seq):
        prediction = self.model.predict(input_seq, verbose=0)
        pred_class = np.argmax(prediction)
        return labels[pred_class]

In [18]:
ASL = Model()

In [19]:
labels = []
def data_to_array(path_data):
        global labels
        all_data_X, all_data_Y = [], []

        with mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5) as hands:
            for folder in os.listdir(path_data):
                folder_path = os.path.join(path_data, folder)
                if not os.path.isdir(folder_path):
                    continue
                if folder not in ['del', 'space', 'nothing']:
                    if folder not in labels:
                        labels.append(folder)
                    for filename in os.listdir(folder_path):
                        if 460 <= int(filename[1:-4]) <= 570:
                            file_path = os.path.join(folder_path, filename)
                            image = cv2.imread(file_path)
                            if image is None:
                                continue
                            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                            results = hands.process(image_rgb)
                            features = ASL.extract_hand_feature_vector(results)
                            all_data_X.append(features)
                            all_data_Y.append(labels.index(folder))

        return np.array(all_data_X), np.array(all_data_Y)

dataX, dataY = data_to_array(path_data)
print(labels)

timesteps = 30
features = 134

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [20]:
# ساخت توالی‌ها
sequences, labels_seq = [], []
for i in range(len(dataX) - timesteps + 1):
    sequences.append(dataX[i:i+timesteps])
    labels_seq.append(dataY[i+timesteps-1])

dataX_seq = np.array(sequences)
dataY_seq = np.array(labels_seq)

In [None]:
import time
ASL.train(dataX_seq, dataY_seq)
cap = cv2.VideoCapture(0)
fps = cap.get(cv2.CAP_PROP_FPS)

sequence_buffer = []
cooldown_frames = 5  # تعداد فریم‌هایی که نمایش داده نمی‌شود
frame_counter = 0
last_label = 'B'

with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=1) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(image_rgb)

        features = ASL.extract_hand_feature_vector(results)
        sequence_buffer.append(features)
        frame_counter += 1
        if len(sequence_buffer) > 30:
            sequence_buffer.pop(0)

        if len(sequence_buffer) == 30:
            input_seq = np.expand_dims(sequence_buffer, axis=0)
            before = time.time()
            if frame_counter % 5 == 0:
                pred_class = ASL.predict(input_seq)
            after = time.time()
            print(after - before)
            current_label = pred_class
            
            cv2.putText(frame, f'{current_label}', (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # رسم دست‌ها
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2),
                    mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=1)
                )

        cv2.imshow('ASL Detection (Hands Only)', frame)

        if cv2.getWindowProperty("ASL Detection (Hands Only)", cv2.WND_PROP_VISIBLE) < 1:
            break
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
print("FPS=", fps)
cap.release()
cv2.destroyAllWindows()

  super().__init__(**kwargs)


Epoch 1/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 68ms/step - accuracy: 0.0386 - loss: 17.4365
Epoch 2/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.0634 - loss: 8.6237
Epoch 3/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 66ms/step - accuracy: 0.1091 - loss: 4.7687
Epoch 4/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.1205 - loss: 3.6677
Epoch 5/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.1730 - loss: 3.3476
Epoch 6/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 64ms/step - accuracy: 0.2047 - loss: 3.6187
Epoch 7/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 65ms/step - accuracy: 0.2405 - loss: 3.0943
Epoch 8/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 65ms/step - accuracy: 0.3213 - loss: 3.0246
Epoch 9/20
[1m90/90[0m [32m━━━━━━━━━━━━━━━━



0.38397955894470215
0.0
0.0
0.0
0.0
0.11176133155822754
0.0
0.0
0.0
0.0
0.09060478210449219
0.0
0.0
0.0
0.0
0.09409952163696289
0.0
0.0
0.0
0.0
0.1143038272857666
0.0
0.0
0.0
0.0
0.0942375659942627
0.0
0.0
0.0
0.0
0.10692977905273438
0.0
0.0
0.0
0.0
0.10817289352416992
0.0
0.0
0.0
0.0
0.10036897659301758
0.0
0.0
0.0
0.0
0.09533190727233887
0.0
0.0
0.0
0.0
0.09903264045715332
0.0
0.0
0.0
0.0
0.10447382926940918
0.0
0.0
0.0
0.0
0.11275124549865723
0.0
0.0
0.0
0.0
0.09215164184570312
0.0
0.0
0.0
0.0
0.10158777236938477
0.0
0.0
0.0
0.0
0.0960845947265625
0.0
0.0
0.0
0.0
0.10825037956237793
0.0
0.0
0.0
0.0
0.10089349746704102
0.0
0.0
0.0
0.0
0.1066732406616211
0.0
0.0
0.0
0.0
0.10118913650512695
0.0
0.0
0.0
0.0
0.09275317192077637
0.0
0.0
0.0
0.0
0.0924062728881836
0.0
0.0
0.0
0.0
0.09729170799255371
0.0
0.0
0.0
0.0
0.08982181549072266
0.0
0.0
0.0
0.0
0.1049797534942627
0.0
0.0
0.0
0.0
0.0965726375579834
0.0
0.0
0.0
0.0
0.09683871269226074
0.0
0.0
0.0
0.0
0.10262584686279297
0.0
0.0
0.0
0.0

Words part:

In [None]:
def extract_hand_features(results, feature_length=126):
    """
    خروجی: feature vector با طول ثابت
    """
    left_hand = np.zeros(63)   # 21 نقاط × 3
    right_hand = np.zeros(63)

    if results.multi_hand_landmarks:
        for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
            coords = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])
            # نرمال‌سازی ساده: فاصله تا مچ
            wrist = coords[0].copy()
            coords -= wrist
            scale = np.linalg.norm(coords[9]) + 1e-9
            coords /= scale
            coords[:, 2] *= 0.3
            coords = coords.flatten()
            if idx == 0:
                left_hand[:coords.shape[0]] = coords
            elif idx == 1:
                right_hand[:coords.shape[0]] = coords

    features = np.concatenate([left_hand, right_hand])
    # اطمینان از طول ثابت
    if features.shape[0] < feature_length:
        padded = np.zeros(feature_length)
        padded[:features.shape[0]] = features
        features = padded
    elif features.shape[0] > feature_length:
        features = features[:feature_length]

    return features

def video_to_array(video_path, feature_length=126):
    cap = cv2.VideoCapture(video_path)
    features_list = []

    with mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                        min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(image_rgb)
            feature_vector = extract_hand_features(results, feature_length)
            features_list.append(feature_vector)

    cap.release()
    return np.array(features_list)  # شکل (num_frames, feature_length)

In [None]:
def pad_sequences(data, pad_value=0):
    # پیدا کردن بیشترین طول (تعداد فریم‌ها)
    max_len = max(len(arr) for arr in data)
    # پیدا کردن تعداد ویژگی‌ها از یک نمونه
    feature_dim = data[0].shape[1]
    # پدینگ برای هر نمونه
    padded_data = []
    for arr in data:
        padded_arr = np.pad(
            arr,
            ((0, max_len - len(arr)), (0, 0)),  # پدینگ به شکل (frames, features)
            constant_values=pad_value
        )
        padded_data.append(padded_arr)
    return np.array(padded_data)

def data_to_array2(path="ASLWL"):
    all_data_X = []
    all_data_Y = []

    with mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5) as hands:
        for folder in os.listdir(path):
            if folder == 'videos':
                folder_path = os.path.join(path, folder)
                for video in os.listdir(folder_path):
                    video_path = os.path.join(folder_path, video)
                    if int(video[:-4]) <= 600:
                        array = video_to_array(video_path)
                        all_data_X.append(array)
                        all_data_Y.append(video[:-4])

    # پیدا کردن بیشترین طول دنباله
    max_len = max(len(seq) for seq in all_data_X)

    # تبدیل به آرایه numpy و پدینگ
    X_padded = pad_sequences(all_data_X)

    return X_padded, all_data_Y

In [None]:
import json

try:
    with open(r"ASLWL\WLASL_v0.3.json", 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    print("Error: The specified file was not found.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

data2 = []
labels_ = []
for entry in data:
    gloss = entry["gloss"]
    if gloss not in labels_:
        labels_.append(gloss)
    for instance in entry["instances"]:
        video_id = instance["video_id"]
        data2.append([gloss, video_id])
print(data2)
print(len(labels_))
X_data, Y_data = data_to_array2()
for index in range(len(Y_data)):
    for Gloss, ID2 in data2:
        if Y_data[index] == ID2:
            Y_data[index] = Gloss
X_data = np.array(X_data)
Y_data = np.array(Y_data)
print(X_data.shape)
print(Y_data)

Error: The specified file was not found.


NameError: name 'data' is not defined

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

class Model_WLASL:
    def __init__(self):
        self.model = None
    def build_model(self, num_classes=2000):

        inp = Input(shape=(128, 126))  # فریم × ویژگی‌های هر فریم

        x = LSTM(256, return_sequences=True)(inp)
        x = LSTM(128)(x)

        x = Dense(128, activation="relu")(x)
        x = Dropout(0.3)(x)

        out = Dense(num_classes, activation="softmax")(x)

        self.model = Model(inp, out)
        self.model.compile(optimizer="adam", 
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])
        return
    
    def train(self, X_data, Y_data):
        self.model.train(X_data, Y_data, epochs=30)
        self.model.save('ASL_words_model.h5')

    def predict(self):
        prediction = self.model.predict(input_seq, verbose=0)
        pred_class = np.argmax(prediction)
        return labels[pred_class]
        

modelASL = Model_WLASL()
modelASL.build_model(2000)
modelASL.train(X_data, Y_data)
ASL.train(dataX_seq, dataY_seq)

cap = cv2.VideoCapture(0)
sequence_buffer = []
cooldown_frames = 5  # تعداد فریم‌هایی که نمایش داده نمی‌شود
frame_counter = 0
last_label = 'B'

with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=1) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(image_rgb)

        features = ASL.extract_hand_feature_vector(results)
        sequence_buffer.append(features)
        if len(sequence_buffer) > 30:
            sequence_buffer.pop(0)

        if len(sequence_buffer) == 30:
            input_seq = np.expand_dims(sequence_buffer, axis=0)
            pred_class = ASL.predict(input_seq)
            current_label = pred_class

            if current_label != last_label:
                frame_counter = cooldown_frames
                last_label = current_label
            if frame_counter > 0:
                frame_counter -= 1
                display_label = None  # نمایش نده
            else:
                display_label = current_label
            cv2.putText(frame, f'{display_label}', (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # رسم دست‌ها
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2),
                    mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=1)
                )

        cv2.imshow('ASL Detection (Hands Only)', frame)

        if cv2.getWindowProperty("ASL Detection (Hands Only)", cv2.WND_PROP_VISIBLE) < 1:
            break
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()
