## Imports

In [19]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention, Dropout, Dense
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from PIL import Image, ImageTk
import cv2
import mediapipe as mp

## Set Paths and Configurations

In [20]:
# File locations. Each default is the original absolute path, so behavior is
# unchanged on the authoring machine; set the matching environment variable to
# run the notebook elsewhere without editing this cell.
MODEL_PATH = os.environ.get('SIGN_APP_MODEL_PATH', r'G:\Capstone data\(Enhanced)_models\ViT_fold4.h5')
ALL_SENTENCES_PATH = os.environ.get('SIGN_APP_SENTENCES_PATH', r'G:\Capstone data\all_Sentences.xlsx')
LABELS_PATH = os.environ.get('SIGN_APP_LABELS_PATH', r'G:\Capstone data\KARSL-502_Labels.xlsx')
BACKGROUND_PATH = os.environ.get('SIGN_APP_BACKGROUND_PATH', r"G:\Capstone data\background.png")

F_AVG = 30  # Number of frames (time steps) expected by model
MIN_IMAGES = 2  # Minimum number of images
MAX_IMAGES = 5  # Maximum number of images

## Model Loading

In [21]:
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block with layer normalization applied before each sub-layer.

    The block runs self-attention and then a two-layer feed-forward network,
    each preceded by LayerNormalization and followed by dropout and a residual
    connection.

    Args:
        embed_dim: Size of the feature dimension; also used as ``key_dim`` of
            the attention layer and as the output width of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # NOTE: sub-layers are created in the same order as before; weight
        # ordering can matter when restoring weights from an existing file.
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(rate)

        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim),
        ])
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply attention and feed-forward sub-layers with residual connections.

        Args:
            inputs: Input tensor; it is added back to the attention output, so
                its last dimension must match what the attention layer returns.
            training: When true, dropout is active.

        Returns:
            Tensor of the same shape as ``inputs``.
        """
        # Self-attention on the normalized input (query and value are the same tensor).
        x_norm = self.layernorm1(inputs)
        attn_output = self.att(x_norm, x_norm)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = inputs + attn_output

        # Position-wise feed-forward on the normalized residual stream.
        out_norm = self.layernorm2(out1)
        ffn_output = self.ffn(out_norm)
        ffn_output = self.dropout2(ffn_output, training=training)
        return out1 + ffn_output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

class PatchEmbedding(tf.keras.layers.Layer):
    """Linear projection of the input followed by a learned additive position embedding.

    Args:
        num_patches: Number of positions covered by the position embedding.
        embed_dim: Width of the projection and of the position embedding.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_patches, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches, self.embed_dim = num_patches, embed_dim
        # The projection is created before the position weight; keep this order.
        self.projection = Dense(embed_dim)
        self.position_embedding = self.add_weight(
            name="pos_embed",
            shape=(1, num_patches, embed_dim),
            initializer="random_normal"
        )

    def call(self, patch):
        """Project ``patch`` to ``embed_dim`` and add the position embedding."""
        return self.projection(patch) + self.position_embedding

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config['num_patches'] = self.num_patches
        config['embed_dim'] = self.embed_dim
        return config

In [22]:
# Custom layer classes that load_model needs in order to rebuild the saved network
custom_objects = dict(
    TransformerBlock=TransformerBlock,
    PatchEmbedding=PatchEmbedding,
)

# Restore the trained ViT model from disk
vit_model = load_model(MODEL_PATH, custom_objects=custom_objects)
print("ViT model loaded successfully.")



ViT model loaded successfully.


## Load Labels and Sentences

In [23]:
# Sign vocabulary: Arabic word <-> SignID lookups plus a word -> position map
labels_df = pd.read_excel(LABELS_PATH)
w2id = {word: sign_id for word, sign_id in zip(labels_df['Sign-Arabic'], labels_df['SignID'])}
id2w = {sign_id: word for word, sign_id in w2id.items()}
words = list(w2id)
label_map = dict(zip(words, range(len(words))))
NUM_CLASSES = len(words)
print(f"Loaded {NUM_CLASSES} labels.")

# Candidate sentences; header whitespace is stripped before the column lookup
all_sentences_df = pd.read_excel(ALL_SENTENCES_PATH)
all_sentences_df.columns = all_sentences_df.columns.str.strip()
sentences = all_sentences_df['Sentences'].tolist()
print(f"Loaded {len(sentences)} sentences.")

Loaded 502 labels.
Loaded 350 sentences.


## Preprocessing Functions (Arabic Stemmer and Similarity Functions)

In [24]:
# Shared TF-IDF vectorizer; get_most_similar_sentence() re-fits it on every call.
vectorizer = TfidfVectorizer()

# (source, replacement) pairs, applied in this order by normalize().
_NORMALIZATION_PAIRS = (
    ("أ", "ا"),
    ("ي", "ى"),
    ("ؤ", "ء"),
    ("ئ", "ء"),
    ("ة", "ه"),
    ("گ", "ك"),
)


def normalize(word):
    """Map Arabic letter variants in ``word`` to a single canonical form.

    Every occurrence of each source letter is replaced. One pass is enough:
    no replacement letter is itself a source letter, so repeating the pass
    (as the earlier three-iteration loop did) cannot change the result.

    Args:
        word: The string to normalize.

    Returns:
        The normalized string.
    """
    for source, replacement in _NORMALIZATION_PAIRS:
        word = word.replace(source, replacement)
    return word

def Def_articles_removal(word):
    """Strip one leading definite-article form from ``word``.

    The longest matching entry is removed, and only at the start of the word.
    The previous implementation used ``str.replace``, which also deleted every
    later occurrence of the same letters inside the word, and it kept testing
    the remaining (shorter) entries against the already-stripped word.

    Args:
        word: A single token.

    Returns:
        ``word`` without its leading article, or ``word`` unchanged when no
        entry matches.
    """
    articles = ['بال', 'فال', 'وال', 'كال', 'ولل', 'ال', 'ل', 'لي', 'ا', 'فبال', 'لبال', 'وبال']
    # Longest first, so 'ل' or 'ا' cannot pre-empt the multi-letter forms.
    for article in sorted(articles, key=len, reverse=True):
        if word.startswith(article):
            return word[len(article):]
    return word

def prefix_removal(word):
    """Strip one leading prefix from ``word``.

    The longest matching prefix is removed, and only at the start of the word.
    The previous implementation used ``str.replace``, which deleted the prefix
    letters everywhere in the word, and it tested the single-letter prefixes
    first, so the three-letter entries could never match.

    Args:
        word: A single token.

    Returns:
        ``word`` without its leading prefix, or ``word`` unchanged when no
        prefix matches.
    """
    p1 = ["و", "ف", "ب", "ك", "ل"]
    p2 = ['أل']
    p3 = ['وال', 'فال', 'كال']
    # Longest first, so the multi-letter prefixes are tried before single letters.
    for prefix in sorted(p1 + p2 + p3, key=len, reverse=True):
        if word.startswith(prefix):
            return word[len(prefix):]
    return word

def suffix_removal(word):
    """Strip one trailing suffix from ``word``.

    The longest matching suffix is removed, and only at the end of the word.
    The previous implementation used ``str.replace``, which deleted the suffix
    letters everywhere in the word (e.g. a leading and a trailing 'ك' alike).

    Args:
        word: A single token.

    Returns:
        ``word`` without its trailing suffix, or ``word`` unchanged when no
        suffix matches.
    """
    # NOTE(review): 'ي ' ends with a space, so it cannot match a token produced
    # by str.split(); left as-is pending confirmation of the intended suffix.
    s1 = ['ي ', 'ك', 'ـه']
    s2 = ["هن", "ها", "هم", "كن"]
    s3 = ["هما"]
    # Longest first, so the three-letter suffix is tried before shorter ones.
    for suffix in sorted(s1 + s2 + s3, key=len, reverse=True):
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

def Arabic_stemmer(text):
    """Stem every item of ``text`` and return the results as a list.

    Each item goes through normalize, Def_articles_removal, prefix_removal and
    suffix_removal, in that order.
    """
    return [
        suffix_removal(prefix_removal(Def_articles_removal(normalize(token))))
        for token in text
    ]

def get_most_similar_sentence(sentences, sign_words):
    """Return the entry of ``sentences`` closest to ``sign_words`` by TF-IDF cosine similarity.

    Both the candidate sentences (split on whitespace) and ``sign_words`` are
    stemmed with Arabic_stemmer before the module-level ``vectorizer`` is
    fitted on all of them together.

    Args:
        sentences: List of candidate sentence strings.
        sign_words: List of words to match against the candidates.

    Returns:
        The original (unstemmed) sentence with the highest similarity score.
    """
    query = ' '.join(Arabic_stemmer(sign_words))
    corpus = [' '.join(Arabic_stemmer(candidate.split())) for candidate in sentences]

    # The query is appended last so it ends up in the final row of the matrix.
    tfidf_matrix = vectorizer.fit_transform(corpus + [query])
    query_row = tfidf_matrix[-1]
    corpus_rows = tfidf_matrix[:-1]

    best_idx = cosine_similarity(query_row, corpus_rows).argmax()
    return sentences[best_idx]

## Keypoint Extraction Functions

In [25]:
# Shortcut to the MediaPipe Holistic solution used by process_single_image().
mp_holistic = mp.solutions.holistic
# NOTE(review): mp_drawing is not referenced anywhere else in the visible notebook.
mp_drawing = mp.solutions.drawing_utils

def mediapipe_detection(image, model):
    """
    Run ``model.process`` on an RGB copy of a BGR image.

    Returns a tuple ``(image, results)``: the frame converted back to BGR and
    the object returned by ``model.process``.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The buffer is marked read-only only while the model processes it.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

def adjust_landmarks(arr, center):
    """
    Re-express flattened (x, y, z) landmarks relative to a reference point.

    ``arr`` is viewed as rows of three values, ``center`` is subtracted from
    every row, and the result is flattened back to one dimension.
    """
    points = arr.reshape(-1, 3)
    # Broadcasting subtracts the same center from each row.
    return (points - center).reshape(-1)

def extract_keypoints(results):
    """
    Build centered, flattened keypoint vectors for pose, left hand and right hand.

    A missing landmark set is replaced by zeros (33 points for the pose, 21 for
    each hand). Each vector is then shifted so that its first landmark becomes
    the origin. Returns ``(pose, left_hand, right_hand)``.
    """
    def _flatten_xyz(landmark_set, n_points):
        # Flatten to [x0, y0, z0, x1, ...]; fall back to zeros when nothing was detected.
        if landmark_set:
            return np.array([[pt.x, pt.y, pt.z] for pt in landmark_set.landmark]).flatten()
        return np.zeros(n_points * 3)

    pose = _flatten_xyz(results.pose_landmarks, 33)
    lh = _flatten_xyz(results.left_hand_landmarks, 21)
    rh = _flatten_xyz(results.right_hand_landmarks, 21)

    # The first three values of each vector are its reference point
    # (nose for the pose, wrist for each hand, per the original comments).
    return (
        adjust_landmarks(pose, pose[:3]),
        adjust_landmarks(lh, lh[:3]),
        adjust_landmarks(rh, rh[:3]),
    )

def process_single_image(image_path):
    """
    Turn one image into a sequence of F_AVG keypoint vectors.

    The same image is passed through MediaPipe Holistic F_AVG times so the
    result has the number of time steps the model expects.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A numpy array with one row per repetition; each row is the pose,
        left-hand and right-hand keypoints concatenated.

    Raises:
        ValueError: If the image cannot be read.
    """
    # Validate the input before creating the (comparatively expensive) Holistic graph.
    frame = cv2.imread(image_path)
    if frame is None:
        raise ValueError(f"Could not read image {image_path}")

    keypoints_seq = []
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for _ in range(F_AVG):
            # Only the detection results are needed; the returned frame is discarded.
            _, results = mediapipe_detection(frame.copy(), holistic)
            pose, lh, rh = extract_keypoints(results)
            keypoints_seq.append(np.concatenate([pose, lh, rh]))

    return np.array(keypoints_seq)

def predict_single_image(vit_model, labels_df, keypoints_data):
    """
    Predict the sign for one keypoint sequence.

    Args:
        vit_model: Model exposing ``predict``.
        labels_df: DataFrame with a 'Sign-Arabic' column; the row position is
            taken to be the class index. (Assumes row order matches the
            model's output order; confirm against the training code.)
        keypoints_data: Array that can be reshaped to ``(1, F_AVG, -1)``.

    Returns:
        The 'Sign-Arabic' value at the row position of the highest-scoring class.
    """
    input_data = keypoints_data.reshape(1, F_AVG, -1)  # Add the batch dimension
    predictions = vit_model.predict(input_data)
    predicted_index = int(np.argmax(predictions, axis=1)[0])
    # The class index is a row position, so use positional .iloc; plain [] would
    # look the value up by index label.
    return labels_df['Sign-Arabic'].iloc[predicted_index]

## Prediction Functions

In [26]:
# process_single_image() and predict_single_image() are defined once, in the
# "Keypoint Extraction Functions" cell above. The copies that used to live in
# this cell were identical re-definitions that silently shadowed the earlier
# ones, so they were removed.

## GUI

In [27]:
class TranslationResultPopup(tk.Toplevel):
    """Popup showing each uploaded image with its predicted sign and the matched sentence.

    Args:
        parent: Parent Tk widget.
        uploaded_images: Paths of the images, in upload order.
        predicted_signs: Predicted sign for each image, in the same order.
        translation: Text shown in the "Full Translation" box.
    """

    def __init__(self, parent, uploaded_images, predicted_signs, translation):
        super().__init__(parent)
        self.title("Translation Result")  # Window title
        self.geometry("900x700")  # Window size
        self.resizable(False, False)  # Disable resizing
        self.configure(bg='#f0f0f0')  # Background color

        # Main frame with scrollbar for displaying the images and results
        main_frame = tk.Frame(self, bg='#f0f0f0')
        main_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)

        # Create a canvas to hold the scrollable content
        canvas = tk.Canvas(main_frame, bg='#f0f0f0')
        scrollbar = ttk.Scrollbar(main_frame, orient="vertical", command=canvas.yview)
        scrollable_frame = tk.Frame(canvas, bg='#f0f0f0')

        # Update the scrollable area as content changes
        scrollable_frame.bind(
            "<Configure>",
            lambda e: canvas.configure(
                scrollregion=canvas.bbox("all")
            )
        )

        canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)

        canvas.pack(side="left", fill="both", expand=True)
        scrollbar.pack(side="right", fill="y")

        # Title label
        title_label = tk.Label(scrollable_frame, text="Translation Result",
                               font=("Arial", 18, "bold"), bg='#f0f0f0')
        title_label.pack(pady=10)

        # Frame to hold uploaded images and predictions
        images_frame = tk.Frame(scrollable_frame, bg='#f0f0f0')
        images_frame.pack(fill=tk.X, pady=10)

        # Display each uploaded image with its predicted sign in a 3-column grid
        for i, (img_path, sign) in enumerate(zip(uploaded_images, predicted_signs)):
            self._add_image_card(images_frame, i, img_path, sign)

        # Frame for displaying the full translation
        translation_frame = tk.Frame(scrollable_frame, bg='#f0f0f0')
        translation_frame.pack(fill=tk.X, pady=20)

        tk.Label(translation_frame, text="Full Translation:",
                 font=("Arial", 14, "bold"), bg='#f0f0f0').pack(anchor='w')

        # Display the full translation in a non-editable text box
        translation_text = tk.Text(translation_frame, height=5, width=80,
                                   font=("Arial", 12), wrap=tk.WORD, padx=10, pady=10)
        translation_text.insert(tk.END, translation)
        translation_text.config(state=tk.DISABLED)
        translation_text.pack(fill=tk.X, pady=10)

        # Close button for the popup
        close_btn = tk.Button(scrollable_frame, text="Close", font=("Arial", 12),
                              command=self.destroy)
        close_btn.pack(pady=10)

    def _add_image_card(self, container, index, img_path, sign):
        """Add one thumbnail + prediction cell to ``container`` at grid slot ``index``."""
        # The frame is created outside the try block so the error handler can
        # always attach its label to this image's own cell.
        pair_frame = tk.Frame(container, bg='#f0f0f0', padx=10, pady=10)
        pair_frame.grid(row=index // 3, column=index % 3, padx=10, pady=10)

        try:
            # The context manager closes the file once the thumbnail has been
            # copied into the Tk image.
            with Image.open(img_path) as img:
                img.thumbnail((200, 200), Image.LANCZOS)
                photo = ImageTk.PhotoImage(img)
        except Exception as e:
            # Show a placeholder for this cell and keep going with the other images
            print(f"Error loading image {img_path}: {e}")
            error_label = tk.Label(pair_frame, text=f"Image {index+1} not found",
                                   font=("Arial", 12), bg='#f0f0f0')
            error_label.pack()
            return

        img_label = tk.Label(pair_frame, image=photo, bg='white')
        img_label.image = photo  # Keep a reference so the image is not garbage-collected
        img_label.pack()

        # Label showing the image number
        img_num = tk.Label(pair_frame, text=f"Image {index+1}",
                           font=("Arial", 10), bg='#f0f0f0')
        img_num.pack()

        # Label showing the predicted sign
        pred_label = tk.Label(pair_frame, text=f"Predicted: {sign}",
                              font=("Arial", 12), bg='#f0f0f0')
        pred_label.pack(pady=5)

class SignLanguageApp(tk.Tk):
    """Main application window: lets the user upload images and view the predicted translation.

    Attributes set in ``__init__``:
        uploaded_images: Paths chosen in the current upload session.
        num_images: How many images the user asked to upload.
        current_image: 1-based number of the image being selected.
    """

    def __init__(self):
        super().__init__()
        self.title("Sign Language Prediction")  # Window title
        self.geometry("900x600")  # Window size
        self.resizable(False, False)  # Disable resizing
        self.configure(bg='#f0f0f0')  # Background color
        self.uploaded_images = []  # List to store uploaded images
        self.num_images = 0  # Number of images to upload
        self.current_image = 0  # Track the current image number
        self._process_btn = None  # "Process Images" button of the open upload window, if any

        # Load background image if it exists
        if os.path.exists(BACKGROUND_PATH):
            bg_image = Image.open(BACKGROUND_PATH)
            bg_image = bg_image.resize((900, 600), Image.LANCZOS)
            self.bg_photo = ImageTk.PhotoImage(bg_image)
            bg_label = tk.Label(self, image=self.bg_photo)
            bg_label.place(x=0, y=0, relwidth=1, relheight=1)
        else:
            print("Background image not found at", BACKGROUND_PATH)

        # Button styling
        button_bg = '#f0f0f0'
        button_fg = 'black'
        button_font = ("Arial", 14)

        # Button to start live camera caption (coming soon)
        live_btn = tk.Button(
            self,
            text="Live Camera Caption",
            font=button_font,
            bg=button_bg,
            fg=button_fg,
            activebackground='gray',
            activeforeground='black',
            command=self.live_camera
        )
        live_btn.place(x=450, y=420, anchor="n")

        # Button to upload images
        upload_btn = tk.Button(
            self,
            text="Upload Images",
            font=button_font,
            bg=button_bg,
            fg=button_fg,
            activebackground='gray',
            activeforeground='black',
            command=self.upload_images
        )
        upload_btn.place(x=450, y=470, anchor="n")

    def live_camera(self):
        """Placeholder for the live-camera feature; only shows an info dialog."""
        messagebox.showinfo("Info", "This feature coming soon! Stay tuned😊")

    def upload_images(self):
        """Open the modal window where the user picks how many images to upload."""
        upload_window = tk.Toplevel(self)
        upload_window.title("Upload Images")
        upload_window.geometry("600x400")
        upload_window.grab_set()  # Make this window modal (blocks interaction with the main window)
        self._process_btn = None  # A new window starts without a "Process Images" button

        # Guide message for number of images to upload
        self.status_var = tk.StringVar()
        self.status_var.set(f"Select number of images to upload ({MIN_IMAGES}-{MAX_IMAGES})")
        status_label = tk.Label(upload_window, textvariable=self.status_var,
                                font=("Arial", 12), wraplength=500)
        status_label.pack(pady=20)

        # Variable for number of images to upload
        image_var = tk.IntVar(upload_window)
        image_var.set(MIN_IMAGES)

        # Label and combo box to select number of images
        tk.Label(upload_window, text=f"Select number of images ({MIN_IMAGES}-{MAX_IMAGES}):",
                 font=("Arial", 12)).pack(pady=10)
        image_combo = ttk.Combobox(upload_window, textvariable=image_var,
                                   values=list(range(MIN_IMAGES, MAX_IMAGES+1)), state="readonly")
        image_combo.pack(pady=5)

        # Button to start uploading the images
        start_btn = tk.Button(upload_window, text="Start Upload", font=("Arial", 12),
                              command=lambda: self.start_image_selection(upload_window, image_var.get()))
        start_btn.pack(pady=20)

        # Cancel button to close the upload window
        cancel_btn = tk.Button(upload_window, text="Cancel", font=("Arial", 12),
                               command=upload_window.destroy)
        cancel_btn.pack(pady=10)

    def start_image_selection(self, parent_window, num_images):
        """Reset the upload session and prompt for the first image.

        Args:
            parent_window: The upload window that owns the dialogs and buttons.
            num_images: Number of images the user wants to upload.
        """
        # Remove a "Process Images" button left over from a previous run in this
        # window; otherwise each restart would stack another button.
        if self._process_btn is not None:
            self._process_btn.destroy()
            self._process_btn = None

        self.num_images = num_images
        self.current_image = 1
        self.uploaded_images = []  # Clear any previously uploaded images
        self.status_var.set(f"Select image {self.current_image} of {self.num_images}")
        self.select_next_image(parent_window)

    def select_next_image(self, parent_window):
        """Ask for the next image file and advance the upload session.

        After the last image is chosen a "Process Images" button is shown. If
        the dialog is dismissed, the cancel handler decides what happens next.
        """
        file_path = filedialog.askopenfilename(
            parent=parent_window,  # Keep the dialog on top of the modal upload window
            title=f"Select image {self.current_image}/{self.num_images}",
            filetypes=[("Image Files", "*.png *.jpg *.jpeg")]
        )

        if not file_path:
            self._handle_selection_cancelled(parent_window)
            return

        # Append the selected image to the list
        self.uploaded_images.append(file_path)
        filename = os.path.basename(file_path)
        selected_line = f"Selected image {self.current_image}: {filename}\n"
        self.current_image += 1

        if self.current_image <= self.num_images:
            # More images to go: announce the next one and re-open the dialog shortly
            self.status_var.set(
                selected_line + f"Select image {self.current_image} of {self.num_images}"
            )
            parent_window.after(100, lambda: self.select_next_image(parent_window))
        else:
            # All images chosen: do not ask for a non-existent "image N+1"
            self.status_var.set(
                selected_line
                + f"All {self.num_images} images selected. Click \"Process Images\" to continue."
            )
            self._process_btn = tk.Button(parent_window, text="Process Images", font=("Arial", 12),
                                          command=lambda: self.process_and_show_result(parent_window))
            self._process_btn.pack(pady=20)

    def _handle_selection_cancelled(self, parent_window):
        """React to the file dialog being dismissed without a choice."""
        if self.current_image == 1:
            # Close the window if the first image is canceled
            parent_window.destroy()
            return

        self.status_var.set(
            f"Selection canceled for image {self.current_image}\n"
            f"Please select image {self.current_image} of {self.num_images}"
        )
        # Offer a way to continue; previously the status asked the user to
        # select the image but nothing re-opened the dialog.
        retry = messagebox.askretrycancel(
            "Selection canceled",
            f"No file was chosen for image {self.current_image} of {self.num_images}.\n"
            "Choose Retry to pick it now.",
            parent=parent_window
        )
        if retry:
            parent_window.after(100, lambda: self.select_next_image(parent_window))
        else:
            self.status_var.set(
                f"Selection canceled for image {self.current_image}\n"
                "Click \"Start Upload\" to start over or \"Cancel\" to close."
            )

    def process_and_show_result(self, parent_window):
        """Predict a sign for every uploaded image and show the matched sentence.

        Any exception raised while processing is reported in an error dialog
        and the upload window is left open.
        """
        # Check if enough images are uploaded
        if not self.uploaded_images or len(self.uploaded_images) < MIN_IMAGES or len(self.uploaded_images) > MAX_IMAGES:
            messagebox.showerror("Error",
                                 f"Please upload between {MIN_IMAGES} and {MAX_IMAGES} images.")
            return

        try:
            predicted_signs = []

            # Process each uploaded image
            for img_path in self.uploaded_images:
                # Get keypoints for this image
                keypoints_seq = process_single_image(img_path)

                # Predict sign for this image
                predicted_sign = predict_single_image(vit_model, labels_df, keypoints_seq)
                predicted_signs.append(predicted_sign)

            # Get the most similar sentence for the predicted words
            most_similar = get_most_similar_sentence(sentences, predicted_signs)

            # Close the upload window (its "Process Images" button goes with it)
            parent_window.destroy()
            self._process_btn = None

            # Show the result in a new popup window
            TranslationResultPopup(self, self.uploaded_images, predicted_signs, most_similar)

        except Exception as e:
            # Show error message if something goes wrong
            messagebox.showerror("Error", f"Failed to process images: {str(e)}")

# Main code to run the application
# Entry point: build the main window and hand control to Tk's event loop.
if __name__ == "__main__":
    app = SignLanguageApp()
    app.mainloop()  # Runs the Tk event loop for the main window

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 361ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
