# In [None]:

import os
import time
import numpy as np
import cv2 as cv
import mediapipe as mp
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from sklearn.metrics import confusion_matrix, classification_report

# Constants
# Root folder that holds one sub-folder of images per gesture class.
INPUT_PATH = 'Inputs/'

# Gesture class names. A name's position in this list is the numeric label
# used for training and the index looked up when decoding a prediction.
INPUT_TYPES = [
    'palm',
    'fist',
    'thumbsup',
    'gun',
    'call',
]

# NOTE(review): DELAY and MIN_INPUT_COUNT are not referenced by the code
# visible in this file; presumably they belong to the data-collection
# step -- confirm before changing them.
DELAY = 50
MIN_INPUT_COUNT = DELAY + 2000

# Make directories
def make_dirs():
    """Create INPUT_PATH and one sub-directory per entry of INPUT_TYPES.

    Directories that already exist are left untouched. The process working
    directory is never changed, so a failure part-way through cannot leave
    the caller stranded inside INPUT_PATH.

    Raises:
        OSError: if a directory cannot be created (for example because a
            regular file with the same name already exists).
    """
    # 0o777 is the same permission mask as the decimal 511 used previously;
    # the effective permissions are still subject to the process umask.
    os.makedirs(INPUT_PATH, mode=0o777, exist_ok=True)
    for input_type in INPUT_TYPES:
        os.makedirs(os.path.join(INPUT_PATH, input_type), mode=0o777, exist_ok=True)

# Load data
def load_data():
    """Load every ``.jpg`` image under INPUT_PATH, grouped by gesture class.

    Each entry of INPUT_TYPES names a sub-directory of INPUT_PATH; its
    ``.jpg`` files are read with ``plt.imread`` in sorted file-name order.

    Returns:
        An array whose first axis follows the order of INPUT_TYPES. When all
        classes hold the same number of equally shaped images this is a
        regular numeric array; otherwise it is a 1-D object array holding one
        list of images per class. Both forms can be iterated class by class
        and then image by image.

    Raises:
        OSError: if a class directory is missing or unreadable.
    """
    data = []
    for input_type in INPUT_TYPES:
        class_dir = os.path.join(INPUT_PATH, input_type)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order reproducible between runs and machines.
        images = [
            plt.imread(os.path.join(class_dir, name))
            for name in sorted(os.listdir(class_dir))
            if name.endswith('.jpg')
        ]
        data.append(images)

    try:
        return np.array(data)
    except ValueError:
        # Recent NumPy versions refuse to build an array from ragged input
        # (classes with different image counts or image shapes). Fall back
        # to an object array with one list of images per class.
        per_class = np.empty(len(data), dtype=object)
        for index, images in enumerate(data):
            per_class[index] = images
        return per_class

# Preprocess data
def preprocess_data(data):
    """Flatten class-grouped images into a feature array and a label array.

    Args:
        data: iterable of classes, each class being an iterable of images.
            The position of a class in ``data`` becomes the label of all of
            its images.

    Returns:
        A ``(X, Y)`` tuple: ``X`` is the array of all images divided by
        255.0, ``Y`` is the array of the matching class indices.
    """
    samples, labels = [], []
    for label, group in enumerate(data):
        already_collected = len(samples)
        samples.extend(group)
        # One label per image that this class just contributed.
        labels.extend([label] * (len(samples) - already_collected))

    features = np.array(samples) / 255.0
    targets = np.array(labels)
    return features, targets

# Train CNN model
def train_model(X_train, Y_train):
    """Build, compile and fit the gesture-classification CNN.

    The network stacks two Conv2D(3x3, relu) -> MaxPooling2D(2x2) ->
    Dropout(0.2) stages (32 then 64 filters), followed by Flatten,
    Dense(512, relu), Dropout(0.2) and a softmax layer with one unit per
    entry of INPUT_TYPES. It is compiled with the Adam optimizer and sparse
    categorical cross-entropy, trained for 10 epochs, and the elapsed fit
    time is printed.

    Args:
        X_train: training images; every axis after the first one defines the
            input shape of the first layer.
        Y_train: training labels handed to ``fit`` (the sparse loss expects
            integer class indices).

    Returns:
        The fitted Keras model.
    """
    # Settings shared by both convolution / pooling stages.
    conv_options = dict(kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')
    pool_options = dict(pool_size=(2, 2), strides=(2, 2), padding='same')

    network = models.Sequential()

    # Stage 1: 32 filters.
    network.add(layers.Conv2D(input_shape=X_train.shape[1:], filters=32, **conv_options))
    network.add(layers.MaxPooling2D(**pool_options))
    network.add(layers.Dropout(0.2))

    # Stage 2: 64 filters.
    network.add(layers.Conv2D(filters=64, **conv_options))
    network.add(layers.MaxPooling2D(**pool_options))
    network.add(layers.Dropout(0.2))

    # Classification head.
    network.add(layers.Flatten())
    network.add(layers.Dense(units=512, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(units=len(INPUT_TYPES), activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    fit_started = time.time()
    network.fit(X_train, Y_train, epochs=10)
    elapsed = time.time() - fit_started
    print("total time in seconds", elapsed)
    return network

# Predict function
def predict(cnn, img):
    """Return the INPUT_TYPES name the model scores highest for ``img``.

    Args:
        cnn: object exposing ``predict(img)`` that returns class scores.
        img: model input. The arg-max is taken over the whole (flattened)
            score output, so this is expected to hold a single sample.

    Returns:
        The entry of INPUT_TYPES at the arg-max position.
    """
    scores = cnn.predict(img)
    best_index = np.argmax(scores)
    return INPUT_TYPES[best_index]

# Initialize Mediapipe
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()

# Capture from webcam
webcam = 0  # index of the capture device handed to OpenCV
capture = cv.VideoCapture(webcam)
if not capture.isOpened():
    # Fail early with a clear message instead of crashing later on a
    # missing frame.
    raise RuntimeError("could not open webcam " + str(webcam))
fps = int(capture.get(cv.CAP_PROP_FPS))
print("fps is " + str(fps))

# Grab one frame up front to learn the frame dimensions.
grabbed, frame = capture.read()
if not grabbed or frame is None:
    # Do not keep the device locked when the script cannot continue.
    capture.release()
    raise RuntimeError("could not read a frame from webcam " + str(webcam))
height, width, _ = frame.shape

# Real-time prediction
# Shows a mirrored webcam preview ('frame1'). Whenever a right hand is
# detected, its landmarks are drawn on a black canvas, cropped, resized to
# 96x96, classified with predict(), and shown in 'Frame2'. Enter ends the loop.
#
# NOTE(review): `cnn` is not assigned anywhere in the visible code; it must
# hold a trained model (e.g. the return value of train_model) before this
# loop runs. Until then every detection is reported once as a NameError and
# only the plain preview is shown.
_ENTER_KEY = 13     # key code that ends the loop
_BOX_MARGIN = 25    # padding (pixels) of the rectangle drawn around the hand
_CROP_MARGIN = 23   # padding (pixels) of the crop that is fed to the model
_last_error = None  # last per-frame failure reported, to avoid repeating it

try:
    while capture.isOpened():
        if cv.waitKey(1) & 0xFF == _ENTER_KEY:
            break

        grabbed, frame = capture.read()
        if not grabbed or frame is None:
            # The camera stopped delivering frames; nothing left to show.
            break
        height, width = frame.shape[:2]

        # Shown unchanged unless a hand is found and annotated below.
        result1 = frame
        label = None

        frame_rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        result = holistic.process(frame_rgb)
        hand = result.right_hand_landmarks

        if hand is not None and hand.landmark:
            try:
                # Pixel bounding box of the hand, seeded like the original
                # search (min never above the frame size, max never below 0).
                xs = [int(lm.x * width) for lm in hand.landmark]
                ys = [int(lm.y * height) for lm in hand.landmark]
                x_min, x_max = min(min(xs), width), max(max(xs), 0)
                y_min, y_max = min(min(ys), height), max(max(ys), 0)

                annotated = frame.copy()
                mp_drawing.draw_landmarks(annotated, hand, mp_holistic.HAND_CONNECTIONS)
                cv.rectangle(
                    annotated,
                    (x_min - _BOX_MARGIN, y_min - _BOX_MARGIN),
                    (x_max + _BOX_MARGIN, y_max + _BOX_MARGIN),
                    (0, 255, 0),
                    2,
                )

                # Landmarks alone on a black canvas are the model input.
                black = np.zeros(shape=frame.shape)
                mp_drawing.draw_landmarks(black, hand, mp_holistic.HAND_CONNECTIONS)

                # Clamp the crop to the frame: a negative slice start would
                # wrap around and silently yield an empty crop.
                top = max(y_min - _CROP_MARGIN, 0)
                left = max(x_min - _CROP_MARGIN, 0)
                bottom = min(y_max + _CROP_MARGIN, height)
                right = min(x_max + _CROP_MARGIN, width)
                cropped = black[top:bottom, left:right]

                if cropped.size:
                    resized = cv.resize(cropped, (96, 96))
                    result2 = cv.flip(resized, 1)
                    # Scale to [0, 1] exactly like preprocess_data does for
                    # the training images.
                    img_mat = np.array([result2]) / 255.0
                    label = predict(cnn, img_mat)
                    cv.imshow("Frame2", result2)

                result1 = annotated
            except Exception as exc:
                # Keep the preview alive on a per-frame failure, but make
                # the failure visible instead of silently swallowing it.
                result1 = frame
                label = None
                message = repr(exc)
                if message != _last_error:
                    print("hand processing failed: " + message)
                    _last_error = message

        mirror1 = cv.flip(result1, 1)
        if label is not None:
            # Draw the text after mirroring so that it stays readable.
            cv.putText(mirror1, str(label), (100, 100), cv.FONT_HERSHEY_PLAIN, 2, (255, 0, 0), 1)
        cv.imshow('frame1', mirror1)
finally:
    # Always free the camera and close the windows, even on an error or
    # a keyboard interrupt.
    capture.release()
    cv.destroyAllWindows()
