dependencies

In [None]:
# Core ML libraries
pip install tensorflow==2.14.0        # Latest stable TensorFlow (CPU + GPU support)
pip install keras                     # Optional, if you want standalone Keras

# Computer Vision
pip install opencv-python             # OpenCV for image/video processing
pip install mediapipe                 # MediaPipe for pose/hand/face landmarks

# Data science utilities
pip install scikit-learn matplotlib numpy pandas

# Optional for GPU acceleration
pip install nvidia-pip               # If you need CUDA/cuDNN support, check your GPU drivers


1. Import and Install Dependencies

In [59]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [60]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [61]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [71]:
def draw_landmarks(image, results):
    # Draw face mesh (use tessellation for full mesh OR contours for outline only)
    mp_drawing.draw_landmarks(
        image,
        results.face_landmarks,
        mp.solutions.face_mesh.FACEMESH_TESSELATION  # ✅ fixed
    )
    
    # Draw pose connections
    mp_drawing.draw_landmarks(
        image,
        results.pose_landmarks,
        mp_holistic.POSE_CONNECTIONS
    )
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(
        image,
        results.left_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS
    )
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(
        image,
        results.right_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS
    )


In [72]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(
        image, results.face_landmarks,
        mp.solutions.face_mesh.FACEMESH_TESSELATION,  # ✅ correct
        mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
        mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
    )
    
    # Draw pose connections
    mp_drawing.draw_landmarks(
        image, results.pose_landmarks,
        mp_holistic.POSE_CONNECTIONS,  # ✅ this exists
        mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
        mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
    )
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(
        image, results.left_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS,
        mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
        mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
    )
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(
        image, results.right_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS,
        mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
        mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
    )


In [73]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>


In [74]:
draw_landmarks(frame, results)

In [76]:
#plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

In [78]:
pose = []
for res in results.pose_landmarks.landmark:
    test = np.array([res.x, res.y, res.z, res.visibility])
    pose.append(test)

In [79]:
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

In [81]:
face = (
    np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten()
    if results.face_landmarks 
    else np.zeros(1404)
)


In [82]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [83]:
result_test = extract_keypoints(results)

In [84]:
result_test

array([ 0.53648537,  0.63047439, -0.99100876, ...,  0.        ,
        0.        ,  0.        ])

In [85]:
np.save('0', result_test)
np.load('0.npy')

array([ 0.53648537,  0.63047439, -0.99100876, ...,  0.        ,
        0.        ,  0.        ])

In [86]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data') 

# Actions that we try to detect
actions = np.array(['hello', 'thanks', 'iloveyou'])

# Thirty videos worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
start_folder = 30

In [88]:
import os
import numpy as np

# Path for exported data
DATA_PATH = os.path.join('MP_DATA') 

# Actions you want to detect
actions = np.array(['hello', 'thank you', 'okay', 'emergency', 'victory'])

# Number of sequences (videos per action)
no_sequences = 10

# Number of frames per sequence
sequence_length = 20

# Create main folder if not exists
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

# Create a folder for each action and each sequence
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

print("✅ Folder structure created successfully!")


✅ Folder structure created successfully!


In [None]:
cap = cv2.VideoCapture(0)

# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, 
                          min_tracking_confidence=0.5) as holistic:
    
    # Loop through actions
    for action in actions:
        # Loop through sequences (videos)
        for sequence in range(start_folder, start_folder + no_sequences):
            # Loop through frames per sequence
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks
                draw_styled_landmarks(image, results)
                
                # Show messages
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120, 200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, f'Collecting frames for {action} Video Number {sequence}', 
                               (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 
                               (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(500)  # wait 0.5 sec before starting
                else: 
                    cv2.putText(image, f'Collecting frames for {action} Video Number {sequence}', 
                               (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 
                               (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)
                
                # Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Break gracefully
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
cap.release()
cv2.destroyAllWindows()


In [None]:
cap.release()
cv2.destroyAllWindows()

In [None]:
y = to_categorical(labels).astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
y_test.shape

train model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])
model.summary()

_________________________________________________________

8. Make Predictions

In [None]:
res = model.predict(X_test)
actions[np.argmax(res[4])]
#'hello'
actions[np.argmax(y_test[4])]
#'hello'

SAVE WEIGHT

In [None]:
model.save('action.h5')

model.load_weights('action.h5')

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
yhat = model.predict(X_test)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
accuracy_score(ytrue, yhat)

In [None]:
from scipy import stats

In [None]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame
plt.figure(figsize=(18,18))
plt.imshow(prob_viz(res, actions, image, colors))

In [None]:
import cv2
import numpy as np
import mediapipe as mp

# -------------------------------
# 1. New detection variables
# -------------------------------
sequence = []       # Stores keypoints for 30 frames
sentence = []       # Stores recognized actions
predictions = []    # Stores prediction indices
threshold = 0.5     # Confidence threshold for action recognition

# -------------------------------
# 2. Start video capture
# -------------------------------
cap = cv2.VideoCapture(0)

# Initialize Mediapipe Holistic model
with mp.solutions.holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as holistic:

    while cap.isOpened():
        # Read frame from camera
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)  # Debug: print results to see landmarks

        # Draw landmarks on frame
        draw_styled_landmarks(image, results)

        # -------------------------------
        # 3. Prediction logic
        # -------------------------------
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]  # Keep only last 30 frames

        if len(sequence) == 30:
            # Make prediction on sequence
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            action_index = np.argmax(res)
            print(actions[action_index])
            predictions.append(action_index)

            # -------------------------------
            # 4. Sentence logic
            # -------------------------------
            if np.unique(predictions[-10:])[0] == action_index:
                if res[action_index] > threshold:
                    if len(sentence) > 0:
                        if actions[action_index] != sentence[-1]:
                            sentence.append(actions[action_index])
                    else:
                        sentence.append(actions[action_index])

            # Keep last 5 actions only
            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Visualize probabilities
            image = prob_viz(res, actions, image, colors)

        # -------------------------------
        # 5. Display sentence on screen
        # -------------------------------
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Show the frame
        cv2.imshow('OpenCV Feed', image)

        # Break loop on 'q' key
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release resources
cap.release()
cv2.destroyAllWindows()
