# Static Model Test
This notebook tests the defined models using a gradio inteface where one can simply upload an image or video, choose a model and then get a classification of the perfromed exercise.

In [51]:
import gradio as gr
import os
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2

In [52]:
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

MODEL_PATH = "..\skeleton_lstm_multiclass3.h5"
# bench_press: 0, bulgarian_squat: 1, lat_machine: 2, pull_up: 3, push_up: 4, split_squat: 5
CLASSES = ["pull_up", "push_up", "split_squat"]

KEYPOINT_DIM = 132  # 33 landmarks with x,y,z,visibility

# ——— load trained model ———
model = tf.keras.models.load_model(MODEL_PATH)

  MODEL_PATH = "..\skeleton_lstm_multiclass3.h5"


In [53]:
# ——— init Mediapipe Pose & etc. ———
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
mp_styles = mp.solutions.drawing_styles

pose = mp_pose.Pose(
    static_image_mode=True,
    model_complexity=1,
    enable_segmentation=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

In [54]:
# ——— helper to extract keypoints array from Mediapipe results ———
def extract_keypoints_from_results(results):
    # build list in the same order as training: (lm0.x, lm0.y, lm0.z, lm0.v, lm1.x, …)
    kpts = []
    for lm in results.pose_landmarks.landmark:
        kpts.extend([lm.x, lm.y, lm.z, lm.visibility])
    return np.array(kpts, dtype=np.float32)  # shape = (132,)

In [55]:
# ORIGINAL
'''def process_video(video):
    # Initialize video capture
    cap = cv2.VideoCapture(video)
    sequence = []
    frame_count = 0
    max_frames = 30  # We want 30 frames for the sequence
    
    while cap.isOpened() and frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
            
        # Convert frame to RGB for MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Get pose landmarks
        results = pose.process(frame_rgb)
        
        if results.pose_landmarks:
            # Extract keypoints
            keypoints = extract_keypoints_from_results(results)
            sequence.append(keypoints)
            frame_count += 1
    
    cap.release()
    
    # If we don't have enough frames, pad the sequence
    if len(sequence) < max_frames:
        # Pad with the last frame's keypoints
        last_frame = sequence[-1] if sequence else np.zeros(KEYPOINT_DIM)
        while len(sequence) < max_frames:
            sequence.append(last_frame)
    
    # Convert sequence to numpy array and reshape for model input
    sequence = np.array(sequence)
    sequence = sequence.reshape(1, max_frames, KEYPOINT_DIM)
    
    # Get model predictions
    predictions = model.predict(sequence, verbose=0)[0]
    
    # Get top 3 predictions
    top_3_idx = np.argsort(predictions)[-3:][::-1]
    top_3_classes = [CLASSES[i] for i in top_3_idx]
    top_3_confidences = [float(predictions[i]) for i in top_3_idx]
    
    # Create prediction text
    prediction_text = "Top 3 Predictions:\n"
    for i in range(3):
        prediction_text += f"{i+1}. {top_3_classes[i]}: {top_3_confidences[i]:.2%}\n"
    
    # Create a visualization of the last processed frame
    if len(sequence) > 0:
        last_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Draw pose landmarks
        mp_drawing.draw_landmarks(
            last_frame,
            results.pose_landmarks,
            mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
        )
        
        # Add prediction text to frame
        y_position = 30
        for i in range(3):
            text = f"{top_3_classes[i]}: {top_3_confidences[i]:.2%}"
            cv2.putText(last_frame, text, (10, y_position), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            y_position += 40
    else:
        last_frame = np.zeros((480, 640, 3), dtype=np.uint8)
        cv2.putText(last_frame, "No pose detected in video", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    
    return prediction_text, last_frame'''

'def process_video(video):\n    # Initialize video capture\n    cap = cv2.VideoCapture(video)\n    sequence = []\n    frame_count = 0\n    max_frames = 30  # We want 30 frames for the sequence\n\n    while cap.isOpened() and frame_count < max_frames:\n        ret, frame = cap.read()\n        if not ret:\n            break\n\n        # Convert frame to RGB for MediaPipe\n        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n\n        # Get pose landmarks\n        results = pose.process(frame_rgb)\n\n        if results.pose_landmarks:\n            # Extract keypoints\n            keypoints = extract_keypoints_from_results(results)\n            sequence.append(keypoints)\n            frame_count += 1\n\n    cap.release()\n\n    # If we don\'t have enough frames, pad the sequence\n    if len(sequence) < max_frames:\n        # Pad with the last frame\'s keypoints\n        last_frame = sequence[-1] if sequence else np.zeros(KEYPOINT_DIM)\n        while len(sequence) < max_frames:\n    

In [56]:
# FLIPPED
def process_video_flipped(video):
    # Initialize video capture
    cap = cv2.VideoCapture(video)
    sequence_flipped = []  # Only store flipped keypoints
    frame_count = 0
    max_frames = 30
    
    while cap.isOpened() and frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
            
        # Convert frame to RGB for MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Get pose landmarks
        results = pose.process(frame_rgb)
        
        if results.pose_landmarks:
            # Extract keypoints
            keypoints = extract_keypoints_from_results(results)
            
            # Create flipped keypoints by negating x coordinates
            keypoints_flipped = keypoints.copy()
            for i in range(0, len(keypoints_flipped), 4):  # Step by 4 because each point has x,y,z,visibility
                keypoints_flipped[i] = 1-keypoints_flipped[i]  # Flip x coordinate
            
            sequence_flipped.append(keypoints_flipped)
            frame_count += 1
    
    cap.release()
    
    # Pad sequence if needed
    if len(sequence_flipped) < max_frames:
        last_frame = sequence_flipped[-1] if sequence_flipped else np.zeros(KEYPOINT_DIM)
        while len(sequence_flipped) < max_frames:
            sequence_flipped.append(last_frame)
    
    # Convert sequence to numpy array and reshape for model input
    sequence_flipped = np.array(sequence_flipped).reshape(1, max_frames, KEYPOINT_DIM)
    
    # Get model predictions
    predictions = model.predict(sequence_flipped, verbose=0)[0]
    
    # Get top 3 predictions
    top_3_idx = np.argsort(predictions)[-3:][::-1]
    top_3_classes = [CLASSES[i] for i in top_3_idx]
    top_3_confidences = [float(predictions[i]) for i in top_3_idx]
    
    # Create prediction text
    prediction_text = "Top 3 Predictions (Flipped):\n"
    for i in range(3):
        prediction_text += f"{i+1}. {top_3_classes[i]}: {top_3_confidences[i]:.2%}\n"
    
    # Create a visualization of the flipped pose
    if len(sequence_flipped) > 0:
        # Create a blank image
        last_frame = np.zeros((480, 640, 3), dtype=np.uint8)
        
        # Create a new NormalizedLandmarkList for the flipped pose
        flipped_landmarks = landmark_pb2.NormalizedLandmarkList()
        
        # Get the last keypoints from the sequence
        last_keypoints = sequence_flipped[0, -1]  # Shape: (132,)
        
        # Convert keypoints back to landmark format
        for i in range(0, len(last_keypoints), 4):
            landmark = flipped_landmarks.landmark.add()
            landmark.x = last_keypoints[i]
            landmark.y = last_keypoints[i + 1]
            landmark.z = last_keypoints[i + 2]
            landmark.visibility = last_keypoints[i + 3]
        
        # Draw flipped pose landmarks
        mp_drawing.draw_landmarks(
            last_frame,
            flipped_landmarks,
            mp_pose.POSE_CONNECTIONS
        )
        
        # Add prediction text to frame
        y_position = 30
        for i in range(3):
            text = f"{top_3_classes[i]}: {top_3_confidences[i]:.2%}"
            cv2.putText(last_frame, text, (10, y_position), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            y_position += 40
    else:
        last_frame = np.zeros((480, 640, 3), dtype=np.uint8)
        cv2.putText(last_frame, "No pose detected in video", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    
    return prediction_text, last_frame

# Create Gradio interface for flipped version
demo_flipped = gr.Interface(
    fn=process_video_flipped,
    inputs=gr.Video(),
    outputs=[
        gr.Textbox(label="Predictions (Flipped)", lines=4),
        gr.Image(label="Flipped Pose Visualization")
    ],
    title="Exercise Classification (Flipped)",
    description="Upload a video to classify the exercise using flipped pose keypoints."
)

# Launch the interface
demo_flipped.launch()

* Running on local URL:  http://127.0.0.1:7884
* To create a public link, set `share=True` in `launch()`.




In [57]:
# FLIPPED + ORIGINAL
def process_video(video):
    # Initialize video capture
    cap = cv2.VideoCapture(video)
    sequence = []
    sequence_flipped = []  # New sequence for flipped keypoints
    frame_count = 0
    max_frames = 30
    
    while cap.isOpened() and frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
            
        # Convert frame to RGB for MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Get pose landmarks
        results = pose.process(frame_rgb)
        
        if results.pose_landmarks:
            # Extract keypoints
            keypoints = extract_keypoints_from_results(results)
            
            # Create flipped keypoints by negating x coordinates
            keypoints_flipped = keypoints.copy()
            for i in range(0, len(keypoints_flipped), 4):  # Step by 4 because each point has x,y,z,visibility
                keypoints_flipped[i] = -keypoints_flipped[i]  # Flip x coordinate
            
            sequence.append(keypoints)
            sequence_flipped.append(keypoints_flipped)
            frame_count += 1
    
    cap.release()
    
    # Pad sequences if needed
    if len(sequence) < max_frames:
        last_frame = sequence[-1] if sequence else np.zeros(KEYPOINT_DIM)
        last_frame_flipped = sequence_flipped[-1] if sequence_flipped else np.zeros(KEYPOINT_DIM)
        while len(sequence) < max_frames:
            sequence.append(last_frame)
            sequence_flipped.append(last_frame_flipped)
    
    # Convert sequences to numpy arrays and reshape for model input
    sequence = np.array(sequence).reshape(1, max_frames, KEYPOINT_DIM)
    sequence_flipped = np.array(sequence_flipped).reshape(1, max_frames, KEYPOINT_DIM)
    
    # Get model predictions for both sequences
    predictions = model.predict(sequence, verbose=0)[0]
    predictions_flipped = model.predict(sequence_flipped, verbose=0)[0]
    
    # Combine predictions (take maximum confidence for each class)
    combined_predictions = np.maximum(predictions, predictions_flipped)
    
    # Get top 3 predictions
    top_3_idx = np.argsort(combined_predictions)[-3:][::-1]
    top_3_classes = [CLASSES[i] for i in top_3_idx]
    top_3_confidences = [float(combined_predictions[i]) for i in top_3_idx]
    
    # Create prediction text
    prediction_text = "Top 3 Predictions:\n"
    for i in range(3):
        prediction_text += f"{i+1}. {top_3_classes[i]}: {top_3_confidences[i]:.2%}\n"
    
    # Create a visualization of the last processed frame
    if len(sequence) > 0:
        last_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Draw pose landmarks
        mp_drawing.draw_landmarks(
            last_frame,
            results.pose_landmarks,
            mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
        )
        
        # Add prediction text to frame
        y_position = 30
        for i in range(3):
            text = f"{top_3_classes[i]}: {top_3_confidences[i]:.2%}"
            cv2.putText(last_frame, text, (10, y_position), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            y_position += 40
    else:
        last_frame = np.zeros((480, 640, 3), dtype=np.uint8)
        cv2.putText(last_frame, "No pose detected in video", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    
    return prediction_text, last_frame

In [58]:
# Create Gradio interface
demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),  # Changed from Image to Video
    outputs=[
        gr.Textbox(label="Predictions", lines=4),
        gr.Image(label="Last Frame with Pose")
    ],
    title="Exercise Classification (Video)",
    description="Upload a video to classify the exercise being performed."
)

# Launch the interface
demo.launch()

* Running on local URL:  http://127.0.0.1:7885
* To create a public link, set `share=True` in `launch()`.


