In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import cv2
import os
import mediapipe as mp
import pickle

from collections import deque, Counter
from tensorflow.keras.models import load_model

2022-04-13 16:40:42.153765: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Metrics

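Source: https://www.mdpi.com/2414-4088/5/9/55, Section 4.3.2 (Repetition Counting):  
*OBO is above 99%, denoting that almost all the test samples are within +-1 of groundtruth.*

The `OBO` function below scores a single video: 1 if the predicted count is within ±1 of the ground truth, otherwise 0.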

In [2]:
def OBO(y_pred: int, y_true: int) -> int:
    """Score one prediction against its ground-truth repetition count.

    Args:
        y_pred: Predicted number of repetitions.
        y_true: Ground-truth number of repetitions.

    Returns:
        1 when ``y_pred`` lies in ``[y_true - 1, y_true + 1]``, otherwise 0.
    """
    lowest_accepted = y_true - 1
    highest_accepted = y_true + 1
    return int(lowest_accepted <= y_pred <= highest_accepted)

In [3]:
# Sanity check: a prediction one below the ground truth is still scored as 1.
OBO(9,10)

1

## Custom Objects for ViViT

In [33]:
from tensorflow.keras import layers

In [34]:
class PositionalEncoder(layers.Layer):
    """Adds a position embedding to every token of an encoded sequence.

    The embedding table is created in ``build`` with one row per token
    position and ``embed_dim`` columns; ``call`` looks up all positions and
    adds the result to the incoming tokens.

    Args:
        embed_dim: Output dimension of the position embedding.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        # The input shape must unpack into exactly three entries; the middle
        # one is the number of tokens.
        _, token_count, _ = input_shape
        self.position_embedding = layers.Embedding(
            input_dim=token_count,
            output_dim=self.embed_dim,
        )
        self.positions = tf.range(start=0, limit=token_count, delta=1)

    def call(self, encoded_tokens):
        # Look up the embedding of every position and add it to the tokens.
        position_vectors = self.position_embedding(self.positions)
        return encoded_tokens + position_vectors

    def get_config(self):
        """Return the base layer config extended with ``embed_dim``."""
        base_config = super().get_config()
        return {**base_config, 'embed_dim': self.embed_dim}

In [35]:
class TubeletEmbedding(layers.Layer):
    """Turns a video into a flat sequence of patch embeddings.

    A ``Conv3D`` whose kernel size and strides both equal ``patch_size``
    projects the input to ``embed_dim`` filters; the result is then reshaped
    to ``(-1, embed_dim)`` per sample.

    Args:
        embed_dim: Number of Conv3D filters, i.e. the size of each embedding.
        patch_size: Kernel size and stride passed to the Conv3D projection.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.patch_size = patch_size
        # Sub-layers are created in the same order as before (projection,
        # then flatten).
        conv_options = dict(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding='VALID',
        )
        self.projection = layers.Conv3D(**conv_options)
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        # Project first, then collapse the patch grid into a token axis.
        return self.flatten(self.projection(videos))

    def get_config(self):
        """Return the base layer config extended with this layer's arguments."""
        return {
            **super().get_config(),
            'embed_dim': self.embed_dim,
            'patch_size': self.patch_size,
        }

## Config

In [70]:
# For each exercise, the class transition that is counted as one repetition:
# a frame classified as 'cur_class' directly after one classified as
# 'prev_class'.
COUNTING_ORDER = {
    exercise: {'cur_class': cur_class, 'prev_class': prev_class}
    for exercise, cur_class, prev_class in (
        ('pushup', 'pushup-up', 'pushup-down'),
        ('squat', 'squat-up', 'squat-down'),
        ('jumping-jack', 'jumping-jack-down', 'jumping-jack-up'),
        ('leg-raise', 'leg-raise-down', 'leg-raise-up'),
        ('half-burpee', 'half-burpee-out', 'half-burpee-in'),
    )
}

In [113]:
# --- Model selection and post-processing ------------------------------------
MODEL_TYPE = 'image_single'
POST = 'hard_vote'
WINDOW_SIZE = 15  # window size for hard voting
SEQ_LEN = 8  # number of consecutive frames buffered for sequence models
BEST = False  # True -> load from final_models/, False -> per-round saved_models/
# Round number used to build the saved-model path of each exercise.
ROUND_MAPPING = {'pushup': 6, 'squat': 4, 'jumping-jack': 3, 'leg-raise': 2, 'half-burpee': 1}

# --- Evaluation dataset layout ------------------------------------------------
DS_BASE_DIR = 'evaluation_videos/EzFit_dataset'
GT_CSV = os.path.join(DS_BASE_DIR, 'labels/label_v1.csv')
VID_DIR = os.path.join(DS_BASE_DIR, 'videos')
OUT_DIR = os.path.join(DS_BASE_DIR, 'outputs')

# Only the ViViT model needs the custom layers to be registered when loading.
custom_objects = (
    {"TubeletEmbedding": TubeletEmbedding, "PositionalEncoder": PositionalEncoder}
    if MODEL_TYPE == 'ViViT'
    else {}
)

## Run inference

In [114]:
# Short aliases for the MediaPipe solution modules.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
mp_pose = mp.solutions.pose  # pose estimator used by the keypoint-based models

In [115]:
# Load the ground-truth table: one row per evaluation video with its
# file name, repetition count and exercise.
df = pd.read_csv(GT_CSV)
df

Unnamed: 0,file_name,rep,exercise
0,half-burpee_1_1.mp4,11,half-burpee
1,half-burpee_1_2.mp4,9,half-burpee
2,half-burpee_2_1.mp4,3,half-burpee
3,half-burpee_2_2.mp4,4,half-burpee
4,half-burpee_2_3.mp4,3,half-burpee
...,...,...,...
63,squat_3_1.mp4,8,squat
64,squat_3_2.mp4,6,squat
65,squat_3_3.mp4,7,squat
66,squat_3_4.mp4,5,squat


In [116]:
def _artifact_paths(exercise):
    """Return ``(model_path, le_path)`` for ``exercise`` under the current config.

    With ``BEST`` set, the files come from ``final_models/``; otherwise from the
    per-round ``saved_models/`` and ``saved_pickles/`` folders selected through
    ``ROUND_MAPPING``.
    """
    if BEST:
        return (
            f'final_models/h5/{exercise}_{MODEL_TYPE}.h5',
            f'final_models/le/{exercise}_{MODEL_TYPE}_le.pickle',
        )
    round_dir = f'round_{ROUND_MAPPING[exercise]}/{exercise}'
    # kps_single pickles use the "_lb" suffix, every other model type "_le".
    suffix = 'lb' if MODEL_TYPE == 'kps_single' else 'le'
    return (
        f'saved_models/{round_dir}/{exercise}_{MODEL_TYPE}.h5',
        f'saved_pickles/{round_dir}/{exercise}_{MODEL_TYPE}_{suffix}.pickle',
    )


def _predict_class(model, label_encoder, model_input, preds_window):
    """Run the model on one input and return the predicted class name.

    Args:
        model: Loaded Keras model.
        label_encoder: Object whose ``classes_`` maps class index -> class name.
        model_input: Single (un-batched) model input; a batch axis is added here.
        preds_window: Deque of recent class indices used for hard voting, or
            ``None`` to return the raw per-frame prediction.
    """
    y_hat = model.predict(tf.expand_dims(model_input, axis=0))[0]
    y_idx = np.argmax(y_hat)
    if preds_window is not None:
        # Hard voting: the most frequent index among the last WINDOW_SIZE
        # predictions wins. most_common returns e.g. [(0, 5)].
        preds_window.append(y_idx)
        y_idx = Counter(preds_window).most_common(1)[0][0]
    return label_encoder.classes_[y_idx]


def _classify_keypoint_frame(frame, pose, kps_seq, model, label_encoder, preds_window):
    """Classify one frame with a keypoint-based model.

    Returns:
        The class name, the string ``"None"`` while the sequence buffer is
        still filling, or ``None`` (the object) when no pose was detected and
        the frame must be skipped.
    """
    results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    if results.pose_landmarks is None:
        return None
    pose_row = np.array(
        [[landmark.x, landmark.y, landmark.visibility]
         for landmark in results.pose_landmarks.landmark]
    ).flatten()

    if MODEL_TYPE == 'kps_single':
        return _predict_class(model, label_encoder, pose_row, preds_window)

    # kps_stacked and kps_seq need SEQ_LEN consecutive keypoint rows.
    kps_seq.append(pose_row)
    if len(kps_seq) < SEQ_LEN:
        return "None"
    if MODEL_TYPE == 'kps_stacked':
        seq_arr = np.array(kps_seq).flatten()
    elif MODEL_TYPE == 'kps_seq':
        seq_arr = np.array(kps_seq)
    else:
        raise ValueError(f'unsupported keypoint MODEL_TYPE: {MODEL_TYPE!r}')
    return _predict_class(model, label_encoder, seq_arr, preds_window)


def _classify_image_frame(frame, vid_seq, model, label_encoder, preds_window):
    """Classify one frame with an image- or video-based model.

    Returns the class name, or the string ``"None"`` while the frame buffer of
    a sequence model is still filling.
    """
    image_input = cv2.resize(frame, (120, 120))
    if MODEL_TYPE in ['image_single', 'Swin']:
        # NOTE(review): single-frame models receive the resized frame as read
        # by OpenCV (no /255 scaling, no BGR->RGB conversion) while sequence
        # models below are scaled by 1/255 -- confirm both match the
        # preprocessing used at training time.
        return _predict_class(model, label_encoder, image_input, preds_window)

    # video_sequence, ViViT
    vid_seq.append(image_input)
    if len(vid_seq) < SEQ_LEN:
        return "None"
    input_seq = np.array(vid_seq) / 255.0
    return _predict_class(model, label_encoder, input_seq, preds_window)


def _count_transitions(cap, exercise, classify_frame):
    """Read ``cap`` to the end and count prev_class -> cur_class transitions.

    ``classify_frame(frame)`` returns the stage of a frame, or ``None`` to skip
    the frame without touching the counting state.
    """
    order = COUNTING_ORDER[exercise]
    previous_stage = None
    rep_count = 0
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break
        current_stage = classify_frame(frame)
        if current_stage is None:
            continue
        if current_stage == order['cur_class'] and previous_stage == order['prev_class']:
            rep_count += 1
        previous_stage = current_stage
    return rep_count


def _count_reps(video_path, exercise, model, label_encoder):
    """Count the repetitions of ``exercise`` in the video at ``video_path``.

    The capture is always released, whichever model family is used and even
    if inference raises.
    """
    frame_seq = deque(maxlen=SEQ_LEN)
    preds_window = deque(maxlen=WINDOW_SIZE) if POST == 'hard_vote' else None
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Without this message an unreadable file is indistinguishable
            # from a video in which no repetition was detected.
            print(f'[WARN] could not open video: {video_path}')

        if 'kps' not in MODEL_TYPE:  # video_sequence, ViViT, image_single, Swin
            return _count_transitions(
                cap,
                exercise,
                lambda frame: _classify_image_frame(
                    frame, frame_seq, model, label_encoder, preds_window
                ),
            )

        # kps-based models
        with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
            def classify_keypoints(frame):
                # Best effort: a failure on one frame is reported and the
                # frame is skipped instead of aborting the whole evaluation.
                try:
                    return _classify_keypoint_frame(
                        frame, pose, frame_seq, model, label_encoder, preds_window
                    )
                except Exception as e:
                    print(f'[INFO] error when {exercise}:', e)
                    return None

            return _count_transitions(cap, exercise, classify_keypoints)
    finally:
        cap.release()


cur_ex = None
prev_ex = None
all_obos = []
all_reps = []
for idx, row in df.iterrows():
    # Access by column label; integer keys on a label-indexed Series rely on a
    # deprecated positional fallback.
    file_name = row['file_name']
    rep_gt = row['rep']
    exercise = row['exercise']
    cur_ex = exercise

    # if exercise changes, reload model and le for the corresponding exercise
    if cur_ex != prev_ex:
        model_path, le_path = _artifact_paths(exercise)
        loaded_model = load_model(model_path, custom_objects=custom_objects)
        # NOTE: pickle.load can execute arbitrary code; only load pickles
        # produced by this project's own training runs.
        with open(le_path, 'rb') as file:
            loaded_le = pickle.load(file)

    rep_count = _count_reps(
        os.path.join(VID_DIR, exercise, file_name), exercise, loaded_model, loaded_le
    )

    print(f'{file_name}: {rep_count} from {rep_gt}')
    obo_score = OBO(rep_count, rep_gt)
    all_reps.append(rep_count)
    all_obos.append(obo_score)
    prev_ex = cur_ex

df['pred'] = all_reps
df['obo'] = all_obos
# basename/normpath works whichever separator DS_BASE_DIR was written with.
ds_name = os.path.basename(os.path.normpath(DS_BASE_DIR))
os.makedirs(OUT_DIR, exist_ok=True)
df.to_csv(os.path.join(OUT_DIR, f'{MODEL_TYPE}_{POST}_{ds_name}.csv'), index=False)

half-burpee_1_1.mp4: 0 from 11
half-burpee_1_2.mp4: 0 from 9
half-burpee_2_1.mp4: 0 from 3
half-burpee_2_2.mp4: 0 from 4
half-burpee_2_3.mp4: 0 from 3
half-burpee_2_4.mp4: 0 from 3
half-burpee_2_5.mp4: 0 from 4
half-burpee_2_6.mp4: 0 from 2
half-burpee_3_1.mp4: 0 from 4
half-burpee_3_2.mp4: 0 from 4
half-burpee_3_3.mp4: 0 from 4
half-burpee_3_4.mp4: 0 from 3
half-burpee_3_5.mp4: 0 from 4
jumping-jack_1_1.mp4: 0 from 11
jumping-jack_1_2.mp4: 0 from 11
jumping-jack_1_3.mp4: 0 from 12
jumping-jack_2_1.mp4: 0 from 10
jumping-jack_2_2.mp4: 0 from 10
jumping-jack_2_3.mp4: 0 from 7
jumping-jack_2_4.mp4: 0 from 9
jumping-jack_3_1.mp4: 0 from 8
jumping-jack_3_2.mp4: 0 from 7
jumping-jack_3_3.mp4: 0 from 7
jumping-jack_3_4.mp4: 0 from 7
jumping-jack_3_5.mp4: 0 from 7
jumping-jack_3_6.mp4: 0 from 4
jumping-jack_3_7.mp4: 0 from 7
jumping-jack_extra_1.mp4: 0 from 9
jumping-jack_extra_2.mp4: 0 from 6
leg-raise_1_1.mp4: 0 from 9
leg-raise_1_2.mp4: 0 from 8
leg-raise_2_1.mp4: 0 from 4
leg-raise_2_2.mp

In [117]:
# Remove the evaluation columns so `df` holds only the ground-truth columns again.
# Rebinding with errors='ignore' keeps this cell safe to re-run when the
# columns are already gone (the in-place drop raised KeyError in that case).
df = df.drop(columns=['pred', 'obo'], errors='ignore')

In [118]:
# Display the table to confirm the 'pred' and 'obo' columns were removed.
df

Unnamed: 0,file_name,rep,exercise
0,half-burpee_1_1.mp4,11,half-burpee
1,half-burpee_1_2.mp4,9,half-burpee
2,half-burpee_2_1.mp4,3,half-burpee
3,half-burpee_2_2.mp4,4,half-burpee
4,half-burpee_2_3.mp4,3,half-burpee
...,...,...,...
63,squat_3_1.mp4,8,squat
64,squat_3_2.mp4,6,squat
65,squat_3_3.mp4,7,squat
66,squat_3_4.mp4,5,squat


In [9]:
## NOTE
# All front-view squat videos fail (some pushup videos too).

# Samples with half reps or otherwise worse exercise quality:
# pushup-6 | pushup-13 | 

# Jumping-jack fails on multi-person videos and on completely side-view videos.