# In [2]:
# Convert to input format for MMAction2
import glob
import os
import pickle
import re
from collections import defaultdict

import numpy as np
from sklearn.model_selection import train_test_split

# Movement code (as it appears in the data file names) -> exercise name.
exercise_dict = dict(
    m01='deep squat',
    m02='hurdle step',
    m03='inline lunge',
    m04='side lunge',
    m05='sit to stand',
    m06='standing active leg raise',
    m07='standing shoulder abduction',
    m08='standing shoulder extension',
    m09='standing shoulder internal-external rotation',
    m10='standing shoulder scaption',
)

# Movement code -> consecutive class index (m01 -> 0, ..., m10 -> 9),
# following the insertion order of exercise_dict.
exercise_number_dict = {
    code: position for position, code in enumerate(exercise_dict)
}

# Movement code -> even class index (m01 -> 0, m02 -> 2, ..., m10 -> 18),
# i.e. twice the consecutive index, leaving every odd index unused here.
exercise_number_dict_20 = {
    code: 2 * position for code, position in exercise_number_dict.items()
}

# Dictionary to store data per subject
# (not populated in this section; kept in case other cells rely on it)
subject_data = defaultdict(list)

# Sort the paths: glob returns matches in arbitrary, filesystem-dependent
# order, and the seeded train/validation split further down is only
# reproducible if the annotation list is always built in the same order.
exercises_npy = sorted(glob.glob("../../Scale Normalized Filtered Skeletal Data/*.npy"))
# Explicit check instead of `assert`, which is stripped when Python runs with -O.
if len(exercises_npy) != 2000:
    raise ValueError(f"length is: {len(exercises_npy)}")

# One MMAction2-style annotation dict per recording
exercise_annotations = []

for exercise_path in exercises_npy:
    # Parse metadata from the file name only, so that directory names which
    # happen to contain "inc" or a movement code cannot affect the result.
    file_name = os.path.basename(exercise_path)

    # Get exercise name; files without a movement code (m01-m10) are skipped
    exercise_name_match = re.search(r"m0[1-9]|m10", file_name)
    if not exercise_name_match:
        continue
    exercise_name = exercise_dict.get(
        exercise_name_match.group(0), "Unknown exercise"
    )

    # Get identifier (subject/episode part of frame_dir, e.g. "_s01_e01")
    identifier_match = re.search(r"_s\d{2}_e\d{2}", file_name)
    if identifier_match:
        identifier_name = identifier_match.group(0)
    else:
        # Best effort: warn and continue with an empty identifier.
        identifier_name = ""
        print("Pattern not found: ", exercise_name)

    # Get correct or incorrect - label 1 for incorrect, 0 for correct.
    # NOTE(review): an earlier comment here stated the opposite convention
    # (0 = incorrect, 1 = correct); the values below are what the code has
    # always produced - confirm they match what the training config expects.
    if "inc" in file_name:
        frame_dir = exercise_name + identifier_name + "_inc"
        label = 1
    else:
        frame_dir = exercise_name + identifier_name + "_corr"
        label = 0

    # Prepare other values for dictionary
    exercise_npy = np.load(exercise_path)  # Shape (22x3xnum_frames)

    total_frames = exercise_npy.shape[2]

    # Keypoint format requires 4d array of size [MxTxVxC] - data is currently
    # [VxCxT]. Move the frame axis to the front ([TxVxC]) and prepend an M
    # axis of length 1; the values and dtype of the loaded array are kept.
    keypoint = np.ascontiguousarray(exercise_npy.transpose(2, 0, 1))[np.newaxis]

    exercise_json = {
        'frame_dir': frame_dir,
        'label': int(label),
        'total_frames': total_frames,
        'keypoint': keypoint,
        'exercise_name': exercise_name,  # Extra key-value for identifying type of exercise
    }

    exercise_annotations.append(exercise_json)

# Shuffle and split data into 80% training and 20% validation.
# random_state is fixed so the same annotation order yields the same split.
train_data, val_data = train_test_split(
    exercise_annotations, test_size=0.2, random_state=42
)


print(f"Total data size: {len(exercise_annotations)}")
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")

# Define the input file for MMAction2: each split lists its samples by
# frame_dir, and "annotations" holds every sample (train followed by val).
# Despite its name, loso_data holds this single random 80/20 split.
# NOTE(review): the split keys are named "xsub_*", but the split above is made
# per sample, so recordings of one subject can end up in both lists - confirm
# this is intended.
loso_data = {
    "split": {
        "xsub_train": [item['frame_dir'] for item in train_data],
        "xsub_val": [item['frame_dir'] for item in val_data]
    },
    "annotations": train_data + val_data  # Include all data
}

# Save the 80/20 split to a single pickle file
with open('80_20.pkl', 'wb') as f:
    pickle.dump(loso_data, f)

# Output:
# Total data size: 2000
# Training data size: 1600
# Validation data size: 400
