In [13]:
import os
import sys
import glob
import time
import random
import pandas as pd
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from lib.config import *
from lib.encoder.ffn import FFNEncoder, FFNDecoder
from lib.encoder.vqvae import VQVAE, VectorQuantizer
from lib.train.run_autoencoder_training import AutoencoderTrainer
from lib.utils.pose import get_pose_estimation
import datetime

In [2]:
import os 

os.cpu_count()

8

In [14]:
mp4_files = glob.glob("dataset/corpus/*.mp4")
mp4_files

['dataset/corpus/GEREK _1.mp4',
 'dataset/corpus/I╠çLGINC╠ğ_0.mp4',
 'dataset/corpus/MEMUR_1.mp4',
 'dataset/corpus/BAS╠ğPARMAK_0.mp4',
 'dataset/corpus/YILDIZ_0.mp4',
 'dataset/corpus/ZARF_0.mp4',
 'dataset/corpus/SERVIS_1.mp4',
 'dataset/corpus/HAKLI_0.mp4',
 'dataset/corpus/KIM_1.mp4',
 'dataset/corpus/I╠çKISI_0.mp4',
 'dataset/corpus/DIG╠åER_0.mp4',
 'dataset/corpus/BAYRAK_0.mp4',
 'dataset/corpus/I╠çNS╠ğALLAH_1.mp4',
 'dataset/corpus/HOPARLO╠êR_0.mp4',
 'dataset/corpus/I╠çMZALAMAK_0.mp4',
 'dataset/corpus/GARANTI_0.mp4',
 'dataset/corpus/KUSURA BAKMAMAK_1.mp4',
 'dataset/corpus/SAC╠ğ_0.mp4',
 'dataset/corpus/VERGI _1.mp4',
 'dataset/corpus/YAVAS╠ğLAMAK_0.mp4',
 'dataset/corpus/BAG╠åIMSIZ _0.mp4',
 'dataset/corpus/GO╠êNU╠êL_0.mp4',
 'dataset/corpus/TREN_1.mp4',
 'dataset/corpus/C╠ğAY_1.mp4',
 'dataset/corpus/UC╠ğAK_0.mp4',
 'dataset/corpus/ALET_0.mp4',
 'dataset/corpus/PRATIK_0.mp4',
 'dataset/corpus/I╠çC╠ğIN_0.mp4',
 'dataset/corpus/BIRA_1.mp4',
 'dataset/corpus/

In [15]:
GLOBAL_CONFIG

namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False,
          MEDIAPIPE_MODEL_COMPLEXITY=1,
          MEDIAPIPE_SMOOTH_LANDMARKS=True,
          MEDIAPIPE_ENABLE_SEGMENTATION=False,
          MEDIAPIPE_SMOOTH_SEGMENTATION=True,
          MEDIAPIPE_REFINE_FACE_LANDMARKS=False,
          MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
          MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,
          MODEL_ENCODER_INPUT_DIM=225,
          MODEL_ENCODER_HIDDEN_DIM=256,
          MODEL_ENCODER_OUTPUT_DIM=768,
          MODEL_VQ_NUM_EMBS=10000,
          MODEL_VQ_EMBED_DIM=768,
          MODEL_VQ_COMMITMENT_COST=0.25,
          MODEL_DECODER_INPUT_DIM=768,
          MODEL_DECODER_HIDDEN_DIM=256,
          MODEL_DECODER_OUTPUT_DIM=225,
          NUM_EPOCHS=100,
          BATCH_SIZE=32,
          LEARNING_RATE=1e-05)

In [16]:
GLOBAL_CONFIG.MEDIAPIPE_MODEL_COMPLEXITY = 2
GLOBAL_CONFIG

namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False,
          MEDIAPIPE_MODEL_COMPLEXITY=2,
          MEDIAPIPE_SMOOTH_LANDMARKS=True,
          MEDIAPIPE_ENABLE_SEGMENTATION=False,
          MEDIAPIPE_SMOOTH_SEGMENTATION=True,
          MEDIAPIPE_REFINE_FACE_LANDMARKS=False,
          MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
          MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,
          MODEL_ENCODER_INPUT_DIM=225,
          MODEL_ENCODER_HIDDEN_DIM=256,
          MODEL_ENCODER_OUTPUT_DIM=768,
          MODEL_VQ_NUM_EMBS=10000,
          MODEL_VQ_EMBED_DIM=768,
          MODEL_VQ_COMMITMENT_COST=0.25,
          MODEL_DECODER_INPUT_DIM=768,
          MODEL_DECODER_HIDDEN_DIM=256,
          MODEL_DECODER_OUTPUT_DIM=225,
          NUM_EPOCHS=100,
          BATCH_SIZE=32,
          LEARNING_RATE=1e-05)

In [6]:
def get_analyze_statistics(video_name: str) -> list:
    """Run pose estimation on one video and count its "missing" info records.

    The info records returned by ``get_pose_estimation(..., with_info=True)``
    are loaded into a three-column frame (video_name, info, value). The total
    frame count is read from the "total_number_of_frames" record, and the
    records labelled "missing_pose", "missing_left_hand" and
    "missing_right_hand" are counted.
    NOTE(review): presumably one such record is emitted per affected frame —
    confirm against get_pose_estimation.

    Args:
        video_name (str): Path of the video handed to get_pose_estimation.

    Returns:
        list: statistics list
            : video_name
            : num_of_frames_total
            : num_of_frames_missing_pose
            : num_of_frames_missing_left_hand
            : num_of_frames_missing_right_hand

    Raises:
        IndexError: If no "total_number_of_frames" record is present.
    """
    # Only the info records are used here; the estimation itself is discarded.
    _, info = get_pose_estimation(video_name, with_info=True)

    pd_info = pd.DataFrame(info, columns=["video_name", "info", "value"])

    num_of_frames_total = pd_info.loc[
        pd_info["info"] == "total_number_of_frames", "value"].iloc[0]

    # Count every label once; labels that never occur fall back to 0.
    # int() keeps the returned counts plain Python ints in both cases.
    info_counts = pd_info["info"].value_counts()
    num_of_frames_missing_pose = int(info_counts.get("missing_pose", 0))
    num_of_frames_missing_left_hand = int(
        info_counts.get("missing_left_hand", 0))
    num_of_frames_missing_right_hand = int(
        info_counts.get("missing_right_hand", 0))

    return [video_name,
            num_of_frames_total,
            num_of_frames_missing_pose,
            num_of_frames_missing_left_hand,
            num_of_frames_missing_right_hand]


In [None]:
# Collect one statistics row per corpus video. Every row is extended with
# the configuration object that is active while the analysis runs.
analyze_results = []
video_count = len(mp4_files)

for position in range(video_count):
    video_path = mp4_files[position]
    print(f"{position}/{video_count} - Analyzing {video_path}")

    video_row = get_analyze_statistics(video_path)
    video_row.append(GLOBAL_CONFIG)
    analyze_results.append(video_row)

print(len(analyze_results))

In [13]:
# Column labels, listed in the same order as the values of each row
# stored in analyze_results.
analyze_columns = [
    "video_name",
    "num_of_total_frames",
    "num_of_missing_frames_pose",
    "num_of_missing_frames_left_hand",
    "num_of_missing_frames_right_hand",
    "configuration",
]

pd_analyze_results = pd.DataFrame(analyze_results, columns=analyze_columns)
pd_analyze_results

Unnamed: 0,video_name,num_of_total_frames,num_of_missing_frames_pose,num_of_missing_frames_left_hand,num_of_missing_frames_right_hand,configuration
0,dataset/corpus/GEREK _1.mp4,51,0,51,10,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
1,dataset/corpus/I╠çLGINC╠ğ_0.mp4,56,0,56,30,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
2,dataset/corpus/MEMUR_1.mp4,60,0,60,32,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3,dataset/corpus/BAS╠ğPARMAK_0.mp4,41,0,41,19,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
4,dataset/corpus/YILDIZ_0.mp4,66,0,21,22,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
...,...,...,...,...,...,...
3390,dataset/corpus/GIZLI_1.mp4,70,0,48,49,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3391,dataset/corpus/I╠çYI_0.mp4,34,0,34,18,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3392,dataset/corpus/KULAK_0.mp4,63,0,63,27,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3393,dataset/corpus/SEBEP_0.mp4,45,0,21,17,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."


In [15]:
formatted_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Create the output folder first: to_csv does not create missing
# directories, and failing here would lose the finished analysis run.
os.makedirs("analyze", exist_ok=True)

# Separator intentionally given as semicolon ";"
# as configuration has comma ","
pd_analyze_results.to_csv(f"analyze/{formatted_datetime}.csv", sep=';')

# Optional, write configuration as separate file
# write_global_config_to_file(f"analyze/{formatted_datetime}_config.txt")

# Log File Analysis

In [133]:
import pandas as pd
import glob 
from types import SimpleNamespace

!pip install tabulate
from tabulate import tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [70]:
log_files = glob.glob("analyze/*.csv")

# Counter columns that are summed per log file.
missing_columns = [
    "num_of_missing_frames_pose",
    "num_of_missing_frames_left_hand",
    "num_of_missing_frames_right_hand",
]

log_analyze = []

for log_file in log_files:
    df_log = pd.read_csv(log_file, sep=";")

    # Sum once, and only over the counter columns, instead of aggregating
    # the whole frame (text columns included) three separate times.
    missing_totals = df_log[missing_columns].sum()

    len_missing_pose = missing_totals["num_of_missing_frames_pose"]
    len_missing_left_hand = missing_totals["num_of_missing_frames_left_hand"]
    len_missing_right_hand = missing_totals["num_of_missing_frames_right_hand"]

    log_analyze.append({
        "name": log_file,
        "num_of_missing_frames_total": len_missing_pose + len_missing_left_hand + len_missing_right_hand,
        "num_of_missing_frames_pose": len_missing_pose,
        "num_of_missing_frames_left_hand": len_missing_left_hand,
        "num_of_missing_frames_right_hand": len_missing_right_hand,
        # The first row's configuration text is taken to represent the file.
        "configuration": df_log["configuration"].iloc[0],
    })

pd_log_analyze = pd.DataFrame(log_analyze)
pd_log_analyze

Unnamed: 0,name,num_of_missing_frames_total,num_of_missing_frames_pose,num_of_missing_frames_left_hand,num_of_missing_frames_right_hand,configuration
0,analyze/20240402_163040.csv,210885,154,128161,82570,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
1,analyze/20240402_064717.csv,210949,154,128197,82598,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
2,analyze/20240401_204336.csv,211452,154,128180,83118,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3,analyze/20240401_145215.csv,211467,154,128176,83137,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
4,analyze/20240402_233240.csv,210935,154,128167,82614,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
5,analyze/20240401_073232.csv,211410,154,128136,83120,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
6,analyze/20240401_000331.csv,210874,154,128146,82574,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
7,analyze/20240331_092627.csv,211417,154,128182,83081,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."


In [71]:
pd_log_analyze.sort_values("num_of_missing_frames_total")

Unnamed: 0,name,num_of_missing_frames_total,num_of_missing_frames_pose,num_of_missing_frames_left_hand,num_of_missing_frames_right_hand,configuration
6,analyze/20240401_000331.csv,210874,154,128146,82574,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
0,analyze/20240402_163040.csv,210885,154,128161,82570,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
4,analyze/20240402_233240.csv,210935,154,128167,82614,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
1,analyze/20240402_064717.csv,210949,154,128197,82598,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
5,analyze/20240401_073232.csv,211410,154,128136,83120,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
7,analyze/20240331_092627.csv,211417,154,128182,83081,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
2,analyze/20240401_204336.csv,211452,154,128180,83118,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."
3,analyze/20240401_145215.csv,211467,154,128176,83137,"namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False, M..."


In [119]:
# Take the configuration text of the log with the fewest missing frames,
# strip the "namespace(" wrapper and list its MEDIAPIPE settings.
ranked_logs = pd_log_analyze.sort_values("num_of_missing_frames_total")
best_configuration = ranked_logs.iloc[0]["configuration"]
best_configuration = best_configuration.replace('namespace', '').replace('(', '')

mediapipe_items = [
    token for token in best_configuration.split(" ") if "MEDIAPIPE" in token
]
for item in mediapipe_items:
    print(item)

MEDIAPIPE_STATIC_IMAGE_MODE=False,
MEDIAPIPE_MODEL_COMPLEXITY=2,
MEDIAPIPE_SMOOTH_LANDMARKS=True,
MEDIAPIPE_ENABLE_SEGMENTATION=False,
MEDIAPIPE_SMOOTH_SEGMENTATION=True,
MEDIAPIPE_REFINE_FACE_LANDMARKS=False,
MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,


In [142]:
def get_mediapipe_config(config):
    """Pick the complexity / confidence settings out of a configuration string.

    Args:
        config: Text form of the configuration, e.g. "namespace(A=1, B=2)".

    Returns:
        str: Every space-separated item of the text (after removing
            "namespace" and "(") that contains "COMPLEXITY" or "CONFIDENCE",
            each followed by a newline; "" when nothing matches.
    """
    stripped = config.replace('namespace', '').replace('(', '')
    wanted_markers = ("COMPLEXITY", "CONFIDENCE")

    selected = [
        token
        for token in stripped.split(" ")
        if any(marker in token for marker in wanted_markers)
    ]

    return "".join(token + "\n" for token in selected)


# The metric columns never change, so they are defined once, outside the
# row loop. This also keeps `headers` defined when there are no log rows.
metric_columns = ["name",
                  "num_of_missing_frames_total",
                  "num_of_missing_frames_pose",
                  "num_of_missing_frames_left_hand",
                  "num_of_missing_frames_right_hand"]

tabulated_rows = []
for index, row in pd_log_analyze.sort_values("num_of_missing_frames_total").iterrows():
    # Metric values first, then the condensed MediaPipe settings as the
    # last cell of the printed row.
    print_row = [row[column] for column in metric_columns]
    print_row.append(get_mediapipe_config(row["configuration"]))

    tabulated_rows.append(print_row)

headers = metric_columns + ["configuration"]
print(tabulate(tabulated_rows, headers=headers))

name                           num_of_missing_frames_total    num_of_missing_frames_pose    num_of_missing_frames_left_hand    num_of_missing_frames_right_hand  configuration
---------------------------  -----------------------------  ----------------------------  ---------------------------------  ----------------------------------  ----------------------------------------
analyze/20240401_000331.csv                         210874                           154                             128146                               82574  MEDIAPIPE_MODEL_COMPLEXITY=2,
                                                                                                                                                                 MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
                                                                                                                                                                 MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,
analyze/20240402_163040.csv    

In [49]:
pd_log_analyze.sort_values("num_of_missing_frames_left_hand")

Unnamed: 0,name,num_of_missing_frames_pose,num_of_missing_frames_left_hand,num_of_missing_frames_right_hand,configuration,total_missing_frames
5,analyze/20240401_073232.csv,154,128136,83120,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,211410
6,analyze/20240401_000331.csv,154,128146,82574,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,210874
0,analyze/20240402_163040.csv,154,128161,82570,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,210885
4,analyze/20240402_233240.csv,154,128167,82614,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,210935
3,analyze/20240401_145215.csv,154,128176,83137,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,211467
2,analyze/20240401_204336.csv,154,128180,83118,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,211452
7,analyze/20240331_092627.csv,154,128182,83081,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,211417
1,analyze/20240402_064717.csv,154,128197,82598,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...,210949


In [47]:
pd_log_analyze.sort_values("num_of_missing_frames_right_hand")

Unnamed: 0,name,num_of_missing_frames_pose,num_of_missing_frames_left_hand,num_of_missing_frames_right_hand,configuration
0,analyze/20240402_163040.csv,154,128161,82570,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
6,analyze/20240401_000331.csv,154,128146,82574,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
1,analyze/20240402_064717.csv,154,128197,82598,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
4,analyze/20240402_233240.csv,154,128167,82614,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
7,analyze/20240331_092627.csv,154,128182,83081,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
2,analyze/20240401_204336.csv,154,128180,83118,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
5,analyze/20240401_073232.csv,154,128136,83120,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...
3,analyze/20240401_145215.csv,154,128176,83137,0 namespace(MEDIAPIPE_STATIC_IMAGE_MODE=...


# Extract Features (x, y coordinates) with the Best MediaPipe Model
- MEDIAPIPE_MODEL_COMPLEXITY = 2
- MEDIAPIPE_MIN_DETECTION_CONFIDENCE = 0.50
- MEDIAPIPE_MIN_TRACKING_CONFIDENCE = 0.50

In [1]:
import os
import sys
import glob
import time
import random
import pandas as pd
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from lib.config import *
from lib.encoder.ffn import FFNEncoder, FFNDecoder
from lib.encoder.vqvae import VQVAE, VectorQuantizer
from lib.train.run_autoencoder_training import AutoencoderTrainer
from lib.utils.pose import get_pose_estimation
import datetime

In [2]:
# Inspect the format and shape of the previously extracted features.
npy_abartmak = np.load("dataset/pose/ABARTMAK_0.npy")

In [3]:
# The ABARTMAK_0 video has 54 frames.
# Each frame has 75 features with 3 coordinates (x, y, z): 75 x 3 = 225 values.
# So the shape is (54, 225).
npy_abartmak.shape

(54, 225)

In [4]:
npy_abartmak[0].shape

(225,)

In [5]:
npy_abartmak[0]

array([ 0.46980599,  0.24260582, -0.76693153,  0.48705089,  0.20441961,
       -0.73404068,  0.49691433,  0.20318872, -0.73432118,  0.50666219,
        0.2029556 , -0.7345717 ,  0.45430273,  0.21129489, -0.73008591,
        0.44399834,  0.21388048, -0.73012489,  0.43469137,  0.21603036,
       -0.730501  ,  0.52446949,  0.22176373, -0.47543025,  0.426355  ,
        0.2384519 , -0.44684628,  0.49254787,  0.28345418, -0.66680861,
        0.45368654,  0.28756517, -0.65895313,  0.60852945,  0.47836378,
       -0.3089987 ,  0.36462992,  0.46086848, -0.23654462,  0.63774163,
        0.75269014, -0.23200703,  0.32706618,  0.72681522, -0.14431894,
        0.63423103,  0.95979345, -0.47679883,  0.31722227,  0.93455708,
       -0.44344983,  0.63354498,  1.04028308, -0.56017518,  0.31190717,
        1.0080514 , -0.52229625,  0.61795419,  1.02328861, -0.61977941,
        0.32109481,  0.98787445, -0.60448807,  0.6078282 ,  0.98132926,
       -0.50781858,  0.33461523,  0.95298123, -0.48524538,  0.54

In [6]:
GLOBAL_CONFIG

namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False,
          MEDIAPIPE_MODEL_COMPLEXITY=1,
          MEDIAPIPE_SMOOTH_LANDMARKS=True,
          MEDIAPIPE_ENABLE_SEGMENTATION=False,
          MEDIAPIPE_SMOOTH_SEGMENTATION=True,
          MEDIAPIPE_REFINE_FACE_LANDMARKS=False,
          MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
          MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,
          MODEL_ENCODER_INPUT_DIM=225,
          MODEL_ENCODER_HIDDEN_DIM=256,
          MODEL_ENCODER_OUTPUT_DIM=768,
          MODEL_VQ_NUM_EMBS=10000,
          MODEL_VQ_EMBED_DIM=768,
          MODEL_VQ_COMMITMENT_COST=0.25,
          MODEL_DECODER_INPUT_DIM=768,
          MODEL_DECODER_HIDDEN_DIM=256,
          MODEL_DECODER_OUTPUT_DIM=225,
          NUM_EPOCHS=100,
          BATCH_SIZE=32,
          LEARNING_RATE=1e-05)

In [7]:
GLOBAL_CONFIG.MEDIAPIPE_MODEL_COMPLEXITY = 2
GLOBAL_CONFIG.MEDIAPIPE_MIN_TRACKING_CONFIDENCE = 0.5
GLOBAL_CONFIG.MEDIAPIPE_MIN_DETECTION_CONFIDENCE = 0.5

In [8]:
GLOBAL_CONFIG

namespace(MEDIAPIPE_STATIC_IMAGE_MODE=False,
          MEDIAPIPE_MODEL_COMPLEXITY=2,
          MEDIAPIPE_SMOOTH_LANDMARKS=True,
          MEDIAPIPE_ENABLE_SEGMENTATION=False,
          MEDIAPIPE_SMOOTH_SEGMENTATION=True,
          MEDIAPIPE_REFINE_FACE_LANDMARKS=False,
          MEDIAPIPE_MIN_DETECTION_CONFIDENCE=0.5,
          MEDIAPIPE_MIN_TRACKING_CONFIDENCE=0.5,
          MODEL_ENCODER_INPUT_DIM=225,
          MODEL_ENCODER_HIDDEN_DIM=256,
          MODEL_ENCODER_OUTPUT_DIM=768,
          MODEL_VQ_NUM_EMBS=10000,
          MODEL_VQ_EMBED_DIM=768,
          MODEL_VQ_COMMITMENT_COST=0.25,
          MODEL_DECODER_INPUT_DIM=768,
          MODEL_DECODER_HIDDEN_DIM=256,
          MODEL_DECODER_OUTPUT_DIM=225,
          NUM_EPOCHS=100,
          BATCH_SIZE=32,
          LEARNING_RATE=1e-05)

In [9]:
estimation, info = get_pose_estimation("dataset/corpus/ABARTMAK_0.mp4")
estimation

I0000 00:00:1713889060.421182       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


{'pose': {'NOSE': [array([0.47333539, 0.25699741]),
   array([0.473744  , 0.25684109]),
   array([0.47421065, 0.25680396]),
   array([0.47422951, 0.25670484]),
   array([0.47375834, 0.25657251]),
   array([0.47313154, 0.25706485]),
   array([0.47277117, 0.25706029]),
   array([0.4716534 , 0.25837141]),
   array([0.46844336, 0.26130524]),
   array([0.46637729, 0.26306325]),
   array([0.46356186, 0.26719874]),
   array([0.46102089, 0.2693041 ]),
   array([0.46059084, 0.27002934]),
   array([0.46068838, 0.26989642]),
   array([0.46120131, 0.26977569]),
   array([0.46154481, 0.26972553]),
   array([0.4626106 , 0.26946247]),
   array([0.46468174, 0.26834789]),
   array([0.46581745, 0.26592144]),
   array([0.46748421, 0.26371834]),
   array([0.46968067, 0.26125252]),
   array([0.473739  , 0.25941655]),
   array([0.47687447, 0.25846058]),
   array([0.47952467, 0.25822198]),
   array([0.48149925, 0.25772682]),
   array([0.48293203, 0.25698498]),
   array([0.48318577, 0.25683555]),
   array([0.

In [10]:
number_of_frames = len(estimation["pose"]["NOSE"])
number_of_frames

54

In [11]:
# Preview of the landmark order that one row of the .npy file will follow.
# Only the first frame is listed.
for frame in range(min(number_of_frames, 1)):
    for key, landmarks in estimation.items():     # pose, right, left
        for inner_key in landmarks:               # NOSE, LEFT_EYE_INNER ... WRIST ... PINKY_TIP
            print(key, inner_key, frame)

pose NOSE 0
pose LEFT_EYE_INNER 0
pose LEFT_EYE 0
pose LEFT_EYE_OUTER 0
pose RIGHT_EYE_INNER 0
pose RIGHT_EYE 0
pose RIGHT_EYE_OUTER 0
pose LEFT_EAR 0
pose RIGHT_EAR 0
pose MOUTH_LEFT 0
pose MOUTH_RIGHT 0
pose LEFT_SHOULDER 0
pose RIGHT_SHOULDER 0
pose LEFT_ELBOW 0
pose RIGHT_ELBOW 0
pose LEFT_WRIST 0
pose RIGHT_WRIST 0
pose LEFT_PINKY 0
pose RIGHT_PINKY 0
pose LEFT_INDEX 0
pose RIGHT_INDEX 0
pose LEFT_THUMB 0
pose RIGHT_THUMB 0
pose LEFT_HIP 0
pose RIGHT_HIP 0
pose LEFT_KNEE 0
pose RIGHT_KNEE 0
pose LEFT_ANKLE 0
pose RIGHT_ANKLE 0
pose LEFT_HEEL 0
pose RIGHT_HEEL 0
pose LEFT_FOOT_INDEX 0
pose RIGHT_FOOT_INDEX 0
right WRIST 0
right THUMB_CMC 0
right THUMB_MCP 0
right THUMB_IP 0
right THUMB_TIP 0
right INDEX_FINGER_MCP 0
right INDEX_FINGER_PIP 0
right INDEX_FINGER_DIP 0
right INDEX_FINGER_TIP 0
right MIDDLE_FINGER_MCP 0
right MIDDLE_FINGER_PIP 0
right MIDDLE_FINGER_DIP 0
right MIDDLE_FINGER_TIP 0
right RING_FINGER_MCP 0
right RING_FINGER_PIP 0
right RING_FINGER_DIP 0
right RING_FINGER_T

In [12]:
# Draft of the .npy layout: one entry per frame, holding the values of
# every landmark of every group in dictionary order.
features = []

for frame in range(number_of_frames):
    linear_feature = [
        landmark_frames[frame]
        for landmarks in estimation.values()          # pose, right, left
        for landmark_frames in landmarks.values()     # NOSE, LEFT_EYE_INNER ... WRIST ... PINKY_TIP
    ]
    features.append(linear_feature)

In [13]:
npy_features = np.asarray(features)
npy_features

array([[[0.47333539, 0.25699741],
        [0.48809201, 0.21823561],
        [0.49744898, 0.21913093],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.473744  , 0.25684109],
        [0.48808417, 0.21823575],
        [0.49815124, 0.21892767],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.47421065, 0.25680396],
        [0.488083  , 0.21824443],
        [0.49868104, 0.21880187],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       ...,

       [[0.48775476, 0.25402632],
        [0.50217664, 0.21377172],
        [0.51407564, 0.2139025 ],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.48455131, 0.25410318],
        [0.49884599, 0.21359926],
        [0.5105232 , 0.21352053],
        .

In [14]:
# 54 frames
# 75 features
# 2 coordinates each, x and y
npy_features.shape

(54, 75, 2)

In [19]:
def get_features_as_npy_array(video_path : str) -> np.ndarray:
    """Flatten the pose estimation of one video into a 2-D feature array.

    For every frame, the values of every landmark of every estimation group
    are collected in dictionary order and flattened into a single row.

    Args:
        video_path (str): Path of the video handed to get_pose_estimation.

    Returns:
        np.ndarray: Array with one flattened row per frame, i.e. shape
            (number_of_frames, values_per_frame).

    Raises:
        ValueError: If the estimation contains no frames.
    """
    # The info records returned alongside the estimation are not needed here.
    estimation, _ = get_pose_estimation(video_path)

    # The NOSE track defines the frame count.
    # NOTE(review): assumes every landmark list has this same length — confirm.
    number_of_frames = len(estimation["pose"]["NOSE"])

    if number_of_frames == 0:
        # Without this guard the reshape below fails with an opaque
        # "cannot reshape array of size 0" error that hides the culprit.
        raise ValueError(f"No frames could be extracted from {video_path!r}")

    features = [
        [
            landmark_frames[frame]
            for landmarks in estimation.values()          # pose, right, left
            for landmark_frames in landmarks.values()     # NOSE ... PINKY_TIP
        ]
        for frame in range(number_of_frames)
    ]

    npy_features = np.asarray(features)

    # Collapse the (landmark, coordinate) axes into one row per frame.
    return npy_features.reshape((npy_features.shape[0], -1))

In [23]:
# The dataset/corpus/ABARTMAK_0.mp4 video has 54 frames.
# Each frame has 75 features with two coordinates (x, y): 75 x 2 = 150 values.
test_video_result = get_features_as_npy_array("dataset/corpus/ABARTMAK_0.mp4")
test_video_result.shape == (54,150)

I0000 00:00:1713889353.535865       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


True

In [None]:
mp4_files = glob.glob("dataset/corpus/*.mp4")

# np.save does not create missing folders, so make sure the target exists
# before the (long) extraction loop starts.
features_dir = os.path.join("dataset", "features")
os.makedirs(features_dir, exist_ok=True)

for mp4_file in mp4_files:
    # Swap only the real extension; str.replace would also rewrite any
    # other ".mp4" occurring inside the file name.
    filename = os.path.splitext(os.path.basename(mp4_file))[0] + ".npy"
    result_path = os.path.join(features_dir, filename)

    result_npy = get_features_as_npy_array(mp4_file)

    np.save(result_path, result_npy)