In [1]:
import os
import sys
import glob
import time
import random
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from lib.config import *
from lib.encoder.ffn import FFNEncoder, FFNDecoder
from lib.encoder.vqvae import VQVAE, VectorQuantizer
from lib.train.run_autoencoder_training import AutoencoderTrainer

### 1. Pose Inference

In [None]:
!pip install opencv-python
!pip install mediapipe

In [None]:
from lib.utils.pose import get_pose_estimation

SAMPLE = 'dataset/corpus/ABARTMAK_0.mp4'

SAMPLE_POSE = get_pose_estimation(SAMPLE)

In [5]:
import pandas as pd

def get_pose_array(SAMPLE_POSE):
    """Converts the pose data into a numpy array
    """

    POSE_RAW = pd.DataFrame(SAMPLE_POSE['pose'])
    RIGHT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['right'])
    LEFT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['left'])

    POSE_DF = {}

    for col in POSE_RAW.columns:
        POSE_DF[ 'POSE_' + col + '_X'] = POSE_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'POSE_' + col + '_Y'] = POSE_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'POSE_' + col + '_Z'] = POSE_RAW[col].apply(lambda x: x[2])
        # POSE_DF[col + '_viz'] = POSE_RAW[col].apply(lambda x: x[3])

    for col in RIGHT_HAND_RAW.columns:
        POSE_DF[ 'RIGHT_' + col + '_X' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'RIGHT_' + col + '_Y' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'RIGHT_' + col + '_Z' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['RIGHT_' + col + '_viz'] = RIGHT_HAND_RAW[col].apply(lambda x: x[3])

    for col in LEFT_HAND_RAW.columns:
        POSE_DF[ 'LEFT_' + col + '_X' ] = LEFT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'LEFT_' + col + '_Y' ] = LEFT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'LEFT_' + col + '_Z' ] = LEFT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['LEFT_' + col + '_viz'] = LEFT_HAND_RAW[col].apply(lambda x: x[3])

    POSE_DF = pd.DataFrame(POSE_DF)

    return POSE_DF.to_numpy()

In [None]:
import glob
import numpy as np
from tqdm.notebook import tqdm
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ARRAY_PATH = 'dataset/pose/'
    for datapath in tqdm(glob.glob('dataset/corpus/*.mp4')):
        print(datapath)
        pose = get_pose_estimation(datapath)
        pose_array = get_pose_array(pose)
        print(pose_array.shape, datapath)
        dname = datapath.split('/')[-1].replace('.mp4', '.npy')
        with open(ARRAY_PATH+'/'+dname, 'wb') as f:
            np.save(f, pose_array)

In [3]:
POSE_DF.head(10)

Unnamed: 0,POSE_NOSE_X,POSE_NOSE_Y,POSE_NOSE_Z,POSE_LEFT_EYE_INNER_X,POSE_LEFT_EYE_INNER_Y,POSE_LEFT_EYE_INNER_Z,POSE_LEFT_EYE_X,POSE_LEFT_EYE_Y,POSE_LEFT_EYE_Z,POSE_LEFT_EYE_OUTER_X,...,LEFT_PINKY_MCP_Z,LEFT_PINKY_PIP_X,LEFT_PINKY_PIP_Y,LEFT_PINKY_PIP_Z,LEFT_PINKY_DIP_X,LEFT_PINKY_DIP_Y,LEFT_PINKY_DIP_Z,LEFT_PINKY_TIP_X,LEFT_PINKY_TIP_Y,LEFT_PINKY_TIP_Z
0,0.469806,0.242606,-0.766932,0.487051,0.20442,-0.734041,0.496914,0.203189,-0.734321,0.506662,...,,,,,,,,,,
1,0.470035,0.242358,-0.774341,0.487213,0.20448,-0.739161,0.49772,0.203515,-0.739494,0.507534,...,,,,,,,,,,
2,0.470407,0.242079,-0.779626,0.487484,0.204492,-0.742903,0.498458,0.203672,-0.743197,0.508245,...,,,,,,,,,,
3,0.470645,0.241855,-0.774343,0.487634,0.204496,-0.734576,0.498899,0.203771,-0.734834,0.508677,...,,,,,,,,,,
4,0.470878,0.241824,-0.696069,0.487784,0.204503,-0.656863,0.499225,0.203839,-0.657091,0.508947,...,,,,,,,,,,
5,0.469955,0.242016,-0.714645,0.486899,0.204655,-0.677112,0.498503,0.204091,-0.677342,0.508513,...,,,,,,,,,,
6,0.468617,0.242897,-0.721925,0.485244,0.205087,-0.68607,0.497028,0.204452,-0.686267,0.507402,...,,,,,,,,,,
7,0.468346,0.243676,-0.725536,0.484669,0.20555,-0.686524,0.496354,0.204834,-0.686594,0.506824,...,,,,,,,,,,
8,0.464956,0.24738,-0.718825,0.480842,0.208866,-0.682417,0.492176,0.20766,-0.682485,0.502795,...,,,,,,,,,,
9,0.462591,0.25139,-0.569659,0.477768,0.213662,-0.533079,0.488384,0.212251,-0.533273,0.499159,...,,,,,,,,,,


In [4]:
(21 + 21 + 33) * 3 == POSE_DF.shape[1]

True

### 2. Graph Autoencoder Training

In [2]:
DATA_PATH = 'dataset/pose/'
data = glob.glob(DATA_PATH + '*.npy')

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
from torch.utils.data import DataLoader
from lib.data.dataset import PoseDataset

train_dataset = PoseDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True
)

100%|██████████| 2716/2716 [00:01<00:00, 1362.29it/s]


In [18]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = FFNEncoder(
    input_dim=MODEL_ENCODER_INPUT_DIM,
    hidden_dim=MODEL_ENCODER_HIDDEN_DIM,
    output_dim=MODEL_ENCODER_OUTPUT_DIM,
)

MODEL_DECODER = FFNDecoder(
    input_dim=MODEL_DECODER_INPUT_DIM,
    hidden_dim=MODEL_DECODER_HIDDEN_DIM,
    output_dim=MODEL_ENCODER_INPUT_DIM,
)

MODEL_QUANT = VectorQuantizer(
    num_embeddings=MODEL_VQ_NUM_EMBS,
    embedding_dim=MODEL_VQ_EMBED_DIM,
    commitment_cost=MODEL_VQ_COMMITMENT_COST
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=LEARNING_RATE,
    device='cpu'
)

In [None]:
trainer.train(
    train_dataloader, 
    num_epochs=NUM_EPOCHS,
)

In [None]:
MODEL_VQVAE.eval()

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        loss, x_recon, min_distances_indices, quantized = MODEL_VQVAE(train_sample['array'].float())
        dfs.append(pd.DataFrame({
            'videos': train_sample['token'],
            'labels': min_distances_indices.detach().cpu().numpy().reshape(-1)
        }))

In [37]:
df = pd.concat(dfs)

In [39]:
df.labels.value_counts()

labels
14      8211
4706    8060
8625     368
1414     256
5061     255
4662     168
2365     144
5073     125
1882      97
296       85
8416      58
6545      45
9206       9
68         7
Name: count, dtype: int64

In [42]:
df[df['labels'] == 8416].head(10)

Unnamed: 0,videos,labels
23,"ELEKTRONIK POSTA, E-POSTA_0.npy",8416
21,"SAG╠åIR, I╠çS╠ğITME ENGELLI_1.npy",8416
9,ON IKI_1.npy,8416
20,LIRA_2.npy,8416
23,SAYGISIZ_0.npy,8416
17,KU╠êFU╠êR_1.npy,8416
8,KADIN_0.npy,8416
16,AG╠åRITMAK_1.npy,8416
12,"SO╠êYLEMEK, DEMEK_0.npy",8416
1,YUKARI_0.npy,8416
