In [1]:
!pip install -r requirements.txt

Collecting vector-quantize-pytorch (from -r requirements.txt (line 4))
  Obtaining dependency information for vector-quantize-pytorch from https://files.pythonhosted.org/packages/25/ed/71b701d355c3c2c3454fa002efd1fb0270a083080777038ec2c95a63523f/vector_quantize_pytorch-1.14.6-py3-none-any.whl.metadata
  Downloading vector_quantize_pytorch-1.14.6-py3-none-any.whl.metadata (716 bytes)
Collecting einops>=0.7.0 (from vector-quantize-pytorch->-r requirements.txt (line 4))
  Obtaining dependency information for einops>=0.7.0 from https://files.pythonhosted.org/packages/29/0b/2d1c0ebfd092e25935b86509a9a817159212d82aa43d7fb07eca4eeff2c2/einops-0.7.0-py3-none-any.whl.metadata
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting einx[torch]>=0.1.3 (from vector-quantize-pytorch->-r requirements.txt (line 4))
  Downloading einx-0.1.3.tar.gz (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 kB[0m [31m748.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Pre

In [1]:
import os
import sys
import glob
import time
import random
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from vector_quantize_pytorch import VectorQuantize, ResidualVQ

from lib.config import *
from lib.encoder.ffn import FFNEncoder, FFNDecoder
from lib.encoder.vqvae import VQVAE
from lib.train.run_autoencoder_training import AutoencoderTrainer

### 1. Pose Inference

In [None]:
!pip install opencv-python
!pip install mediapipe

In [None]:
from lib.utils.pose import get_pose_estimation

SAMPLE = 'dataset/corpus/ABARTMAK_0.mp4'

SAMPLE_POSE = get_pose_estimation(SAMPLE)

In [5]:
import pandas as pd

def get_pose_array(SAMPLE_POSE):
    """Converts the pose data into a numpy array
    """

    POSE_RAW = pd.DataFrame(SAMPLE_POSE['pose'])
    RIGHT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['right'])
    LEFT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['left'])

    POSE_DF = {}

    for col in POSE_RAW.columns:
        POSE_DF[ 'POSE_' + col + '_X'] = POSE_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'POSE_' + col + '_Y'] = POSE_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'POSE_' + col + '_Z'] = POSE_RAW[col].apply(lambda x: x[2])
        # POSE_DF[col + '_viz'] = POSE_RAW[col].apply(lambda x: x[3])

    for col in RIGHT_HAND_RAW.columns:
        POSE_DF[ 'RIGHT_' + col + '_X' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'RIGHT_' + col + '_Y' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'RIGHT_' + col + '_Z' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['RIGHT_' + col + '_viz'] = RIGHT_HAND_RAW[col].apply(lambda x: x[3])

    for col in LEFT_HAND_RAW.columns:
        POSE_DF[ 'LEFT_' + col + '_X' ] = LEFT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'LEFT_' + col + '_Y' ] = LEFT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'LEFT_' + col + '_Z' ] = LEFT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['LEFT_' + col + '_viz'] = LEFT_HAND_RAW[col].apply(lambda x: x[3])

    POSE_DF = pd.DataFrame(POSE_DF)

    return POSE_DF.to_numpy()

In [None]:
import glob
import numpy as np
from tqdm.notebook import tqdm
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ARRAY_PATH = 'dataset/pose/'
    for datapath in tqdm(glob.glob('dataset/corpus/*.mp4')):
        print(datapath)
        pose = get_pose_estimation(datapath)
        pose_array = get_pose_array(pose)
        print(pose_array.shape, datapath)
        dname = datapath.split('/')[-1].replace('.mp4', '.npy')
        with open(ARRAY_PATH+'/'+dname, 'wb') as f:
            np.save(f, pose_array)

### 2. Graph Autoencoder Training

In [2]:
DATA_PATH = 'dataset/pose/'
data = glob.glob(DATA_PATH + '*.npy')

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
from torch.utils.data import DataLoader
from lib.data.dataset import PoseDataset

train_dataset = PoseDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True
)

100%|██████████| 2716/2716 [00:02<00:00, 908.51it/s]


In [5]:
val_dataset = PoseDataset(X_val)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True
)

100%|██████████| 679/679 [00:00<00:00, 993.78it/s] 


In [6]:
torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [7]:
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = FFNEncoder(
    input_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_ENCODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_OUTPUT_DIM,
)

MODEL_DECODER = FFNDecoder(
    input_dim=GLOBAL_CONFIG.MODEL_DECODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_DECODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
)

MODEL_QUANT = ResidualVQ(
    dim = GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    stochastic_sample_codes = True,
    num_quantizers=1,      # specify number of quantizers
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_NUM_EMBS,    # codebook size           # the weight on the commitment loss
    kmeans_init=True,   # set to True
    kmeans_iters=100     # number of kmeans iterations to calculate the centroids for the codebook on init
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT,
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader,
    num_epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    device='cpu',
)

In [8]:
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

: 

In [9]:
from tqdm.notebook import tqdm

MODEL_VQVAE.eval()

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        quantized, indices, commitment_loss = MODEL_VQVAE(train_sample['array'].float())
        dfs.append(pd.DataFrame({
            'videos': train_sample['token'],
            'labels': indices.detach().cpu().numpy().reshape(-1)
        }))

  0%|          | 0/4728 [00:00<?, ?it/s]

In [10]:
df = pd.concat(dfs)

In [14]:
pd.DataFrame(df.labels.value_counts()).tail(50)

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
1344,1716
5064,1702
497,1675
6259,1639
8327,1626
8766,1547
4175,1537
7469,1489
9784,1414
4077,1381


In [15]:
df[df['labels'] == 7468].head(10)

Unnamed: 0,videos,labels
6,KUAFO╠êR_0.npy,7468
10,YUMUS╠ğAK _0.npy,7468
8,KRIZ_0.npy,7468
27,ONARMAK_0.npy,7468
18,BUHAR_0.npy,7468
3,YAG╠åMAK_0.npy,7468
5,VAY_1.npy,7468
1,MANZARA_0.npy,7468
23,YAYLA_0.npy,7468
0,BAS╠ğARI_0.npy,7468
