In [2]:
!pip install -r requirements.txt
!pip install opencv-python
!pip install mediapipe



In [1]:
import os
import sys
import glob
import time
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from vector_quantize_pytorch import VectorQuantize, ResidualVQ

from lib.config import *
from lib.encoder.vqvae import VQVAE
from lib.utils.pose import get_pose_estimation
from lib.encoder.ffn import FFNEncoder, FFNDecoder
from lib.encoder.cnn import CNNEncoder, CNNDecoder
from lib.data.dataset import PoseDataset, PoseDistanceDataset
from lib.train.run_autoencoder_training import AutoencoderTrainer

### 1. Pose Inference

In [2]:
SAMPLE = 'dataset/corpus/ABARTMAK_0.mp4'
SAMPLE_POSE = get_pose_estimation(SAMPLE)














INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [3]:
import pandas as pd

def get_pose_array(SAMPLE_POSE):
    """Converts the pose data into a numpy array
    """

    POSE_RAW = pd.DataFrame(SAMPLE_POSE['pose'])
    RIGHT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['right'])
    LEFT_HAND_RAW = pd.DataFrame(SAMPLE_POSE['left'])

    POSE_DF = {}

    for col in POSE_RAW.columns:
        POSE_DF[ 'POSE_' + col + '_X'] = POSE_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'POSE_' + col + '_Y'] = POSE_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'POSE_' + col + '_Z'] = POSE_RAW[col].apply(lambda x: x[2])
        # POSE_DF[col + '_viz'] = POSE_RAW[col].apply(lambda x: x[3])

    for col in RIGHT_HAND_RAW.columns:
        POSE_DF[ 'RIGHT_' + col + '_X' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'RIGHT_' + col + '_Y' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'RIGHT_' + col + '_Z' ] = RIGHT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['RIGHT_' + col + '_viz'] = RIGHT_HAND_RAW[col].apply(lambda x: x[3])

    for col in LEFT_HAND_RAW.columns:
        POSE_DF[ 'LEFT_' + col + '_X' ] = LEFT_HAND_RAW[col].apply(lambda x: x[0])
        POSE_DF[ 'LEFT_' + col + '_Y' ] = LEFT_HAND_RAW[col].apply(lambda x: x[1])
        POSE_DF[ 'LEFT_' + col + '_Z' ] = LEFT_HAND_RAW[col].apply(lambda x: x[2])
        # POSE_DF['LEFT_' + col + '_viz'] = LEFT_HAND_RAW[col].apply(lambda x: x[3])

    POSE_DF = pd.DataFrame(POSE_DF)

    return POSE_DF

In [8]:
POSE_DF = get_pose_array(SAMPLE_POSE[0])

In [11]:
POSE_DF = POSE_DF.replace(np.nan,0)

In [29]:
def get_matrices(POSE_DF):
    """Converts the pose data into a numpy array of distance matrices
    """
    x_cols = [col for col in POSE_DF.columns if col.endswith('_X')]
    y_cols = [col for col in POSE_DF.columns if col.endswith('_Y')]
    z_cols = [col for col in POSE_DF.columns if col.endswith('_Z')]

    frames = []
    for i in range(1, POSE_DF.shape[0]):
        x_row = POSE_DF[x_cols].iloc[i].to_numpy()
        y_row = POSE_DF[y_cols].iloc[i].to_numpy()
        z_row = POSE_DF[z_cols].iloc[i].to_numpy()

        def get_difference_matrix(row):
            m, n = np.meshgrid(row, row)
            out = m-n
            return out

        x_diff = get_difference_matrix(x_row)
        y_diff = get_difference_matrix(y_row)
        z_diff = get_difference_matrix(z_row)

        frame = np.stack([x_diff, y_diff, z_diff], axis=2)
        frames.append(frame)

    frames = np.stack(frames, axis=0)
    return frames

In [35]:
OUT_PATH = 'dataset/adjacency'
POSE_PATH = 'dataset/pose'

for file in tqdm(glob.glob('dataset/pose/*.npy')):
    if os.path.exists(os.path.join(OUT_PATH, os.path.basename(file).replace('.mp4', '.npy'))):
        # print('Skipping', file)
        continue
    with open(file, 'rb') as f:
        array = np.load(f, allow_pickle=True)
        # replace nan with 0 
        array = np.nan_to_num(array)
    pose_df = pd.DataFrame(array, columns=POSE_DF.columns)
    pose_df = pose_df.replace(np.nan,0)
    MATRICES = get_matrices(pose_df)
    # print(MATRICES.shape)
    np.save(os.path.join(OUT_PATH, os.path.basename(file).replace('.mp4', '.npy')), MATRICES)


  0%|          | 0/3395 [00:00<?, ?it/s]

In [37]:
print(pd.DataFrame(x_diff, columns=[col for col in POSE_DF.columns if col.endswith('_X')], index=[col for col in POSE_DF.columns if col.endswith('_X')]).to_markdown())

|                           |   POSE_NOSE_X |   POSE_LEFT_EYE_INNER_X |   POSE_LEFT_EYE_X |   POSE_LEFT_EYE_OUTER_X |   POSE_RIGHT_EYE_INNER_X |   POSE_RIGHT_EYE_X |   POSE_RIGHT_EYE_OUTER_X |   POSE_LEFT_EAR_X |   POSE_RIGHT_EAR_X |   POSE_MOUTH_LEFT_X |   POSE_MOUTH_RIGHT_X |   POSE_LEFT_SHOULDER_X |   POSE_RIGHT_SHOULDER_X |   POSE_LEFT_ELBOW_X |   POSE_RIGHT_ELBOW_X |   POSE_LEFT_WRIST_X |   POSE_RIGHT_WRIST_X |   POSE_LEFT_PINKY_X |   POSE_RIGHT_PINKY_X |   POSE_LEFT_INDEX_X |   POSE_RIGHT_INDEX_X |   POSE_LEFT_THUMB_X |   POSE_RIGHT_THUMB_X |   POSE_LEFT_HIP_X |   POSE_RIGHT_HIP_X |   POSE_LEFT_KNEE_X |   POSE_RIGHT_KNEE_X |   POSE_LEFT_ANKLE_X |   POSE_RIGHT_ANKLE_X |   POSE_LEFT_HEEL_X |   POSE_RIGHT_HEEL_X |   POSE_LEFT_FOOT_INDEX_X |   POSE_RIGHT_FOOT_INDEX_X |   RIGHT_WRIST_X |   RIGHT_THUMB_CMC_X |   RIGHT_THUMB_MCP_X |   RIGHT_THUMB_IP_X |   RIGHT_THUMB_TIP_X |   RIGHT_INDEX_FINGER_MCP_X |   RIGHT_INDEX_FINGER_PIP_X |   RIGHT_INDEX_FINGER_DIP_X |   RIGHT_INDEX_FINGER_TIP_X

In [None]:
import glob
import numpy as np
from tqdm.notebook import tqdm
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ARRAY_PATH = 'dataset/pose/'
    for datapath in tqdm(glob.glob('dataset/corpus/*.mp4')):
        print(datapath)
        pose = get_pose_estimation(datapath)
        pose_array = get_pose_array(pose)
        print(pose_array.shape, datapath)
        dname = datapath.split('/')[-1].replace('.mp4', '.npy')
        with open(ARRAY_PATH+'/'+dname, 'wb') as f:
            np.save(f, pose_array)

### 2. Graph Autoencoder Training

In [2]:
DATA_PATH = 'dataset/pose/'
data = glob.glob(DATA_PATH + '*.npy')
X_train, X_val = train_test_split(data, test_size=0.2, random_state=42)

In [79]:
train_dataset = PoseDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
)

 27%|██▋       | 743/2716 [00:26<17:39,  1.86it/s]

: 

In [None]:
val_dataset = PoseDataset(X_val)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
)

100%|██████████| 679/679 [00:01<00:00, 534.16it/s]


In [None]:
torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [None]:
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = FFNEncoder(
    input_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_ENCODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_OUTPUT_DIM,
)

MODEL_DECODER = FFNDecoder(
    input_dim=GLOBAL_CONFIG.MODEL_DECODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_DECODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
)

MODEL_QUANT = ResidualVQ(
    dim = GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    stochastic_sample_codes=True,
    num_quantizers=1,      # specify number of quantizers
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_NUM_EMBS,    # codebook size           
    kmeans_init=True,   # set to True
    kmeans_iters=100     # number of kmeans iterations to calculate the centroids for the codebook on init
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT,
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader,
    num_epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    device='cpu',
)

In [44]:
trainer.train()

  1%|          | 1/100 [07:16<11:59:27, 436.03s/it]


***
Epoch:1, 
Train Commitment Loss: 0.003539219268142835, 
Train Reconstruction Loss: 0.025972867930138976, 
Train Quantization Tokens:
|       |   9911 |   1208 |   4962 |   6795 |   5672 |   1154 |   3433 |   2908 |   2910 |   3532 |   4078 |   3310 |   8610 |   9459 |   53 |   2390 |   9294 |   6905 |   2424 |   7484 |   4833 |   3685 |   8569 |   1483 |   1899 |   1235 |   2683 |   435 |   8663 |   5429 |   408 |   715 |   3127 |   3729 |   5414 |   4095 |   8888 |   9779 |   1722 |   8701 |   4058 |   655 |   9551 |   4308 |   9350 |   3633 |   528 |   2183 |   8147 |   1407 |   1613 |   7945 |   2650 |   6755 |   351 |   1469 |   8369 |   2488 |   3422 |   4963 |   6201 |   2174 |   1147 |   119 |   2398 |   9573 |   1126 |   1842 |   4569 |   4444 |   9994 |   7077 |   6244 |   3517 |   6394 |   2975 |   6305 |   6951 |   9248 |   2937 |   5832 |   4786 |   3050 |   7394 |   3432 |   3641 |   4201 |   7151 |   7626 |   7918 |   720 |   9132 |   9227 |   9395 |   9556 |   2188 

  2%|▏         | 2/100 [14:21<11:42:23, 430.04s/it]


***
Epoch:2, 
Train Commitment Loss: 0.004150437284669723, 
Train Reconstruction Loss: 0.005796700465065851, 
Train Quantization Tokens:
|       |   6244 |   4444 |   1722 |   7484 |   6795 |   8888 |   6905 |   2424 |   53 |   9573 |   720 |   1126 |   9248 |   2398 |   8663 |   2650 |   2174 |   6305 |   4078 |   9294 |   1842 |   2529 |   119 |   1154 |   435 |   2683 |   9551 |   351 |   3432 |   3310 |   2188 |   7151 |   9132 |   9994 |   408 |   9459 |   3579 |   5414 |   4201 |   2908 |   8569 |   2183 |   1899 |   1208 |   8369 |   6894 |   9779 |   3532 |   528 |   4715 |   1613 |   1235 |   7077 |   5429 |   7423 |   2910 |   3422 |   7626 |   3633 |   3611 |   2134 |   8610 |   7033 |   3641 |   4095 |   1469 |   3729 |   4833 |   8701 |   6951 |   4046 |   7394 |   2488 |   3517 |   655 |   9350 |   3050 |   4308 |   1483 |   1147 |   715 |   9556 |   9431 |   2506 |   1276 |   6910 |   3762 |   2091 |   5288 |   1758 |   610 |   9808 |   7847 |   264 |   4895 |   1493 | 

  2%|▏         | 2/100 [14:48<12:05:18, 444.06s/it]


KeyboardInterrupt: 

In [None]:
from tqdm.notebook import tqdm

MODEL_VQVAE.eval()

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        quantized, indices, commitment_loss = MODEL_VQVAE(train_sample['array'].float())
        dfs.append(pd.DataFrame({
            'videos': train_sample['token'],
            'labels': indices.detach().cpu().numpy().reshape(-1),
            'frame': train_sample['frame'].detach().cpu().numpy().reshape(-1)
        }))

In [None]:
df = pd.concat(dfs)

###  3. 3D-CNN Training 

In [8]:
DATA_PATH = 'dataset/adjacency/'
data = glob.glob(DATA_PATH + '*.npy')[:10]
X_train, X_val = train_test_split(data, test_size=0.2, random_state=42)

In [9]:
train_dataset = PoseDistanceDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
    collate_fn=PoseDistanceDataset.collate_fn
)

100%|██████████| 8/8 [00:00<00:00, 84.28it/s]


In [10]:
val_dataset = PoseDistanceDataset(X_val)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
    collate_fn=PoseDistanceDataset.collate_fn
)

100%|██████████| 2/2 [00:00<00:00, 53.07it/s]


In [11]:
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = CNNEncoder(
    input_channels=3,
)

MODEL_DECODER = CNNDecoder(
    output_channels=3,
)

MODEL_QUANT = ResidualVQ(
    dim = GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    stochastic_sample_codes=True,
    num_quantizers=1,      # specify number of quantizers
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_NUM_EMBS,    # codebook size           
    kmeans_init=True,   # set to True
    kmeans_iters=10     # number of kmeans iterations to calculate the centroids for the codebook on init
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT,
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader,
    num_epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    device='cpu',
)

In [12]:
trainer.train()

  1%|          | 1/100 [00:30<51:05, 30.97s/it]


***
Epoch:1, 
Train Commitment Loss: 0.00020516358482670894, 
Train Reconstruction Loss: 0.3308721814424761, 
Train Quantization Tokens:
|       |   644 |   746 |   668 |   660 |   45 |   298 |   676 |   681 |   330 |   354 |   592 |   696 |   846 |   393 |   250 |   415 |   472 |   831 |   524 |   294 |   549 |   762 |   649 |   193 |   188 |   260 |   567 |   968 |   630 |   884 |   291 |   277 |   634 |   988 |   745 |   394 |   258 |   129 |   581 |   901 |   175 |   810 |   672 |   359 |   32 |   184 |   950 |   283 |   413 |   758 |   12 |   928 |   235 |   669 |   834 |   339 |   596 |   162 |   387 |   497 |   466 |   748 |   803 |   627 |   816 |   348 |   218 |   493 |   888 |   666 |   35 |   88 |   875 |   961 |   385 |   967 |   476 |   650 |   342 |   709 |   811 |   18 |   929 |   15 |   931 |   738 |   284 |   777 |   525 |   855 |   232 |   380 |   747 |   24 |   108 |   326 |   306 |   482 |   807 |   371 |   57 |   372 |   485 |   406 |   195 |   842 |   984 |   351

  1%|          | 1/100 [00:49<1:22:06, 49.76s/it]


KeyboardInterrupt: 

In [17]:
from tqdm.notebook import tqdm

MODEL_VQVAE.eval()

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        quantized, indices, commitment_loss = MODEL_VQVAE(train_sample['array'].float())
        # print(indices)
        # print(indices.shape)
        # print(indices.detach().cpu().numpy().reshape(-1))
        dfs.append(pd.DataFrame({
            'videos': train_sample['tokens'],
            'labels': indices.detach().cpu().numpy().reshape(-1),
            'start_idx': train_sample['start_idx'],
            'end_idx': train_sample['end_idx']
        }))

  0%|          | 0/62 [00:00<?, ?it/s]

In [18]:
df = pd.concat(dfs)

In [19]:
df

Unnamed: 0,videos,labels,start_idx,end_idx
0,C╠ğIG╠åNEMEK_0,6,17,22
1,SADE_1,6,38,43
2,C╠ğIG╠åNEMEK_0,6,97,102
3,YABANCI_1,6,23,28
4,EMEK_2,6,13,18
...,...,...,...,...
5,DEG╠åER_1,6,36,41
6,SADE_1,6,7,12
7,C╠ğIG╠åNEMEK_0,6,67,72
0,C╠ğIG╠åNEMEK_0,6,62,67


In [20]:
df.labels.value_counts()

labels
6    490
Name: count, dtype: int64

In [67]:
for rec in tqdm(df[df['labels'] == 6].to_dict(orient='records')):
    # save frame video to disk
    video = rec['videos'].split('.')[0]
    video_path = f"dataset/corpus/{video}.mp4"
    frame_idx = rec['frame']
    label = rec['labels']
    import cv2 
    import numpy as np

    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    
    import os
    if not os.path.exists(f'analyze/quantization/{label}'):
        os.mkdir(f'analyze/quantization/{label}')

    for i in range(frame_idx):
        ret, frame = cap.read()
        if i == frame_idx-1:
            cv2.imwrite(f'analyze/quantization/{label}/{video}_{frame_idx}.jpg', frame)   
            # print(f'analyze/quantization/{label}/{video}_{frame_idx}.jpg')         

  0%|          | 0/250 [00:00<?, ?it/s]