In [None]:
!pip install -r requirements.txt
!pip install opencv-python
!pip install mediapipe

## Main

In [None]:
import glob

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

from lib.config import *

import vector_quantize_pytorch as vq
from lib.encoder.vqvae import VQVAE_POSE
from lib.utils.dataset import get_dataset
from lib.train.autoencoder import AutoTrainer
from lib.data.dataset import PoseDistanceDataset, PoseDataset
from lib.encoder.cnn import CNN3dEncoder, CNN3dDecoder

In [None]:
# Build the 3D-CNN encoder / decoder pair and the residual vector quantizer, then wire
# them together into the VQ-VAE that the trainer cell below optimises.
# All hyper-parameters are read from GLOBAL_CONFIG (brought in by `from lib.config import *`).

# NOTE(review): DEVICE is computed here, but the trainer cell passes device='cpu' explicitly.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = CNN3dEncoder(
    model_name=GLOBAL_CONFIG.MODEL_ENCODER_NAME,
    conv_layers=GLOBAL_CONFIG.MODEL_ENCODER_CONVOLUTIONAL_LAYERS,
    linear_layers=GLOBAL_CONFIG.MODEL_ENCODER_LINEAR_LAYERS,
    out_channels=GLOBAL_CONFIG.MODEL_ENCODER_OUT_CHANNEL,
    input_size=GLOBAL_CONFIG.INPUT_DIM,
    output_size=GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    channel_size=GLOBAL_CONFIG.INPUT_CHANNELS,
    depth_size=GLOBAL_CONFIG.FRAME_WINDOW,
    log=False,
)

MODEL_DECODER = CNN3dDecoder(
    model_name=GLOBAL_CONFIG.MODEL_DECODER_NAME,
    linear_layers=GLOBAL_CONFIG.MODEL_DECODER_LINEAR_LAYERS,
    conv_transpose_layers=GLOBAL_CONFIG.MODEL_DECODER_CONVOLUTIONAL_LAYERS,
    in_channels=GLOBAL_CONFIG.MODEL_ENCODER_OUT_CHANNEL,
    linear_input=GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    input_size=MODEL_ENCODER.output_size, # This is for reshaping into encoder before linear layers
    output_size=(GLOBAL_CONFIG.INPUT_CHANNELS, GLOBAL_CONFIG.FRAME_WINDOW, GLOBAL_CONFIG.INPUT_DIM[3], GLOBAL_CONFIG.INPUT_DIM[4]),
    log=False
)

# The quantizer instance is named MODEL_QUANT (as in the later sections of this notebook)
# instead of VQVAE: later cells call `VQVAE(encoder=..., decoder=..., vq=...)` as a class,
# and binding an instance to that name here would shadow it.
MODEL_QUANT = vq.ResidualVQ(
    dim=GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_VOCAB,
    num_quantizers=GLOBAL_CONFIG.MODEL_VQ_CODEBOOK,
    codebook_dim=GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
)

MODEL_VQVAE = VQVAE_POSE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq_vae=MODEL_QUANT,
)

In [None]:
# Build the train / eval datasets from the '.npy' files under dataset/adjacency/.
# NOTE(review): '80-20' is presumably a train/eval percentage split and RANDOM_STATE the
# shuffling seed — confirm against lib.utils.dataset.get_dataset.
train_dataset, eval_dataset = get_dataset(
    DATASET_PATH='dataset/adjacency/',
    DATASET_EXTENSION='.npy',
    DATASET_ENCODING='utf-8',
    DATA_DISTRIBUTION='80-20',
    DATASET_CONFIG={
        'window': GLOBAL_CONFIG.FRAME_WINDOW,
        'depth': GLOBAL_CONFIG.INPUT_CHANNELS
    },
    RANDOM_STATE=42
)

In [None]:
# Root directory for experiment runs. The next cell rebinds EXPERIMENT_PATH to the
# per-run sub-directory, so re-run this cell first if that cell is executed again.
EXPERIMENT_PATH = 'experiments'

In [None]:
import os
from datetime import datetime

# Create a fresh, timestamped run directory with `model/` and `logs/` sub-directories.
# strftime is used instead of str(datetime.now()) because the latter contains spaces and
# ':' characters, which are awkward in shells and invalid in directory names on Windows.
EXPERIMENT_NAME = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# os.makedirs also creates the experiment root (and the run directory) when missing,
# which os.mkdir did not. exist_ok is left at False so that accidentally reusing a run
# directory fails loudly instead of mixing two runs.
os.makedirs(f'{EXPERIMENT_PATH}/{EXPERIMENT_NAME}/model')
os.makedirs(f'{EXPERIMENT_PATH}/{EXPERIMENT_NAME}/logs')

# From here on EXPERIMENT_PATH points at the run directory, not the root.
# NOTE: running this cell twice without re-running the cell that sets the root nests
# a second run directory inside the first.
EXPERIMENT_PATH = EXPERIMENT_PATH + '/' + EXPERIMENT_NAME

In [None]:
# Configure the trainer for the VQ-VAE built above and run training.
# Optimisation hyper-parameters come from GLOBAL_CONFIG; the model checkpoint and the
# JSON log are written under the run directory held in EXPERIMENT_PATH.
# NOTE(review): device is hard-coded to 'cpu' although DEVICE was computed earlier —
# confirm whether CPU training is intentional.
trainer = AutoTrainer(
    model=MODEL_VQVAE,
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,
    batch_size=GLOBAL_CONFIG.BATCH_SIZE,
    epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    step_size=GLOBAL_CONFIG.STEP_SIZE,
    gamma=GLOBAL_CONFIG.GAMMA,
    device='cpu',
    start_epoch=0,
    num_codebooks=GLOBAL_CONFIG.MODEL_VQ_CODEBOOK,
    model_path=f'{EXPERIMENT_PATH}/model/model-0.pt',
    log_dir=f'{EXPERIMENT_PATH}/logs/logs.json'
)

trainer.train()

In [None]:
# Run the trained model over the evaluation set and dump the resulting quantization.
# NOTE(review): the star import hides which names this cell needs; get_quantization and
# dump_quantization (and presumably get_pose_estimation, used further down) are expected
# to come from lib.utils.infer — confirm and import them explicitly.
from lib.utils.infer import *

df = get_quantization(MODEL_VQVAE, eval_dataset)

dump_quantization(
    df, 
    num_quantizers=GLOBAL_CONFIG.MODEL_VQ_CODEBOOK, 
    video_path='dataset/corpus', 
    quantization_path='analyze/quantization'
)

### 1. Pose Inference

In [None]:
# Run pose estimation on a single corpus video to inspect the output format.
# NOTE(review): get_pose_estimation is not imported by name in this notebook; it is
# presumably provided by `from lib.utils.infer import *` — confirm.
SAMPLE = 'dataset/corpus/ABARTMAK_0.mp4'
SAMPLE_POSE = get_pose_estimation(SAMPLE)

In [None]:
import pandas as pd

def get_pose_array(SAMPLE_POSE):
    """Flatten per-frame landmark records into one wide DataFrame.

    Args:
        SAMPLE_POSE: mapping with the keys 'pose', 'right' and 'left'. Each value is
            passed to ``pd.DataFrame``; every cell of the resulting frame must be
            indexable with 0, 1 and 2 (the x, y and z components).

    Returns:
        pd.DataFrame with, for every landmark column ``<name>`` of each source, the
        columns ``<PREFIX>_<name>_X``, ``_Y`` and ``_Z``, where PREFIX is POSE, RIGHT
        or LEFT. Sources appear in that order; within a source the landmark order of
        the input is kept.
    """
    sources = (
        ('POSE_', pd.DataFrame(SAMPLE_POSE['pose'])),
        ('RIGHT_', pd.DataFrame(SAMPLE_POSE['right'])),
        ('LEFT_', pd.DataFrame(SAMPLE_POSE['left'])),
    )

    flat_columns = {}
    for prefix, raw in sources:
        for landmark in raw.columns:
            # Only components 0..2 are exported; any further component is ignored.
            for axis, suffix in enumerate(('_X', '_Y', '_Z')):
                flat_columns[prefix + landmark + suffix] = raw[landmark].apply(
                    lambda point, axis=axis: point[axis]
                )

    return pd.DataFrame(flat_columns)

In [None]:
# Flatten the sample's landmarks; element 0 of the pose-estimation result is the mapping
# that get_pose_array expects (the batch cell below unpacks it as `pose, _`).
POSE_DF = get_pose_array(SAMPLE_POSE[0])

In [None]:
# Display the flattened column names (<PREFIX>_<landmark>_<X|Y|Z>).
POSE_DF.columns.to_list()

In [None]:
# Replace NaN coordinates with 0. Note this rebinds POSE_DF, so the un-filled frame is
# no longer available after this cell.
POSE_DF = POSE_DF.replace(np.nan,0)

In [None]:
def get_matrices(POSE_DF):
    """Converts the pose data into a numpy array of pairwise difference matrices.

    For every frame (row) except the first and for each axis, builds the K x K matrix
    ``D[i, j] = value[j] - value[i]`` over the K columns whose names end in '_X', '_Y'
    or '_Z' respectively.

    Args:
        POSE_DF: DataFrame with one row per frame and coordinate columns suffixed
            '_X', '_Y' and '_Z' (the same number of each).

    Returns:
        np.ndarray of shape (n_frames - 1, K, K, 3); the last axis holds the x, y and
        z differences. Output index f corresponds to input row f + 1.

    Raises:
        ValueError: if POSE_DF has fewer than two rows, or if the numbers of '_X',
            '_Y' and '_Z' columns differ.
    """
    if POSE_DF.shape[0] < 2:
        # The first row is always skipped, so at least two rows are needed.
        raise ValueError(
            'get_matrices needs at least 2 frames, got %d' % POSE_DF.shape[0]
        )

    def pairwise_differences(suffix):
        cols = [col for col in POSE_DF.columns if col.endswith(suffix)]
        # Row 0 is skipped to preserve the original frame indexing.
        # NOTE(review): confirm that dropping the first frame is intentional.
        values = POSE_DF[cols].to_numpy()[1:]
        # Broadcasting yields [f, i, j] = values[f, j] - values[f, i] for all frames at
        # once, the same as the per-frame meshgrid(row, row) difference.
        return values[:, np.newaxis, :] - values[:, :, np.newaxis]

    return np.stack(
        [pairwise_differences('_X'), pairwise_differences('_Y'), pairwise_differences('_Z')],
        axis=-1,
    )

In [None]:
# Convert every saved pose array under POSE_PATH into its pairwise-difference tensor and
# store it under OUT_PATH with the same file name. Files that already exist are skipped,
# so the cell can be re-run to resume an interrupted export.
# NOTE: this cell reads the files written by the pose-extraction cell further down and
# relies on POSE_DF (for the column names) and get_matrices from the cells above.
OUT_PATH = 'dataset/adjacency'
POSE_PATH = 'dataset/pose'

# np.save does not create missing directories.
os.makedirs(OUT_PATH, exist_ok=True)

for file in tqdm(glob.glob(os.path.join(POSE_PATH, '*.npy'))):
    # The '.mp4' -> '.npy' replacement is kept for name compatibility; it is a no-op for
    # the plain '<name>.npy' inputs matched by the glob.
    out_file = os.path.join(OUT_PATH, os.path.basename(file).replace('.mp4', '.npy'))
    if os.path.exists(out_file):
        continue
    with open(file, 'rb') as f:
        # SECURITY: allow_pickle=True executes pickled code on load; only use it on
        # pose files produced by this project.
        array = np.load(f, allow_pickle=True)
        # Replaces NaN with 0 for floating-point arrays.
        array = np.nan_to_num(array)
    pose_df = pd.DataFrame(array, columns=POSE_DF.columns)
    # Kept in addition to nan_to_num, which leaves non-floating (e.g. object) arrays untouched.
    pose_df = pose_df.replace(np.nan,0)
    MATRICES = get_matrices(pose_df)
    np.save(out_file, MATRICES)


In [None]:
# Print one frame's X-axis pairwise-difference matrix as a markdown table.
# x_diff is never defined at notebook level, so it is rebuilt here instead of relying on
# leftover kernel state.
# NOTE(review): row 1 (the first frame get_matrices processes) is used as the example —
# change the index if a different frame was intended.
x_cols = [col for col in POSE_DF.columns if col.endswith('_X')]
x_row = POSE_DF[x_cols].iloc[1].to_numpy()
# x_diff[i, j] = x_row[j] - x_row[i], the same orientation as in get_matrices.
x_diff = x_row[np.newaxis, :] - x_row[:, np.newaxis]
print(pd.DataFrame(x_diff, columns=x_cols, index=x_cols).to_markdown())

In [None]:
import glob
import numpy as np
from tqdm.notebook import tqdm
import warnings

# Run pose estimation on every corpus video and store the flattened landmark table as
# '<video name>.npy' under ARRAY_PATH. Warnings raised while processing are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ARRAY_PATH = 'dataset/pose/'
    # open(..., 'wb') does not create missing directories.
    os.makedirs(ARRAY_PATH, exist_ok=True)
    for datapath in tqdm(glob.glob('dataset/corpus/*.mp4')):
        print(datapath)
        # The second element of the result is not used here.
        pose, _ = get_pose_estimation(datapath)
        pose_array = get_pose_array(pose)
        print(pose_array.shape, datapath)
        # os.path.basename (instead of split('/')) also handles the backslash-separated
        # paths that glob returns on Windows.
        dname = os.path.basename(datapath).replace('.mp4', '.npy')
        with open(os.path.join(ARRAY_PATH, dname), 'wb') as f:
            np.save(f, pose_array)

### 2. Graph Autoencoder Training

In [None]:
# Split the saved pose arrays 80/20 into training and validation files.
DATA_PATH = 'dataset/pose/'
# glob returns files in arbitrary (filesystem-dependent) order; sorting first makes the
# seeded split identical across machines and runs.
data = sorted(glob.glob(DATA_PATH + '*.npy'))
X_train, X_val = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Training set over the pose files, served in shuffled batches of GLOBAL_CONFIG.BATCH_SIZE.
train_dataset = PoseDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
)

In [None]:
# Validation set over the held-out pose files.
# NOTE(review): shuffle=True is also used for validation — confirm it is intended.
val_dataset = PoseDataset(X_val)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
)

In [None]:
# Display whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

In [None]:
# Build a feed-forward encoder / decoder VQ-VAE over the pose arrays and its trainer.
# NOTE(review): FFNEncoder, FFNDecoder, ResidualVQ, VQVAE and AutoencoderTrainer are not
# imported by name anywhere in the visible notebook; presumably they are expected from
# one of the star imports — confirm and import them explicitly.
# NOTE(review): DEVICE is computed but the trainer is given device='cpu'.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = FFNEncoder(
    input_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_ENCODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_OUTPUT_DIM,
)

# The decoder's output width equals the encoder's input width (reconstruction target).
MODEL_DECODER = FFNDecoder(
    input_dim=GLOBAL_CONFIG.MODEL_DECODER_INPUT_DIM,
    hidden_dim=GLOBAL_CONFIG.MODEL_DECODER_HIDDEN_DIM,
    output_dim=GLOBAL_CONFIG.MODEL_ENCODER_INPUT_DIM,
)

MODEL_QUANT = ResidualVQ(
    dim = GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    stochastic_sample_codes=True,
    num_quantizers=1,      # a single quantizer (one codebook)
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_NUM_EMBS,    # number of entries in the codebook
    kmeans_init=True,   # initialise the codebook with k-means
    kmeans_iters=100     # number of kmeans iterations to calculate the centroids for the codebook on init
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT,
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader,
    num_epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    device='cpu',
)

In [None]:
# Train the feed-forward VQ-VAE configured in the previous cell.
trainer.train()

In [None]:
from tqdm.notebook import tqdm

# Run the trained model over the training batches and record, per sample, the source
# video token, the assigned code index and the frame number.
MODEL_VQVAE.eval()

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        quantized, indices, commitment_loss = MODEL_VQVAE(train_sample['array'].float())
        # Flatten to one entry per sample so both columns line up with the video tokens.
        labels = indices.detach().cpu().numpy().reshape(-1)
        frames = train_sample['frame'].detach().cpu().numpy().reshape(-1)
        batch_df = pd.DataFrame({
            'videos': train_sample['token'],
            'labels': labels,
            'frame': frames,
        })
        dfs.append(batch_df)

In [None]:
# Merge the per-batch tables into one frame (per-batch row indices are kept, so index
# values repeat).
df = pd.concat(dfs)

In [None]:
import cv2
# Save one JPEG per record for up to 100 samples that were assigned code 375, grouped in
# a folder named after the code, for visual inspection of what the code captures.
for rec in tqdm(df[df['labels'] == 375].to_dict(orient='records')[:100]):
    video = rec['videos'].split('.')[0]
    video_path = f"dataset/corpus/{video}.mp4"
    frame_idx = rec['frame']
    label = rec['labels']

    # makedirs also creates 'analyze/quantization' itself when it is missing.
    os.makedirs(f'analyze/quantization/{label}', exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        # One frame is read up front and then frame_idx more; the last one read is saved.
        ret, frame = cap.read()
        for i in range(frame_idx):
            ret, frame = cap.read()
            if not ret:
                # Video ended (or failed to decode) before frame_idx was reached.
                break
    finally:
        # Release the decoder handle for every record, even on errors.
        cap.release()

    # As before, nothing is written for frame_idx == 0; unreadable frames are skipped
    # instead of passing None to cv2.imwrite.
    if frame_idx > 0 and ret:
        cv2.imwrite(f'analyze/quantization/{label}/{video}_{frame_idx}.jpg', frame)

### 3. 3D-CNN Training

In [None]:
# Split the first 100 adjacency files 80/20 into training and validation files.
DATA_PATH = 'dataset/adjacency/'
# glob returns files in arbitrary (filesystem-dependent) order; sorting first makes both
# the 100-file subset and the seeded split identical across machines and runs.
data = sorted(glob.glob(DATA_PATH + '*.npy'))[:100]
X_train, X_val = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Training set over the adjacency files; batches are assembled by the dataset's own
# collate_fn and shuffled.
train_dataset = PoseDistanceDataset(X_train)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
    collate_fn=PoseDistanceDataset.collate_fn
)

In [None]:
# Validation set over the held-out adjacency files, using the same collate_fn.
# NOTE(review): shuffle=True is also used for validation — confirm it is intended.
val_dataset = PoseDistanceDataset(X_val)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=GLOBAL_CONFIG.BATCH_SIZE, 
    shuffle=True,
    collate_fn=PoseDistanceDataset.collate_fn
)

In [None]:
# Build a CNN encoder / decoder VQ-VAE over the 3-channel adjacency tensors and its trainer.
# NOTE(review): CNNEncoder, CNNDecoder, ResidualVQ, VQVAE and AutoencoderTrainer are not
# imported by name anywhere in the visible notebook; presumably they are expected from
# one of the star imports — confirm and import them explicitly.
# NOTE(review): DEVICE is computed but the trainer is given device='cpu'.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL_ENCODER = CNNEncoder(
    input_channels=3,
)

MODEL_DECODER = CNNDecoder(
    output_channels=3,
)

MODEL_QUANT = ResidualVQ(
    dim = GLOBAL_CONFIG.MODEL_VQ_EMBED_DIM,
    stochastic_sample_codes=True,
    num_quantizers=1,      # a single quantizer (one codebook)
    codebook_size=GLOBAL_CONFIG.MODEL_VQ_NUM_EMBS,    # number of entries in the codebook
    kmeans_init=True,   # initialise the codebook with k-means
    kmeans_iters=10     # number of kmeans iterations to calculate the centroids for the codebook on init
)

MODEL_VQVAE = VQVAE(
    encoder=MODEL_ENCODER,
    decoder=MODEL_DECODER,
    vq=MODEL_QUANT,
)

trainer = AutoencoderTrainer(
    model=MODEL_VQVAE,
    learning_rate=GLOBAL_CONFIG.LEARNING_RATE,
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader,
    num_epochs=GLOBAL_CONFIG.NUM_EPOCHS,
    device='cpu',
)

In [None]:
# Train the CNN VQ-VAE configured in the previous cell.
trainer.train()

In [None]:
from tqdm.notebook import tqdm

# Run the trained model over the training windows and record, per window, the video
# token, the start / end frame indices and one `Code_<k>` column per column of `indices`.
MODEL_VQVAE.eval()

# Rebuild the loader with a fixed batch size of 10 for inference.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=10,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)

dfs = []
for train_sample in tqdm(train_dataloader):
    with torch.no_grad():
        quantized, indices, commitment_loss = MODEL_VQVAE(train_sample['array'].float())

        quant = dict(
            videos=train_sample['tokens'],
            start_idx=train_sample['start_idx'],
            end_idx=train_sample['end_idx'],
        )
        # Column k of `indices` becomes the 'Code_k' column of the table.
        quant.update({
            f'Code_{index}': indices[:, index].cpu().numpy()
            for index in range(indices.shape[1])
        })

        dfs.append(pd.DataFrame(quant))

In [None]:
# Merge the per-batch tables into one frame (per-batch row indices are kept, so index
# values repeat).
df = pd.concat(dfs)

In [None]:
import json
# Load the JSON log whose file name carries index LOG_IDX. The same index is used below
# as the key into the validation entries of the log.
LOG_IDX=45
with open(f'analyze/quantization/experimental_logs/logs-{LOG_IDX}.json', 'r') as f:
    corpus = json.load(f)

In [None]:
import numpy as np

# Print the summed commitment loss for every logged entry that has values.
# The log is keyed by the string form of consecutive integers starting at '0'.
cls = corpus['train']['commit-loss']
for i in range(len(cls)):
    entry_losses = cls[str(i)]
    if not entry_losses:
        # Entries without recorded losses are skipped.
        continue
    print(i, np.sum(entry_losses))

In [None]:
# Collect the validation fields logged under key LOG_IDX into a flat dict of columns.
# 'quantization' is itself a mapping of code-column name -> values, so its entries are
# merged in as individual columns; the other fields become one column each.
val = corpus['validation']
df = {}
for field in ['vocab', 'start_idx', 'end_idx', 'quantization']:
    print(field)
    field_values = val[field][f'{LOG_IDX}']
    print(field_values)
    if field != 'quantization':
        df[field] = field_values
        continue
    df.update(field_values)

In [None]:
import pandas as pd
# Turn the collected columns into a DataFrame and cast both index columns to int so
# they can be used as frame numbers.
df = pd.DataFrame(df).astype({'start_idx': int, 'end_idx': int})

In [None]:
# Show how often each Code_0 value occurs.
df.Code_0.value_counts()

In [None]:
# Show the ten most frequent Code_1 values.
df.Code_1.value_counts().head(10)

In [None]:
import cv2
import pandas as pd
from tqdm.notebook import tqdm
from moviepy.editor import VideoFileClip

import os

# Export every window whose CODEBOOK column equals CODE_ID as a GIF, grouped in a folder
# named '<Code_1>-<Code_0>'. Each clip is first written as a temporary .avi, converted
# with moviepy, and the .avi is removed afterwards.
CODEBOOK = 'Code_1'
CODE_ID = 606
CLIP_FPS = 15  # frame rate of the exported clips

for rec in tqdm(df[df[CODEBOOK] == CODE_ID].to_dict(orient='records')):
    video = rec['vocab']
    video_path = f"dataset/corpus/{video}.mp4"
    start_idx = rec['start_idx']
    end_idx = rec['end_idx']
    # NOTE(review): the folder label always uses Code_1 / Code_0, independent of CODEBOOK.
    label = str(rec['Code_1']) + '-' + str(rec['Code_0'])

    # makedirs also creates 'analyze/quantization' itself when it is missing.
    os.makedirs(f'analyze/quantization/{label}', exist_ok=True)
    clip_stem = f'analyze/quantization/{label}/{video}_{start_idx}_{end_idx}'

    FRAMES = []
    cap = cv2.VideoCapture(video_path)
    try:
        # One frame is read up front and discarded; reads start_idx .. end_idx - 1 of the
        # following loop are kept (same window as before).
        ret, frame = cap.read()
        for i in range(end_idx):
            ret, frame = cap.read()
            if not ret:
                # Video ended early: keep what was collected, never append None.
                break
            if i >= start_idx:
                FRAMES.append(frame)
    finally:
        # Release the decoder handle for every record, even on errors.
        cap.release()

    if not FRAMES:
        # Nothing decodable in the requested window; skip rather than write an empty clip.
        continue

    # write frames to video; the size comes from a frame that is known to be valid
    height, width = FRAMES[0].shape[:2]
    out = cv2.VideoWriter(f'{clip_stem}.avi', cv2.VideoWriter_fourcc(*'DIVX'), CLIP_FPS, (width, height))
    try:
        for frame in FRAMES:
            out.write(frame)
    finally:
        out.release()

    videoClip = VideoFileClip(f"{clip_stem}.avi")
    try:
        videoClip.write_gif(f"{clip_stem}.gif")
    finally:
        # Close the reader so the temporary .avi is no longer held open when removed.
        videoClip.close()

    os.remove(f"{clip_stem}.avi")