# 2D to 3D LSTM

This is the first approach to try to estimate 3D points coordinates from 2D keypoints extracted with Openpose. Here I will build a simple LSTM to perform the task over the Panoptic Studio dataset.

First, let's import all the necessary libraries.

In [1]:
# Pytorch utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from ignite.metrics import Accuracy

from scipy.spatial.transform import Rotation as R
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Plotting utilities
%matplotlib widget
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import HTML
import matplotlib.animation as animation
from torch.utils.tensorboard import SummaryWriter
from timeit import default_timer as timer
import pyprind

# Directory and file utilities
from os import listdir
from os.path import isfile, isdir, join
import json

## Notebook parameters

In [2]:
modes = { 0:'one video', 1:'one signer', 2:'all signers'}
MODE =  modes[0]
video = 3

## Data acquisition
Now I will define some functions in order to parse and organise the data, and later convert it to pytorch tensors.

The data is structured as follows: in the dataset directory there are several folders, each folder corresponds to a recording; each of these folders contains a folder with the audio, folders with face, hands and body keypoints estimations for each frame, and a folder with the video recorded from different views.

In this first approach I will be using the keypoints estimations. Every keypoint folder (face, hands or body) is organized the same way: it contains a json per frame of the video, which includes the 3D keypoints estimation.

The function `get_keypoints` will go through each folder in the dataset directory and retrieve the face keypoints, the hands keypoints and the body keypoints. It will separate them into input (2D coordinates per joint per frame) and grountruth (third coordinate to estimate for each input 2D keypoint). 
The input will be of shape $([n videos, seq len, input size])$, where *seq_len* = number of frames, and *input_size* = face + hands + body keypoints, that is (70+(21+21)+26)x2 -multiplied by 2 because there are x and y coordinates-. The groundtruth (label) data will be of the same shape, except that the last dimension size will not be multiplied by 2 (there's only one coordinate to estimate).  

In [3]:
def get_keypoints(data_path):
    dataset = []
    groundtruth = []
    # Look over just the folders inside the directory
    just_folders = filter(lambda x: isdir(join(data_path, x)), listdir(data_path))
    for p in list(map(lambda x: join(data_path, x), just_folders)): 
        # Gets 2 list of n_frames lists, one for the 2D coordinates and one for the third coordinate.
        # Each list of the n_frames lists contains, either the (x and y) or the z of each keypoint for the face(first line), hands(second), body(third).
        # e.g. the first line will result in [[x1,y1,x2,y2...x70,y70]sub1...[x1,y1...x70,y70]subN], [[z1,z2...z70]sub1...[z1..z70]subN]
        # Actually, as there will be two of each list above because there are two people en each video.
        face_2d, face_3d = get_face(p)
        hands_2d, hands_3d = get_hands(p)
        pose_2d, pose_3d = get_body(p)
        
        # Concatenates the coordinates for the face, hands and body on the last dimension, for each person.
        vid_input_p1, vid_input_p2 = ([fa+ha+po for fa, ha, po in zip(face_2d[i], hands_2d[i], pose_2d[i])] for i in range(2))
        vid_labels_p1, vid_labels_p2 = ([fa+ha+po for fa, ha, po in zip(face_3d[i], hands_3d[i], pose_3d[i])] for i in range(2))
        
        dataset.append(vid_input_p1)
        dataset.append(vid_input_p2)
        groundtruth.append(vid_labels_p1)
        groundtruth.append(vid_labels_p2)
        print(f'Completed folder {p}')
    return dataset, groundtruth

The following functions are in charge of retrieving the keypoints from each json. The json face json has a key *people* with a list of person objects. Each person object has *id* field and *landmarks* field, the latter containing a list of 3D coordinates for each keypoint.

In [4]:
def get_face(path):
    face_2D_seq = ([], [])
    face_3D_seq = ([], [])
    # List only the files (json), for there might be folders containing invalid frames.
    paths = map(lambda x: join(path, 'hdFace3d', x), sorted(listdir(join(path, 'hdFace3d'))))
    files = list(filter(lambda x: isfile(x), paths))
    for f in files: # The first frame of face keypoints estimation it's blank
        with open(f, 'r') as j:
            json_array = json.load(j)
            i = 0
            for person in json_array['people']:
                if person['id'] != -1: # If the id is -1, it means there's no person
                    x = person['face70']['landmarks'][::3]
                    y = person['face70']['landmarks'][1::3]
                    two_coord = [l[item] for item in range(len(x)) for l in [x,y]]
                    third_coord = person['face70']['landmarks'][2::3]
                    face_2D_seq[person['id']].append(two_coord)
                    face_3D_seq[person['id']].append(third_coord)
                    i+=1
                    pid = person['id']
            if i<2: # In case there was only one person detected on a frame
                face_2D_seq[1-pid].append(face_2D_seq[1-pid][-1])
                face_3D_seq[1-pid].append(face_3D_seq[1-pid][-1])
    print('Face completed.')
    # Each return var being a tuple with the list of n_frames list of coordinates for each person
    return face_2D_seq, face_3D_seq

The hands json contains the *landmarks* field inside both *left_hand* and *right_hand* field. As there are some frames that may not have one of the hands estimated, I've had to put some exception handling. 

In [5]:
def get_hands(path):
    hand_2D_seq = ([], [])
    hand_3D_seq = ([], [])
    paths = map(lambda x: join(path, 'hdHand3d', x), sorted(listdir(join(path, 'hdHand3d'))))
    files = list(filter(lambda x: isfile(x), paths))
    for f in files[1:-1]: # The first and the last frames of these folders are blank.
        with open(f, 'r') as j:
            json_array = json.load(j)
            i = 0
            for person in json_array['people']:
                if person['id'] != -1:
                    try:
                        # Separate x,y from z
                        hands= [[person[hand]['landmarks'][c] 
                                 for c in range(len(person['right_hand']['landmarks'])) if (c+1)%3!=0] 
                                 for hand in ['left_hand', 'right_hand']]
                        hand_2D_seq[person['id']].append(hands[0]+hands[1])
                        
                        hands_3d = [person[hand]['landmarks'][2::3] 
                                    for hand in ['left_hand', 'right_hand']]
                        hand_3D_seq[person['id']].append(hands_3d[0]+hands_3d[1])
                    
                    except Exception as e: # In case left_hand or right_hand keys don't exist.
                        if 'left_hand' in str(e): 
                            try: # Just put a 0., 0., 0. estimation for each keypoint of the left_hand
                                hands = hand_2D_seq[1-person['id']][-1][:42]+[person['right_hand']['landmarks'][c] 
                                                                 for c in range(len(person['right_hand']['landmarks'])) if (c+1)%3!=0]
                                hands_3d = hand_3D_seq[1-person['id']][-1][:21]+person['right_hand']['landmarks'][2::3]
                            except: # In case neither left_hand nor right_hand exist
                                hands = hand_2D_seq[1-person['id']][-1]
                                hands_3d = hand_3D_seq[1-person['id']][-1]
                        elif 'right_hand' in str(e): # Just put a 0., 0., 0. estimation for each keypoint of the right_hand
                            hands = [person['left_hand']['landmarks'][c] 
                                     for c in range(len(person['left_hand']['landmarks'])) if (c+1)%3!=0]+hand_2D_seq[1-person['id']][-1][42:]
                            hands_3d = person['left_hand']['landmarks'][2::3]+hand_3D_seq[1-person['id']][-1][21:]

                        hand_2D_seq[person['id']].append(hands)
                        hand_3D_seq[person['id']].append(hands_3d)
                    i+=1
                    pid = person['id']
            if i<2:
                hand_2D_seq[1-pid].append(hand_2D_seq[1-pid][-1])
                hand_3D_seq[1-pid].append(hand_3D_seq[1-pid][-1])
    print('Hands completed.')
    return hand_2D_seq, hand_3D_seq

The body json is organised a bit differently, inside each person object contains the *joints26* field with a list of 3D coordinates. But this list is structured as follows: *[x1,y1,z1,acc1,x2,y2,z2,acc2...]*.

In [6]:
def get_body(path):
    body_2D_seq = ([], [])
    body_3D_seq = ([], [])
    paths = map(lambda x: join(path, 'hdPose3d_stage1_op25', x), sorted(listdir(join(path, 'hdPose3d_stage1_op25'))))
    files = list(filter(lambda x: isfile(x), paths))
    for f in files[:-1]:
        with open(f, 'r') as j:
            json_array = json.load(j)
            i = 0
            for person in json_array['bodies']:
                if person['id'] != -1:
                    x = person['joints26'][::4]
                    y = person['joints26'][1::4]
                    two_coord = [l[item] for item in range(len(x)) for l in [x,y]]
                    third_coord = person['joints26'][2::4]
                    body_2D_seq[person['id']].append(two_coord)
                    body_3D_seq[person['id']].append(third_coord)
                    i += 1
                    pid = person['id']
            if i<2:
                body_2D_seq[1-pid].append(body_2D_seq[1-pid][-1])
                body_3D_seq[1-pid].append(body_3D_seq[1-pid][-1])
    print('Body completed.')
    return body_2D_seq, body_3D_seq

In [6]:
data_path = '../../../data/DB keypoints'
dataset, groundtruth = get_keypoints(data_path)

Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl2
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl4
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl5
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl1
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl2
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl3
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl5
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl7
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl9
Face completed.
Hands completed.
Body completed.
Comple

Add padding to shorter sequences and convert them to numpy arrays.

In [7]:
def padding_seq(dataset):
    max_seq = max([len(x) for x in dataset])
    seq_lengths = []
    for seq in dataset:
        seq_lengths.append(len(seq))
        for i in range(max_seq-len(seq)):
            seq.append([np.nan for j in range(len(seq[0]))])
        
    return max_seq, seq_lengths

max_seq, seq_lengths = padding_seq(dataset)
_, _ = padding_seq(groundtruth)

In [8]:
# From python lists to numpy ndarray.
dataset = np.asarray(dataset)
groundtruth = np.asarray(groundtruth)
lengths = np.asarray(seq_lengths)
print(dataset.shape, groundtruth.shape, lengths.shape)

(36, 8751, 276) (36, 8751, 138) (36,)


Once they are numpy ndarrays I save the keypoints into pickle files for faster loading in later executions

In [10]:
np.save('../../pickles/dataset.npy', dataset)
np.save('../../pickles/groundtruth.npy', groundtruth)

In [12]:
np.save('../../pickles/lengths.npy', lengths)

## Dataset structuring
Now let's convert the lists obtained to Pytorch tensors and organise them in train, validation and test datasets. 
First, I will define a padding function in order to make all the sequences of video frames the same length, so I can train the LSTM in batches. 

#### Load from pickle
Load keypoints from pre-saved pickle files instead of directly reading the jsons, can be found in below cells

In [3]:
dataset = np.load('../../pickles/body_data.npy', allow_pickle=True)
groundtruth = np.load('../../pickles/body_ground.npy', allow_pickle=True)
lengths = np.load('../../pickles/lengths.npy', allow_pickle=True)
max_seq = dataset.shape[1]
print(max_seq)

8752


In [4]:
print(groundtruth[2,3])

[-113.151  -110.343  -109.332   -97.1843  -90.6559  -80.3361  -99.0535
 -101.455  -104.875  -130.709  -136.555  -115.679  -119.611  -123.231
 -129.412  -107.816  -105.321  -114.424  -121.72    -98.6385  -95.802
 -107.165  -124.468  -129.853  -128.215  -113.049 ]


In [5]:
if MODE == 'one video':
    dataset, groundtruth = dataset[video], groundtruth[video]
    chunks_d = np.split(dataset, 547, axis=0)[:lengths[video]//16]
    chunks_g = np.split(groundtruth, 547, axis=0)[:lengths[video]//16]
    chunks_d = tuple(np.expand_dims(c, axis=0) for c in chunks_d)
    chunks_g = tuple(np.expand_dims(c, axis=0) for c in chunks_g)
    
    dataset = np.concatenate(chunks_d, axis=0)
    groundtruth = np.concatenate(chunks_g, axis=0)
    lengths = np.asarray([16 for i in range(lengths[video]//16)])
    print(lengths.shape)

elif MODE == 'one signer':
    dataset, groundtruth, lengths = dataset[20::2, :-2], groundtruth[20::2, :-2], lengths[20::2]
    print(lengths)
    lengths = np.asarray([min(8750, x) for x in lengths])
    print(lengths)
    chunks_d, chunks_g = np.split(dataset, 35, axis=1), np.split(groundtruth, 35, axis=1)
    
    dataset, groundtruth = np.concatenate(chunks_d, axis=0), np.concatenate(chunks_g, axis=0)
    lengths = np.concatenate(tuple([250 if j*250<=lengths[i] 
                                    else (lengths[i]%250 if (j-1)*250<lengths[i] 
                                          else 0) for i in range(12)] for j in range(1,36)), axis=0)
    print(dataset.shape, groundtruth.shape, lengths.shape)
    
elif MODE == 'all signers':
    dataset, groundtruth = dataset[:, :-2], groundtruth[:, :-2]
    lengths = np.asarray([min(8750, x) for x in lengths])
    chunks_d, chunks_g = np.split(dataset, 25, axis=1), np.split(groundtruth, 25, axis=1)
    
    dataset, groundtruth = np.concatenate(chunks_d, axis=0), np.concatenate(chunks_g, axis=0)
    lengths = np.concatenate(tuple([350 if j*350<=lengths[i]
                                    else (lengths[i]%350 if (j-1)*350<lengths[i] 
                                          else 0) for i in range(44)] for j in range(1,26)), axis=0)
    print(dataset.shape, groundtruth.shape, lengths.shape)

(517,)


For each axis I normalize the keypoints using the following formula:

In [6]:
# Clean all NaN videos
dataset = np.delete(dataset, np.where(lengths==0), axis=0)
groundtruth = np.delete(groundtruth, np.where(lengths==0), axis=0)
lengths = np.delete(lengths, np.where(lengths==0), axis=0)

print(dataset.shape, groundtruth.shape, lengths.shape)

(517, 16, 52) (517, 16, 26) (517,)


In [7]:
def align(tensor, coordinates=1):
    for n_vid in range(tensor.shape[0]):
        max_value = [np.nanmax(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        min_value = [np.nanmin(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        center = [(max_value[i]+min_value[i])/2 for i in range(coordinates)]
        for j in range(coordinates):
            subtensor = tensor[n_vid, :, j::coordinates]
            subtensor[:] = np.subtract(subtensor, center[j])

align(dataset,2)
align(groundtruth)
print(dataset.shape, groundtruth.shape, lengths.shape)

(517, 16, 52) (517, 16, 26) (517,)


In [8]:
r = R.from_euler('y', 110, degrees=True)
shapes = dataset.shape
dataset = dataset.reshape(-1, 2)
groundtruth = groundtruth.reshape(-1,1)
xyz = np.concatenate((dataset, groundtruth), axis=1)
xyz = r.apply(xyz)
dataset, groundtruth = xyz[:, :2].reshape(shapes), xyz[:,2].reshape(shapes[0], shapes[1], 26)

print(dataset.shape, groundtruth.shape)

(517, 16, 52) (517, 16, 26)


In [9]:
def norm_uniform(tensor, coordinates=1, factor=None):
    scale = []
    mean_ranges = []
    for n_vid in range(tensor.shape[0]):
        coord_scale = []
        max_value = [np.nanmax(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        min_value = [np.nanmin(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        center = [(max_value[i]+min_value[i])/2 for i in range(coordinates)]
        ranges = np.ndarray((tensor.shape[1],coordinates))
        for n_frame in range(tensor.shape[1]):
            rang = [np.nanmax(tensor[n_vid, n_frame,i::coordinates])-np.nanmin(tensor[n_vid, n_frame,i::coordinates]) for i in range(coordinates)]
            ranges[n_frame] = np.asarray(rang)
        mean_range = [np.nanmean(ranges[:,i]) for i in range(coordinates)]
        for j in range(coordinates):
            subtensor = tensor[n_vid, :, j::coordinates]
            subtensor[:] = np.subtract(subtensor, center[j])
            if factor is not None:
                subtensor[:] = np.divide(subtensor, factor[n_vid])
            else:
                subtensor[:] = np.divide(subtensor, max_value[j]-center[j])
            coord_scale.append((max_value[j]-center[j] if factor is None else factor[n_vid]))
        scale.append(coord_scale)
        mean_ranges.append(mean_range)
    return mean_ranges
input_scale = np.asarray(norm_uniform(dataset,2)).squeeze()
print(input_scale.shape)

(517, 2)


In [10]:
def normalize(tensor, coordinates=1, std=None):
    moments = []
    std_centroids = []
    for n_vid in range(tensor.shape[0]):
        coord_moments = []
        mean_value = [np.nanmean(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        std_value = [np.nanstd(tensor[n_vid, :,i::coordinates]) for i in range(coordinates)]
        centroids = np.ndarray((tensor.shape[1],coordinates))
        for n_frame in range(tensor.shape[1]):
            centroid = [np.nanmean(tensor[n_vid, n_frame, i::coordinates]) for i in range(coordinates)]
            centroids[n_frame] = np.asarray(centroid)
        std_centroid = [np.nanstd(centroids[:,i]) for i in range(coordinates)]
        if std is not None:
            std_value = [std[n_vid]]
        for j in range(coordinates):
            subtensor = tensor[:, :, j::coordinates]
            subtensor[:] = np.subtract(subtensor, mean_value[j])
            subtensor[:] = np.divide(subtensor, std_value[j])
            coord_moments.append((mean_value[j], std_value[j]))
        moments.append(coord_moments)
        std_centroids.append(std_centroid)
    return moments, std_centroids

moments, std_centroids = normalize(dataset, 2)

In [11]:
# Randomly shuffle videos
permutation = np.random.permutation(dataset.shape[0])
dataset, groundtruth, lengths = dataset[permutation], groundtruth[permutation], lengths[permutation]
input_scale = input_scale[permutation]

print(dataset.shape, groundtruth.shape, lengths.shape)
print(input_scale.shape)

(517, 16, 52) (517, 16, 26) (517,)
(517, 2)


In [12]:
l1, l2 = len(dataset), len(groundtruth)
p1, p2 = 0.8, 0.9
# Split in train, validation and test
training_kp, val_kp, test_kp = dataset[:round(p1*l1)], dataset[round(p1*l1):round(p2*l1)], dataset[round(p2*l1):]
training_lbl, val_lbl, test_lbl = groundtruth[:round(p1*l2)], groundtruth[round(p1*l2):round(p2*l2)], groundtruth[round(p2*l2):]
training_lengths, val_lengths, test_lengths = lengths[:round(p1*l1)], lengths[round(p1*l1):round(p2*l1)], lengths[round(p2*l1):]
training_inpscale, val_inpscale, test_inpscale = input_scale[:round(p1*l1)], input_scale[round(p1*l1):round(p2*l1)], input_scale[round(p2*l1):]

print(training_kp.shape, val_kp.shape, test_kp.shape)
print(training_lbl.shape, val_lbl.shape, test_lbl.shape)
print(training_lengths.shape, val_lengths.shape, test_lengths.shape)

(414, 16, 52) (51, 16, 52) (52, 16, 52)
(414, 16, 26) (51, 16, 26) (52, 16, 26)
(414,) (51,) (52,)


In [13]:
training_outscale = np.asarray(norm_uniform(training_lbl)).squeeze()
val_outscale_t = np.asarray(norm_uniform(val_lbl)).squeeze()
test_outscale_t = np.asarray(norm_uniform(test_lbl)).squeeze()
print(training_outscale.shape, val_outscale_t.shape)

(414,) (51,)


In [14]:
print(np.nanmax(training_lbl), np.nanmin(training_lbl), np.nanmean(training_lbl))
print(training_lbl[2, 3])

1.0 -1.0 0.12527524045876265
[ 0.04352544 -0.6486944   0.125155    0.81622386  0.96912597  0.29878608
  0.57455247  0.34076996  0.32623056 -0.56789676 -0.99718322 -0.54647916
 -0.32425162 -0.25939084 -0.01751753 -0.52660378  0.02438939 -0.59617183
 -0.13520193 -0.39839417 -0.21309372  0.42104098 -0.82797127 -0.64990917
  0.21086643 -0.38008587]


In [15]:
alpha = LinearRegression(fit_intercept=True)#make_pipeline(PolynomialFeatures(9), LinearRegression(fit_intercept=True))
alpha.fit(training_inpscale, training_outscale)
val_outscale = alpha.predict(val_inpscale)
test_outscale = alpha.predict(test_inpscale)
print(val_outscale.shape, test_outscale.shape)

(51,) (52,)


In [16]:
val_outscale = np.concatenate((val_outscale[:,np.newaxis], val_outscale_t[:,np.newaxis]), axis=1)
test_outscale = np.concatenate((test_outscale[:,np.newaxis], test_outscale_t[:,np.newaxis]), axis=1)

print(val_outscale.shape)

(51, 2)


In [17]:
def convert_to_bins(tensor):
    max_z = np.nanmax(tensor)
    min_z = np.nanmin(tensor)
    z_to_bins = [min_z+i*(max_z-min_z)/21 for i in range(1, 22)]
    with np.nditer(tensor, op_flags=['readwrite']) as it:
        for x in it:
            for z in z_to_bins:
                if x < z:
                    x[...] = z_to_bins.index(z)
                    break
    return z_to_bins


In [18]:
z_to_bins_tr = convert_to_bins(training_lbl)
z_to_bins_val = convert_to_bins(val_lbl)
z_to_bins_test = convert_to_bins(test_lbl)
print(z_to_bins_tr, groundtruth.shape)
print(training_lbl[2,3])

[-0.9047619047619048, -0.8095238095238095, -0.7142857142857143, -0.6190476190476191, -0.5238095238095238, -0.4285714285714286, -0.33333333333333337, -0.23809523809523814, -0.1428571428571429, -0.04761904761904767, 0.04761904761904767, 0.1428571428571428, 0.23809523809523814, 0.33333333333333326, 0.4285714285714286, 0.5238095238095237, 0.6190476190476191, 0.7142857142857142, 0.8095238095238095, 0.9047619047619047, 1.0] (517, 16, 26)
[10.  3. 11. 19. 20. 13. 16. 14. 13.  4.  0.  4.  7.  7. 10.  4. 10.  4.
  9.  6.  8. 14.  1.  3. 12.  6.]


In [19]:
# From python lists to pytorch tensors.
training_kp, val_kp, test_kp = torch.tensor(np.nan_to_num(training_kp), dtype=torch.float32), torch.tensor(np.nan_to_num(val_kp), dtype=torch.float32), torch.tensor(np.nan_to_num(test_kp), dtype=torch.float32)
training_lbl, val_lbl, test_lbl = torch.tensor(np.nan_to_num(training_lbl), dtype=torch.long), torch.tensor(np.nan_to_num(val_lbl), dtype=torch.long), torch.tensor(np.nan_to_num(test_lbl), dtype=torch.long)
training_lengths, val_lengths, test_lengths = torch.tensor(np.nan_to_num(training_lengths), dtype=torch.float32), torch.tensor(np.nan_to_num(val_lengths), dtype=torch.float32), torch.tensor(np.nan_to_num(test_lengths), dtype=torch.float32)
training_inpscale, val_inpscale, test_inpscale = torch.tensor(np.nan_to_num(training_inpscale), dtype=torch.float32), torch.tensor(np.nan_to_num(val_inpscale), dtype=torch.float32), torch.tensor(np.nan_to_num(test_inpscale), dtype=torch.float32)
training_outscale, val_outscale, test_outscale = torch.tensor(np.nan_to_num(training_outscale), dtype=torch.float32), torch.tensor(np.nan_to_num(val_outscale), dtype=torch.float32), torch.tensor(np.nan_to_num(test_outscale), dtype=torch.float32)

print(training_kp.shape, val_lbl.shape, test_lengths.shape)
print(training_inpscale.shape, training_outscale.shape)

torch.Size([414, 16, 52]) torch.Size([51, 16, 26]) torch.Size([52])
torch.Size([414, 2]) torch.Size([414])


Finally we define the batch_size and put the datasets in DataLoaders.

In [20]:
train_data = TensorDataset(training_kp, training_lbl, training_lengths,
                          training_inpscale, training_outscale)
val_data = TensorDataset(val_kp, val_lbl, val_lengths,
                        val_inpscale, val_outscale)
test_data = TensorDataset(test_kp, test_lbl, test_lengths,
                         test_inpscale, test_outscale)

batch_size = 32

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7fd4146b3d50>


If we have a GPU available we set our device to GPU.

In [21]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


Let's print some examples to see whether it is loaded correctly or not.

In [22]:
dataiter = iter(train_loader)
sample_x, sample_y, sample_len, iscale, oscale = dataiter.next()

print(sample_x.shape, sample_y.shape, sample_len.shape)

torch.Size([32, 16, 52]) torch.Size([32, 16, 26]) torch.Size([32])


## Model building
It is time to build the model for this approach. It will consist on a single/double layer LSTM followed by a Linear layer with output size the number of keypoints we want to estimate. I also define a method to initialize the hidden_state of the cell.

In [23]:
class LSTM_2D3D(nn.Module):
    
    def __init__(self, input_size, output_size, hidden_dim, n_layers, bidirectional, bins, dropout=0.):
        super().__init__()
        # Save the model parameters
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bi = bidirectional
        self.bins = bins
        
        # Define the architecture
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*(2 if self.bi else 1), 256),
            nn.Linear(256, output_size)
        )
        self.softmax=nn.LogSoftmax(dim=2)
        
    def forward(self, x, state, lengths):
        # Describe the forward step
        batch_size, seq_len = x.size(0), x.size(1) # We save the batch size and the (maximum) sequence length
        
        # Need to pack a tensor containing padded sequences of variable length
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths=lengths, batch_first=True, enforce_sorted=False)
        ht, hidden_state = self.lstm(packed, state) # ht will be a PackedSequence

        # Need to flatten and reshape the output to feed it to the Linear layer
        ht = ht.data.contiguous() # ht will be of shape [sum(lengths), hidden_dim]
        ot = self.fc(ht) # ot will be of shape [sum(lengths), ouput_size]
        ot = ot.view(-1, self.output_size//self.bins, self.bins) #shape [sum(lengths), kp, bins]
        ot = self.softmax(ot)
        ot = torch.transpose(ot, 1, 2) # Transpose 'cause NLLLoss need the classes dimension as the second

        l_ot = [ot[:int(length)] for length in lengths] # list of batch elements, each shape [lengths[i], bins, kp]
        packed_ot = nn.utils.rnn.pack_sequence(l_ot, enforce_sorted=False) # PackedSequence
        # Finally return to shape [batch_size, seq_len, bins, kp]
        ot, _ = nn.utils.rnn.pad_packed_sequence(packed_ot, batch_first=True, total_length=seq_len)
        
        return ot, hidden_state
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers*(2 if self.bi else 1), batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers*(2 if self.bi else 1), batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [24]:
# Define some model parameters
BINS = 21
INPUT_SIZE = sample_x.size(2)
OUTPUT_SIZE = sample_y.size(2)*BINS
HIDDEN_DIM = 512
N_LAYERS = 3
BIDIRECTIONAL = False

# Instantiate the model
model = LSTM_2D3D(INPUT_SIZE, OUTPUT_SIZE, HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, BINS, dropout=0.)
model.to(device)
print(model)
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

LSTM_2D3D(
  (lstm): LSTM(52, 512, num_layers=3, batch_first=True)
  (fc): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=546, bias=True)
  )
  (softmax): LogSoftmax()
)
5633314


## Training
Now we will proceed with the training. The first cell will define the learning rate, the loss function and the selected optimizer for the training process. Then we will proceed with a training over a number of epochs in which we will print it's training loss and validation loss. I also will be using Tensorboard to have a much nicer view of the results.

In [25]:
def thresholded_output_transform(output):
    y_pred, y = output
    for i in range(y_pred.shape[2]):
        indices = y_pred[:,:,i].max(dim=1)[1]
        y_pred[:,:,i] = 0
        for j in range(len(indices)):
            y_pred[j,indices[j],i] = 1

    return y_pred, y

accuracy = Accuracy(thresholded_output_transform)


In [26]:
NUM_EPOCHS = 40
lr = 4e-6
loss_function = nn.NLLLoss()
one_cycle = True
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0)
if one_cycle:
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, 
                                                  steps_per_epoch=len(train_loader), epochs=NUM_EPOCHS,
                                                  div_factor=20.0, final_div_factor=1000.0)

In [27]:
from datetime import datetime
name = 'lot_body_class'
writer = SummaryWriter(log_dir=f'/deeplearning/logs/{name}{datetime.now()}_lr-{lr}_{NUM_EPOCHS}')

In [28]:
timer_beg = timer()

tr_losses = []
val_losses = []

model.train()
for i in range(NUM_EPOCHS):
    # Init the hidden state (ht, ct)
    h = model.init_hidden(batch_size)
    batch_losses = []
    
    if i+1 == NUM_EPOCHS:
        preds, inps, labls, lens = [], [], [], []
        val_preds, val_inps, val_labls, val_lens = [], [], [], []
        iscale, oscale, val_iscale, val_oscale = [], [], [], []
        
    for inputs, labels, lengths, i_s, o_s in train_loader:
        h = tuple([e.data for e in h])
        inputs, labels, lengths = inputs.to(device), labels.to(device), lengths.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward step
        output, h = model(inputs, h, lengths)
        
        if i+1 == NUM_EPOCHS:
            e = [preds, inps, labls, lens, iscale, oscale]
            b = [output, inputs, labels, lengths, i_s, o_s]
            for k in range(len(e)):
                e[k].append(b[k])            

        # Loss calculation and backward step
        loss = loss_function(nn.utils.rnn.pack_padded_sequence(output, lengths=lengths, batch_first=True, 
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(labels, lengths=lengths, batch_first=True,
                                                            enforce_sorted=False).data)
        loss.backward()
        # Weight update
        optimizer.step()
        # One cycle policy step
        if one_cycle:
            scheduler.step()
        
        # Output data collection for showing
        batch_losses.append(loss.item())
    
    timer_end = timer()
    tr_losses.append(np.mean(batch_losses))
    writer.add_scalar('Loss/train', tr_losses[-1], i)   
    
    # Validation at the end of an epoch
    val_h = model.init_hidden(batch_size)
    model.eval()
    val_loss = []
    
    for inp, lab, lns, vis, vos in val_loader:
        val_h = tuple([each.data for each in val_h])
        inp, lab, lns = inp.to(device), lab.to(device), lns.to(device)
        out, val_h = model(inp, val_h, lns)
        
        if i+1 == NUM_EPOCHS:
            e = [val_preds, val_inps, val_labls, val_lens, val_iscale, val_oscale]
            b = [out, inp, lab, lns, vis, vos]
            for k in range(len(e)):
                e[k].append(b[k])  
        
        loss = loss_function(nn.utils.rnn.pack_padded_sequence(out, lengths=lns, batch_first=True,
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(lab, lengths=lns, batch_first=True,
                                                               enforce_sorted=False).data)
        val_loss.append(loss.item())
    
    val_losses.append(np.mean(val_loss))
    writer.add_scalar('Loss/validation', val_losses[-1], i)  
    model.train()
    
    # Output loss and training time.
    print(f"Finished epoch {i+1}/{NUM_EPOCHS} in {(timer_end-timer_beg):.2f}s.\n",
             f"Loss: {np.mean(tr_losses[-1]):.4f}",
             f" Val Loss: {val_losses[-1]:.4f}")
    timer_beg = timer()

plt.figure()
plt.plot(tr_losses, label='train')
plt.plot(val_losses, label='validation')
plt.xlabel('Epoch')
plt.ylabel('NLLLoss')
plt.legend()

Finished epoch 1/40 in 0.59s.
 Loss: 3.0481  Val Loss: 3.0453
Finished epoch 2/40 in 0.58s.
 Loss: 3.0480  Val Loss: 3.0458
Finished epoch 3/40 in 0.58s.
 Loss: 3.0478  Val Loss: 3.0457
Finished epoch 4/40 in 0.57s.
 Loss: 3.0479  Val Loss: 3.0456
Finished epoch 5/40 in 0.57s.
 Loss: 3.0474  Val Loss: 3.0454
Finished epoch 6/40 in 0.57s.
 Loss: 3.0472  Val Loss: 3.0454
Finished epoch 7/40 in 0.57s.
 Loss: 3.0467  Val Loss: 3.0456
Finished epoch 8/40 in 0.58s.
 Loss: 3.0462  Val Loss: 3.0455
Finished epoch 9/40 in 0.58s.
 Loss: 3.0454  Val Loss: 3.0442
Finished epoch 10/40 in 0.58s.
 Loss: 3.0446  Val Loss: 3.0447
Finished epoch 11/40 in 0.58s.
 Loss: 3.0439  Val Loss: 3.0436
Finished epoch 12/40 in 0.59s.
 Loss: 3.0428  Val Loss: 3.0430
Finished epoch 13/40 in 0.58s.
 Loss: 3.0416  Val Loss: 3.0423
Finished epoch 14/40 in 0.57s.
 Loss: 3.0404  Val Loss: 3.0425
Finished epoch 15/40 in 0.57s.
 Loss: 3.0390  Val Loss: 3.0415
Finished epoch 16/40 in 0.58s.
 Loss: 3.0376  Val Loss: 3.0407
F

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x7fd373be6b50>

Save the predictions for training and validation.

In [29]:
tr_predictions = torch.cat(tuple(preds), dim=0)
tr_inputs = torch.cat(tuple(inps), dim=0)
tr_groundtruth = torch.cat(tuple(labls), dim=0)
tr_lengths = torch.cat(tuple(lens), dim=0)
tr_inp_scale, tr_out_scale = torch.cat(tuple(iscale), dim=0), torch.cat(tuple(oscale), dim=0)
print(tr_inp_scale.shape, tr_inputs.shape)

val_predictions = torch.cat(tuple(val_preds), dim=0)
val_inputs = torch.cat(tuple(val_inps), dim=0)
val_groundtruth = torch.cat(tuple(val_labls), dim=0)
val_length = torch.cat(tuple(val_lens), dim=0)
val_inp_scale, val_out_scale = torch.cat(tuple(val_iscale), dim=0), torch.cat(tuple(val_oscale), dim=0)

torch.Size([384, 2]) torch.Size([384, 16, 52])


In [30]:
accuracy.update((nn.utils.rnn.pack_padded_sequence(tr_predictions, lengths=tr_lengths, batch_first=True, 
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(tr_groundtruth, lengths=tr_lengths, batch_first=True,
                                                               enforce_sorted=False).data))
train_acc = accuracy.compute()
accuracy.update((nn.utils.rnn.pack_padded_sequence(val_predictions, lengths=val_length, batch_first=True, 
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(val_groundtruth, lengths=val_length, batch_first=True,
                                                               enforce_sorted=False).data))
val_acc = accuracy.compute()
print(f"Training accuracy: {train_acc*100:.3f} Validation accuracy: {val_acc*100:.3f}")

Training accuracy: 13.969 Validation accuracy: 13.445


In [31]:
torch.save(model.state_dict(), f'./{name}.pt')

In [32]:
model.load_state_dict(torch.load(f'./{name}.pt'))

<All keys matched successfully>

## Testing
After the training, we shall proceed with the performance test. This will go through the test batches and perform the inference, then it will show the test loss, as well as the performance metric. In this case, as we are working with human body keypoints, we will use the Mean Per Joint Position Error (MPJPE) metric, which outputs the mean euclidean distance between the joints (keypoints) positions estimated and the ones in the groundtruth.

The formula for MPJPE is the following:

$\text{MPJPE} = \frac1T\frac1N\displaystyle\sum_{t=1}^{T}\displaystyle\sum_{i=1}^{N}\|(J_{i}^{(t)}-J_{root}^{(t)})-(Ĵ_{i}^{(t)}-Ĵ_{root}^{(t)})\|$

Where N is the number of joints, and T the number of samples.

As we have seen in the formula above, we need to align the root joints of the labels and the network output. In order to do that, I have defined a function (`substract_root_PJPE`) that substracts the root joint of each keypoint set (face, hands, body) in the corresponding keypoint set.

In [33]:
test_losses = []
MPJPE = []
h = model.init_hidden(batch_size)
preds, inps, labls, lengs = [], [], [], []
iscal, oscal = [], []
test_CK = [0,0]

model.eval()
for inputs_test, labels_test, lengths_test, is_test, os_test in test_loader:
    
    h = tuple([each.data for each in h])
    inputs_test, labels_test, lengths_test = inputs_test.to(device), labels_test.to(device), lengths_test.to(device)
    
    output_test, h = model(inputs_test, h, lengths_test)
    
    e = [preds, inps, labls, lengs, iscal, oscal]
    b = [output_test, inputs_test, labels_test, lengths_test, is_test, os_test]
    for k in range(len(e)):
        e[k].append(b[k])      
    
    test_loss = loss_function(nn.utils.rnn.pack_padded_sequence(output_test, lengths=lengths_test, batch_first=True,
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(labels_test, lengths=lengths_test, batch_first=True,
                                                               enforce_sorted=False).data)
    test_losses.append(test_loss.item())

test_predictions = torch.cat(tuple(preds), dim=0)
test_inputs = torch.cat(tuple(inps), dim=0)
test_groundtruth = torch.cat(tuple(labls), dim=0)
test_lengths = torch.cat(tuple(lengs), dim=0)
test_inp_scale, test_out_scale = torch.cat(tuple(iscal), dim=0), torch.cat(tuple(oscal), dim=0)
accuracy.update((nn.utils.rnn.pack_padded_sequence(test_predictions, lengths=test_lengths, batch_first=True, 
                                                               enforce_sorted=False).data,
                             nn.utils.rnn.pack_padded_sequence(test_groundtruth, lengths=test_lengths, batch_first=True,
                                                               enforce_sorted=False).data))
test_acc = accuracy.compute()

In [34]:
print(f"Test loss: {np.mean(test_losses):.4f}", f"\nTest accuracy: {test_acc*100:.3f}")

Test loss: 3.0370 
Test accuracy: 12.939


### Save the results into a json

In [35]:
results = {'train':{'inputs':tr_inputs.tolist(), 'predictions':tr_predictions.tolist(), 'labels':tr_groundtruth.tolist(), 'lengths':tr_lengths.tolist(), 
                    'is':tr_inp_scale.tolist(), 'os':tr_out_scale.tolist()},
          'validation':{'inputs':val_inputs.tolist(), 'predictions':val_predictions.tolist(), 'labels':val_groundtruth.tolist(), 'lengths':val_length.tolist(),
                       'is':val_inp_scale.tolist(), 'os':val_out_scale.tolist()},
          'test':{'inputs':test_inputs.tolist(), 'predictions':test_predictions.tolist(), 'labels':test_groundtruth.tolist(), 'lengths':test_lengths.tolist(),
                 'is':test_inp_scale.tolist(), 'os':test_out_scale.tolist()}}
with open('../../results/clas_small_body.json', 'w') as fp:
    json.dump(results, fp)

### Load results from json

In [None]:
with open('../../results/clas_small_body.json', 'r') as j:
    jd = json.load(j)
    tr, val, test = jd['train'], jd['validation'], jd['test']
    tr_inputs, tr_predictions, tr_groundtruth, tr_lengths, tr_inp_scale, tr_out_scale = tuple(torch.tensor(tr[n]) for n in ['inputs', 'predictions',
                                                                                              'labels', 'lengths',
                                                                                               'is', 'os'])
    val_inputs, val_predictions, val_groundtruth, val_length, val_inp_scale, val_out_scale = tuple(torch.tensor(val[n]) for n in ['inputs', 'predictions',
                                                                                                  'labels', 'lengths',
                                                                                                  'is', 'os'])
    test_inputs, test_predictions, test_groundtruth, test_lengths, test_inp_scale, test_out_scale = tuple(torch.tensor(test[n]) for n in ['inputs', 'predictions',
                                                                                                         'labels', 'lengths',
                                                                                                         'is', 'os'])

In [3]:
tr_inputs.shape, val_predictions.shape, test_lengths.shape

(torch.Size([288, 250, 276]), torch.Size([32, 250, 138]), torch.Size([32]))

## Interpreation

Now to better understanding of the results, I will plot some of the frames from the last batches on the training and validation, and also from testing.

In [114]:
def plot_and_rotate(c_inputs, c_z, frames, frame):
    c_inputs[:,::2].mul_(mom_x[1])
    c_inputs[:,1::2].mul_(mom_y[1])
    c_z.mul_(stdz)

    bodiesXY = torch.chunk(c_inputs[frames, :], len(frames), dim=0)
    bodiesZ = torch.chunk(c_z[frames, :], len(frames), dim=0)
    
    x = bodiesXY[frame].squeeze()[::2]
    y = bodiesXY[frame].squeeze()[1::2]
    z = bodiesZ[frame].squeeze()
    
    r_eyebrow = [[c[i] for i in range(17, 22)] for c in [x,y,z]]
    l_eyebrow = [[c[i] for i in range(22, 27)] for c in [x,y,z]]
    l_eye = [[c[i] for i in range(42, 48)] for c in [x,y,z]]
    r_eye = [[c[i] for i in range(36, 42)] for c in [x,y,z]]
    nose1 = [[c[i] for i in range(27, 31)] for c in [x,y,z]]
    nose2 = [[c[i] for i in range(31, 36)] for c in [x,y,z]]
    ext_mouth = [[c[i] for i in range(48, 60)] for c in [x,y,z]]
    int_mouth = [[c[i] for i in range(60, 68)] for c in [x,y,z]]
    contour = [[c[i] for i in range(0, 17)] for c in [x,y,z]]
    
    l_arm = [[c[i+112] for i in [1, 0, 9, 10, 11]] for c in [x,y,z]]
    r_arm = [[c[i+112] for i in [0, 3, 4, 5]] for c in [x,y,z]]
    l_leg = [[c[i+112] for i in [0, 2, 12, 13, 14, 22, 23, 24]] for c in [x,y,z]]
    r_leg = [[c[i+112] for i in [2, 6, 7, 8, 19, 20, 21]] for c in [x,y,z]]
    head = [[c[i+112] for i in [18, 17, 1, 15, 16]] for c in [x,y,z]]

    rh0 = [[c[i+70] for i in [0, 1, 2, 3, 4]] for c in [x,y,z]]
    rh1 = [[c[i+70] for i in [0, 5, 6, 7, 8]] for c in [x,y,z]]
    rh2 = [[c[i+70] for i in [0, 9, 10, 11, 12]] for c in [x,y,z]]
    rh3 = [[c[i+70] for i in [0, 13, 14, 15, 16]] for c in [x,y,z]]
    rh4 = [[c[i+70] for i in [0, 17, 18, 19, 20]] for c in [x,y,z]]
    lh0 = [[c[i+91] for i in [0, 1, 2, 3, 4]] for c in [x,y,z]]
    lh1 = [[c[i+91] for i in [0, 5, 6, 7, 8]] for c in [x,y,z]]
    lh2 = [[c[i+91] for i in [0, 9, 10, 11, 12]] for c in [x,y,z]]
    lh3 = [[c[i+91] for i in [0, 13, 14, 15, 16]] for c in [x,y,z]]
    lh4 = [[c[i+91] for i in [0, 17, 18, 19, 20]] for c in [x,y,z]]
    
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    def init():
        ax.plot(r_eyebrow[0], r_eyebrow[2], r_eyebrow[1])
        ax.plot(l_eyebrow[0], l_eyebrow[2], l_eyebrow[1])
        ax.plot(l_eye[0], l_eye[2], l_eye[1])
        ax.plot(r_eye[0], r_eye[2], r_eye[1])
        ax.plot(nose1[0], nose1[2], nose1[1])
        ax.plot(nose2[0], nose2[2], nose2[1])
        ax.plot(ext_mouth[0], ext_mouth[2], ext_mouth[1])
        ax.plot(int_mouth[0], int_mouth[2], int_mouth[1])
        
        ax.plot(rh0[0], rh0[2], rh0[1])
        ax.plot(rh1[0], rh1[2], rh1[1])
        ax.plot(rh2[0], rh2[2], rh2[1])
        ax.plot(rh3[0], rh3[2], rh3[1])
        ax.plot(rh4[0], rh4[2], rh4[1])
        
        ax.plot(lh0[0], lh0[2], lh0[1])
        ax.plot(lh1[0], lh1[2], lh1[1])
        ax.plot(lh2[0], lh2[2], lh2[1])
        ax.plot(lh3[0], lh3[2], lh3[1])
        ax.plot(lh4[0], lh4[2], lh4[1])
        
        ax.plot(r_arm[0], r_arm[2], r_arm[1])
        ax.plot(l_arm[0], l_arm[2], l_arm[1])
        ax.plot(r_leg[0], r_leg[2], r_leg[1])
        ax.plot(l_leg[0], l_leg[2], l_leg[1])
        ax.plot(head[0], head[2], head[1])
        
        lims = ax.get_xlim(), ax.get_ylim(), ax.get_zlim()
        spans = lims[0][1]-lims[0][0], lims[1][1]-lims[1][0], lims[2][1]-lims[2][0]
        span = max(spans)
        margins = [(span-s)/2 for  s in spans]
        ax.set_xlim(lims[0][0]-margins[0], lims[0][1]+margins[0])
        ax.set_ylim(lims[1][0]-margins[1], lims[1][1]+margins[1])
        ax.set_zlim(lims[2][0]-margins[2], lims[2][1]+margins[2])
        

        return fig,

    def animate(i):
        ax.view_init(elev=220., azim=3.6*i)
        return fig,

    # Animate
    ani = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=100, interval=100, blit=True)    

    return ani

In [36]:
def plot_frames(predictions, groundtruth, inputs, video_n, frames, rot, train=False):
    
    inp = inputs.clone()
    preds = predictions.clone()
    grtr = groundtruth.clone()
    
    bodiesXY = torch.chunk(inp[video_n, frames, :], len(frames), dim=0)
    pred_bodiesZ = torch.chunk(preds[video_n, frames, :], len(frames), dim=0)
    true_bodiesZ = torch.chunk(grtr[video_n, frames, :], len(frames), dim=0)
   
    nrows = np.ceil(len(frames)/2)
    fig = plt.figure(figsize=(15, 6*nrows))
    fig2 = plt.figure(figsize=(15, 6*nrows))
    for frame in range(len(frames)):
        x = bodiesXY[frame].squeeze()[::2].tolist()
        y = bodiesXY[frame].squeeze()[1::2].tolist()
        pred_z = pred_bodiesZ[frame].squeeze().tolist()
        true_z = true_bodiesZ[frame].squeeze().tolist()
        
        r = R.from_euler('y', rot, degrees=True)
        
        xyz1, xyz2 = np.asarray([c for c in zip(x, y, pred_z)]), np.asarray([c for c in zip(x, y, true_z)])
        xyz1, xyz2 = r.apply(xyz1), r.apply(xyz2)
        x1, x2 = xyz1[:,0], xyz2[:,0]
        y1, y2 = xyz1[:,1], xyz2[:,1]
        pred_z, true_z = xyz1[:,2], xyz2[:,2]
        if not train:
            print((x2.max()-x2.min())/(x1.max()-x1.min()))
            x1 = x1*((x2.max()-x2.min())/(x1.max()-x1.min()))
        
        r_arm = tuple([[c[i] for i in [1, 0, 9, 10, 11]] for c in l] for l in [[x1, y1, pred_z], [x2, y2, true_z]])
        l_arm = tuple([[c[i] for i in [0, 3, 4, 5]] for c in l] for l in [[x1, y1, pred_z], [x2, y2, true_z]])
        r_leg = tuple([[c[i] for i in [0, 2, 12, 13, 14, 22, 23, 24]] for c in l] for l in [[x1, y1, pred_z], [x2, y2, true_z]])
        l_leg = tuple([[c[i] for i in [2, 6, 7, 8, 19, 20, 21]] for c in l] for l in [[x1, y1, pred_z], [x2, y2, true_z]])
        head = tuple([[c[i] for i in [18, 17, 1, 15, 16]] for c in l] for l in [[x1, y1, pred_z], [x2, y2, true_z]])

        ax = fig.add_subplot(nrows, 2, frame+1, projection='3d')
        
        ax.plot(r_arm[0][0], r_arm[0][1], r_arm[0][2])
        ax.plot(l_arm[0][0], l_arm[0][1], l_arm[0][2])
        ax.plot(r_leg[0][0], r_leg[0][1], r_leg[0][2])
        ax.plot(l_leg[0][0], l_leg[0][1], l_leg[0][2])
        ax.plot(head[0][0], head[0][1], head[0][2])
        
        ax2 = fig2.add_subplot(nrows, 2, frame+1, projection='3d')
        
        ax2.plot(r_arm[1][0], r_arm[1][1], r_arm[1][2])
        ax2.plot(l_arm[1][0], l_arm[1][1], l_arm[1][2])
        ax2.plot(r_leg[1][0], r_leg[1][1], r_leg[1][2])
        ax2.plot(l_leg[1][0], l_leg[1][1], l_leg[1][2])
        ax2.plot(head[1][0], head[1][1], head[1][2])
        

        lims = ax.get_xlim(), ax.get_ylim(), ax.get_zlim()
        spans = lims[0][1]-lims[0][0], lims[1][1]-lims[1][0], lims[2][1]-lims[2][0]
        span = max(spans)
        margins = [(span-s)/2 for  s in spans]
        ax.set_xlim(lims[0][0]-margins[0], lims[0][1]+margins[0])
        ax.set_ylim(lims[1][0]-margins[1], lims[1][1]+margins[1])
        ax.set_zlim(lims[2][0]-margins[2], lims[2][1]+margins[2])

        lims2 = ax2.get_xlim(), ax2.get_ylim(), ax2.get_zlim()
        spans2 = lims2[0][1]-lims2[0][0], lims2[1][1]-lims2[1][0], lims2[2][1]-lims2[2][0]
        span2 = max(spans2)
        margins2 = [(span2-s)/2 for  s in spans2]
        ax2.set_xlim(lims2[0][0]-margins2[0], lims2[0][1]+margins2[0])
        ax2.set_ylim(lims2[1][0]-margins2[1], lims2[1][1]+margins2[1])
        ax2.set_zlim(lims2[2][0]-margins2[2], lims2[2][1]+margins2[2])

        ax.view_init(elev=-65., azim=-90.)
        ax2.view_init(elev=-65., azim=-90.)


### Single frame
On the first cell you can select which frames you want to plot and from which video of the batch. On the second you select which frame of the previosly selected you want to plot, specifying its index on the declared "frames" list.

In [None]:
# Last batches of training -output, inputs, labels-.
vid = 1
frames = [100]

c_inputs = training_kp[vid].clone()
c_labels = training_lbl[vid].clone()
c_inputs[:,::2].mul_(training_inpscale[vid, 0])
c_inputs[:,1::2].mul_(training_inpscale[vid, 1])
c_labels.mul_(training_outscale[vid])

In [None]:
HTML(plot_and_rotate(c_inputs, c_output, frames, 0).to_html5_video())

In [None]:
c_inputs = tr_inputs[vid].clone()
HTML(plot_and_rotate(c_inputs, c_labels, frames, 0).to_html5_video())

We repeat the same process for the last test batch.

### Slice of frames
Now let's plot a sequence of frames of the selected video. We will plot both the groundtruth and the predicted.

In [37]:
y_pred, y = thresholded_output_transform((tr_predictions.view(-1,BINS,OUTPUT_SIZE//BINS), tr_groundtruth))
y_pred = y_pred.view(tr_predictions.shape)
y_pred = torch.transpose(y_pred, 2, 3)
y_pred_new = torch.zeros(y_pred.shape[:-1], dtype=torch.long)
print(y_pred[0,0])
print(y_pred_new.shape)
for i in range(len(y_pred)):
    for j in range(len(y_pred[i])):
        for k in range(len(y_pred[i,j])):
            y_pred_new[i,j,k] = torch.where(y_pred[i,j,k]==1.)[0].item()
print(y_pred_new[0,0])

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 

In [38]:
frames = [i for i in range(1,9,2)]
video_n = 73

c_inputs = tr_inputs.clone()
c_output = y_pred_new.clone().float()
c_labels = tr_groundtruth.clone().float()
print(c_inputs.dtype, c_output.dtype, c_labels.dtype)
for vid in range(c_labels.shape[0]): 
    c_inputs[vid,:,::2].mul_(tr_inp_scale[vid, 0]/(c_inputs[vid,:,::2].max()-c_inputs[vid,:,::2].min()))
    c_inputs[vid,:,1::2].mul_(tr_inp_scale[vid, 1]/(c_inputs[vid,:,1::2].max()-c_inputs[vid,:,1::2].min()))
    c_output[vid].mul_(tr_out_scale[vid]/21)
    c_labels[vid].mul_(tr_out_scale[vid]/21)


torch.float32 torch.float32 torch.float32


In [39]:
plot_frames(c_output, c_labels, c_inputs, video_n, frames, -90, True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [40]:
y_pred, y = thresholded_output_transform((test_predictions.view(-1,BINS,OUTPUT_SIZE//BINS), test_groundtruth))
y_pred = y_pred.view(test_predictions.shape)
y_pred = torch.transpose(y_pred, 2, 3)
y_pred_new = torch.zeros(y_pred.shape[:-1], dtype=torch.long)
print(y_pred[0,0])
print(y_pred_new.shape)
for i in range(len(y_pred)):
    for j in range(len(y_pred[i])):
        for k in range(len(y_pred[i,j])):
            y_pred_new[i,j,k] = torch.where(y_pred[i,j,k]==1.)[0].item()
print(y_pred_new[0,0])

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 

In [43]:
frames = [i for i in range(1,9,2)]
video_n = 23

c_inputs = test_inputs.clone()
c_output = y_pred_new.clone().float()
c_labels = test_groundtruth.clone().float()

for vid in range(c_labels.shape[0]): 
    c_inputs[vid,:,::2].mul_(test_inp_scale[vid, 0]/(c_inputs[vid,:,::2].max()-c_inputs[vid,:,::2].min()))
    c_inputs[vid,:,1::2].mul_(test_inp_scale[vid, 1]/(c_inputs[vid,:,1::2].max()-c_inputs[vid,:,1::2].min()))
    c_output[vid].mul_(test_out_scale[vid,0]/21)
    c_labels[vid].mul_(test_out_scale[vid,1]/21)


In [44]:
plot_frames(c_output, c_labels, c_inputs, video_n, frames, -90, True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [77]:
plt.close('all')

In [56]:
frames = [i for i in range(1,9,2)]
video_n = 17

c_inputs = val_inputs.clone()
c_output = val_predictions.clone()
c_labels = val_groundtruth.clone()

for vid in range(c_labels.shape[0]): 
    c_inputs[vid,:,::2].mul_(val_inp_scale[vid, 0])
    c_inputs[vid,:,1::2].mul_(val_inp_scale[vid, 1])
    c_output[vid].mul_(val_out_scale[vid])
    c_labels[vid].mul_(val_out_scale[vid])
    
    c_inputs[vid,:,::2].mul_(val_mom_x[vid, 1])
    c_inputs[:,:,1::2].mul_(val_mom_y[vid, 1])
    c_output[vid].mul_(val_mom_z[vid])
    c_labels[vid].mul_(val_mom_z[vid])


In [57]:
plot_frames(c_output, c_labels, c_inputs, video_n, frames, -90)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …