In [3]:
import numpy as np
# Progress bar utilities
import pyprind

# Directory and file utilities
from os import listdir
from os.path import isfile, isdir, join
import json

## Data acquisition
Now I will define some functions to parse and organise the data, and later convert it to PyTorch tensors.

The data is structured as follows: in the dataset directory there are several folders, each folder corresponds to a recording; each of these folders contains a folder with the audio, folders with face, hands and body keypoints estimations for each frame, and a folder with the video recorded from different views.

In this first approach I will be using the keypoints estimations. Every keypoint folder (face, hands or body) is organized the same way: it contains a json per frame of the video, which includes the 3D keypoints estimation.

The function `get_keypoints` will go through each folder in the dataset directory and retrieve the face keypoints, the hands keypoints and the body keypoints. It will separate them into input (2D coordinates per joint per frame) and groundtruth (third coordinate to estimate for each input 2D keypoint). 
The input will be of shape $(n\_videos, seq\_len, input\_size)$, where *n_videos* = number of sequences (two per recording, one for each person), *seq_len* = number of frames, and *input_size* = face + hands + body keypoints, that is (70+(21+21)+26)x2 -multiplied by 2 because there are x and y coordinates-. The groundtruth (label) data will be of the same shape, except that the last dimension size will not be multiplied by 2 (there's only one coordinate to estimate).  

In [4]:
def get_keypoints(data_path):
    """Collect the per-person keypoint sequences of every recording under ``data_path``.

    Every sub-directory of ``data_path`` is treated as one recording. For each
    recording the face, hands and body keypoints are loaded with ``get_face``,
    ``get_hands`` and ``get_body`` and concatenated frame by frame, in that order.

    Args:
        data_path: directory whose sub-directories are the recordings.

    Returns:
        A ``(dataset, groundtruth)`` tuple of lists. Each recording contributes two
        consecutive entries to both lists (first person, then second person).
        An entry of ``dataset`` holds one list per frame with the x/y values of
        face + hands + body; the matching entry of ``groundtruth`` holds the
        third coordinates in the same keypoint order.
    """
    dataset = []
    groundtruth = []
    # Look over just the folders inside the directory. listdir() returns entries in
    # arbitrary order, so sort them to get the same sample order on every run/machine.
    recordings = sorted(name for name in listdir(data_path) if isdir(join(data_path, name)))
    for p in (join(data_path, name) for name in recordings):
        # Each getter returns two tuples (2D coordinates, third coordinate); every tuple
        # holds one list of per-frame lists for each of the two people in the video.
        # e.g. for the face: ([[x1,y1,x2,y2...x70,y70]frame1...[...]frameN], ...person 2...)
        # and ([[z1,z2...z70]frame1...[z1..z70]frameN], ...person 2...).
        face_2d, face_3d = get_face(p)
        hands_2d, hands_3d = get_hands(p)
        pose_2d, pose_3d = get_body(p)

        # Concatenates the coordinates for the face, hands and body on the last dimension,
        # for each person. zip() stops at the shortest of the three sequences, so extra
        # trailing frames of a longer modality are dropped.
        for person in range(2):
            dataset.append([fa + ha + po for fa, ha, po
                            in zip(face_2d[person], hands_2d[person], pose_2d[person])])
            groundtruth.append([fa + ha + po for fa, ha, po
                                in zip(face_3d[person], hands_3d[person], pose_3d[person])])
        print(f'Completed folder {p}')
    return dataset, groundtruth

The following functions are in charge of retrieving the keypoints from each json. The face json has a key *people* with a list of person objects. Each person object has an *id* field and a *face70* field, the latter containing a *landmarks* list with the 3D coordinates of each keypoint.

In [5]:
def get_face(path):
    """Load the face keypoints of one recording, split into x/y and third coordinate.

    Reads every json file in ``<path>/hdFace3d`` in sorted file-name order, skipping
    the first one. Each file holds a ``people`` list; entries with ``id == -1`` are
    ignored. The flat ``face70.landmarks`` list ``[x1, y1, z1, x2, ...]`` of each
    remaining person is split into ``[x1, y1, x2, y2, ...]`` and ``[z1, z2, ...]``.

    Every frame adds exactly one row to both person slots: slots without a
    detection receive an all-zero row (140 values for x/y, 70 for the third
    coordinate), which keeps the two sequences frame-aligned.

    Args:
        path: directory of a single recording.

    Returns:
        ``(face_2D_seq, face_3D_seq)``; each one is a tuple of two lists (one per
        person) holding one list of coordinates per frame.
    """
    n_people = 2
    face_2D_seq = ([], [])
    face_3D_seq = ([], [])
    # List only the files (json), for there might be folders containing invalid frames.
    face_dir = join(path, 'hdFace3d')
    candidates = (join(face_dir, name) for name in sorted(listdir(face_dir)))
    files = [f for f in candidates if isfile(f)]
    for f in files[1:]:  # The first frame of the face keypoints estimation is blank
        with open(f, 'r') as j:
            json_array = json.load(j)
        i = 0
        for person in json_array['people']:
            if person['id'] == -1:  # If the id is -1, it means there's no person
                continue
            if i >= n_people:
                # Only two slots exist; any further detection is ignored instead of
                # raising an IndexError on the result tuples.
                break
            landmarks = person['face70']['landmarks']
            x = landmarks[::3]
            y = landmarks[1::3]
            # Interleave as x1, y1, x2, y2, ...
            two_coord = [l[item] for item in range(len(x)) for l in [x, y]]
            third_coord = landmarks[2::3]
            face_2D_seq[i].append(two_coord)
            face_3D_seq[i].append(third_coord)
            i += 1
        # Pad EVERY slot that got no detection in this frame (not just the next one),
        # otherwise a frame with nobody in it would leave person 2 one frame short.
        while i < n_people:
            face_2D_seq[i].append([0.] * 140)
            face_3D_seq[i].append([0.] * 70)
            i += 1
    print('Face completed.')
    # Each return var being a tuple with the list of n_frames list of coordinates for each person
    return face_2D_seq, face_3D_seq

The hands json contains the *landmarks* field inside both the *left_hand* and *right_hand* fields. As some frames may not have one of the hands estimated, I've had to handle missing hands explicitly. 

In [6]:
def get_hands(path):
    """Load the hand keypoints of one recording, split into x/y and third coordinate.

    Reads every json file in ``<path>/hdHand3d`` in sorted file-name order, skipping
    the first and the last one. Each file holds a ``people`` list; entries with
    ``id == -1`` are ignored. For each remaining person the flat ``landmarks`` lists
    ``[x1, y1, z1, x2, ...]`` of ``left_hand`` and ``right_hand`` are split into
    ``[x1, y1, x2, y2, ...]`` and ``[z1, z2, ...]`` and concatenated left-then-right.

    A hand that is missing from the json is replaced by zeros (42 x/y values and
    21 third coordinates). Every frame adds exactly one row to both person slots:
    slots without a detection receive an all-zero row (84 / 42 values), which
    keeps the two sequences frame-aligned.

    Args:
        path: directory of a single recording.

    Returns:
        ``(hand_2D_seq, hand_3D_seq)``; each one is a tuple of two lists (one per
        person) holding one list of coordinates per frame.
    """
    n_people = 2
    hand_2D_seq = ([], [])
    hand_3D_seq = ([], [])
    hand_dir = join(path, 'hdHand3d')
    candidates = (join(hand_dir, name) for name in sorted(listdir(hand_dir)))
    files = [f for f in candidates if isfile(f)]
    for f in files[1:-1]:  # The first and the last frames of these folders are blank.
        with open(f, 'r') as j:
            json_array = json.load(j)
        i = 0
        for person in json_array['people']:
            if person['id'] == -1:
                continue
            if i >= n_people:
                # Only two slots exist; any further detection is ignored instead of
                # raising an IndexError on the result tuples.
                break
            hands = []
            hands_3d = []
            for hand in ('left_hand', 'right_hand'):
                # Check for the hand explicitly rather than parsing exception messages,
                # so an unrelated error can never append stale data from a previous frame.
                landmarks = (person.get(hand) or {}).get('landmarks')
                if landmarks is None:
                    # Just put a 0., 0., 0. estimation for each keypoint of the missing hand
                    hands += [0.] * 42
                    hands_3d += [0.] * 21
                else:
                    # Separate x,y from z: every third value is z.
                    hands += [value for c, value in enumerate(landmarks) if (c + 1) % 3 != 0]
                    hands_3d += landmarks[2::3]
            hand_2D_seq[i].append(hands)
            hand_3D_seq[i].append(hands_3d)
            i += 1
        # Pad EVERY slot that got no detection in this frame (not just the next one),
        # otherwise a frame with nobody in it would leave person 2 one frame short.
        while i < n_people:
            hand_2D_seq[i].append([0.] * 84)
            hand_3D_seq[i].append([0.] * 42)
            i += 1
    print('Hands completed.')
    return hand_2D_seq, hand_3D_seq

The body json is organised a bit differently: the person objects are listed under the *bodies* key, and each of them contains a *joints26* field with a list of 3D coordinates. This list is structured as follows: *[x1,y1,z1,acc1,x2,y2,z2,acc2...]*.

In [7]:
def get_body(path):
    """Load the body keypoints of one recording, split into x/y and third coordinate.

    Reads every json file in ``<path>/hdPose3d_stage1_op25`` in sorted file-name
    order, skipping the last one. Each file holds a ``bodies`` list; entries with
    ``id == -1`` are ignored. The flat ``joints26`` list
    ``[x1, y1, z1, acc1, x2, ...]`` of each remaining person is split into
    ``[x1, y1, x2, y2, ...]`` and ``[z1, z2, ...]``; the fourth value of every
    joint is not used.

    Every frame adds exactly one row to both person slots: slots without a
    detection receive an all-zero row (52 values for x/y, 26 for the third
    coordinate), which keeps the two sequences frame-aligned.

    Args:
        path: directory of a single recording.

    Returns:
        ``(body_2D_seq, body_3D_seq)``; each one is a tuple of two lists (one per
        person) holding one list of coordinates per frame.
    """
    n_people = 2
    body_2D_seq = ([], [])
    body_3D_seq = ([], [])
    body_dir = join(path, 'hdPose3d_stage1_op25')
    candidates = (join(body_dir, name) for name in sorted(listdir(body_dir)))
    files = [f for f in candidates if isfile(f)]
    # NOTE(review): the last file is skipped, presumably because it is blank like
    # the last hands frame - confirm against the dataset.
    for f in files[:-1]:
        with open(f, 'r') as j:
            json_array = json.load(j)
        i = 0
        for person in json_array['bodies']:
            if person['id'] == -1:
                continue
            if i >= n_people:
                # Only two slots exist; any further detection is ignored instead of
                # raising an IndexError on the result tuples.
                break
            joints = person['joints26']
            x = joints[::4]
            y = joints[1::4]
            # Interleave as x1, y1, x2, y2, ...
            two_coord = [l[item] for item in range(len(x)) for l in [x, y]]
            third_coord = joints[2::4]
            body_2D_seq[i].append(two_coord)
            body_3D_seq[i].append(third_coord)
            i += 1
        # Pad EVERY slot that got no detection in this frame (not just the next one),
        # otherwise a frame with nobody in it would leave person 2 one frame short.
        while i < n_people:
            body_2D_seq[i].append([0.] * 52)
            body_3D_seq[i].append([0.] * 26)
            i += 1
    print('Body completed.')
    return body_2D_seq, body_3D_seq

In [8]:
# Directory that contains one sub-folder per recording (relative path).
data_path = '../../../data/DB keypoints'
# Parse every recording; get_keypoints prints a line per modality and per folder.
dataset, groundtruth = get_keypoints(data_path)

Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl2
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl4
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190419_asl5
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl1
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl2
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl3
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl5
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl7
Face completed.
Hands completed.
Body completed.
Completed folder ../../../data/DB keypoints/190425_asl9
Face completed.
Hands completed.
Body completed.
Comple

## Dataset structuring
Now let's convert the lists obtained to PyTorch tensors and organise them into train, validation and test datasets. 
First, I will define a padding function in order to make all the sequences of video frames the same length, so I can train the LSTM in batches. 

In [9]:
def padding_seq(dataset):
    """Pad every sequence, in place, with NaN frames up to the longest sequence length.

    Args:
        dataset: list of sequences; each sequence is a list of frames and each
            frame is a list of values. The sequences are extended in place.

    Returns:
        The length of the longest sequence (``0`` for an empty ``dataset``).
    """
    if not dataset:
        # max() of an empty list would raise; there is simply nothing to pad.
        return 0
    max_seq = max(len(seq) for seq in dataset)
    # Frame width used for sequences that have no frame of their own to measure.
    fallback_width = next((len(seq[0]) for seq in dataset if seq), 0)
    for seq in dataset:
        frame_width = len(seq[0]) if seq else fallback_width
        # A fresh list per padded frame, so the padding rows never alias each other.
        seq.extend([np.nan] * frame_width for _ in range(max_seq - len(seq)))
    return max_seq

# Pad the inputs in place and remember the common sequence length.
max_seq = padding_seq(dataset)
# Pad the labels the same way; as the last expression of the cell, the returned
# length is what gets displayed as the cell output.
padding_seq(groundtruth)

8751

In [10]:
# From python lists to numpy ndarray.
# This rebinds `dataset` and `groundtruth` from nested lists to arrays; the padding
# frames added by padding_seq are NaN rows.
dataset = np.asarray(dataset)
groundtruth = np.asarray(groundtruth)
# Sanity check: both arrays should share their first two dimensions.
print(dataset.shape, groundtruth.shape)

(20, 8751, 276) (20, 8751, 138)


In [18]:
def _pair_indices(n, cache):
    """Return the (rows, cols) index arrays of every pair i < j among n items, memoised in cache."""
    if n not in cache:
        cache[n] = np.triu_indices(n, k=1)
    return cache[n]


def _nanmean_or_nan(values):
    """NaN-ignoring mean that yields NaN for empty or all-NaN input without emitting a warning."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


def stats(tensor, groups=((0, 70), (70, 112), (112, None))):
    """Per-video statistics of the pairwise absolute differences inside keypoint groups.

    For every frame and every group, the absolute difference between each pair of
    values of that group is computed. Per video the function then reports, for
    each group, the smallest and largest difference seen in any frame and the
    mean over frames of the per-frame mean difference. NaN values (e.g. padding
    frames) are ignored throughout.

    The pair differences are computed with NumPy index arrays instead of nested
    Python loops; the pairs are visited in the same (i, j), i < j order.

    Args:
        tensor: sequence of videos; each video is a sequence of frames and each
            frame a 1-D sequence of numbers.
        groups: ``(start, stop)`` slice bounds, one per keypoint group; ``None``
            as ``stop`` means "up to the end of the frame". The default matches
            the face (70), hands (42) and body (remaining) layout of the labels.

    Returns:
        ``(d_min, d_max, d_mean)``: three lists with one entry per video, each
        entry being a list with one value per group. A group that never had a
        valid pair keeps ``inf`` as minimum, ``0.`` as maximum and NaN as mean.
    """
    pbar = pyprind.ProgBar(len(tensor), title='Analyzing videos')
    d_min = []
    d_max = []
    d_mean = []
    pair_cache = {}  # pair index arrays depend only on the group size; build them once
    for vid in tensor:
        vid_min = [np.inf for _ in groups]
        vid_max = [0. for _ in groups]
        frame_means = [[] for _ in groups]
        pbar2 = pyprind.ProgBar(len(vid), title='Current video')
        for frame in vid:
            frame = np.asarray(frame, dtype=float)
            for g, (start, stop) in enumerate(groups):
                values = frame[start:stop]
                rows, cols = _pair_indices(values.shape[0], pair_cache)
                diffs = np.abs(values[rows] - values[cols])
                # Skip min/max for frames without any valid pair (e.g. NaN padding).
                if diffs.size and not np.isnan(diffs).all():
                    lowest = np.nanmin(diffs)
                    highest = np.nanmax(diffs)
                    if lowest < vid_min[g]:
                        vid_min[g] = lowest
                    if highest > vid_max[g]:
                        vid_max[g] = highest
                frame_means[g].append(_nanmean_or_nan(diffs))
            pbar2.update()
        d_min.append(vid_min)
        d_max.append(vid_max)
        d_mean.append([_nanmean_or_nan(group_means) for group_means in frame_means])
        pbar.update()
    return d_min, d_max, d_mean

# Pairwise-difference statistics of the third coordinate, per video and per keypoint group.
mins, maxs, means = stats(groundtruth)

Analyzing videos
Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:25
0% [#                   ] 100% | ETA: 00:46:02Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:22
0% [##                  ] 100% | ETA: 00:43:14Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:20
0% [###                 ] 100% | ETA: 00:40:28Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:20
0% [####                ] 100% | ETA: 00:37:56Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:19
0% [#####               ] 100% | ETA: 00:35:25Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:18
0% [######              ] 100% | ETA: 00:32:56Current video
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:18
0% [###

In [19]:
# One three-line report per video: face, hands and body statistics.
for i in range(len(mins)):
    print(
        f"Video {i}\n-Face min: {mins[i][0]} max: {maxs[i][0]:.2f} mean: {means[i][0]:.2f}",
        f"-Hands min: {mins[i][1]} max: {maxs[i][1]:.2f} mean: {means[i][1]:.2f}",
        f"-Body min: {mins[i][2]} max: {maxs[i][2]:.2f} mean: {means[i][2]:.2f}",
        sep="\n",
    )

Video 0
-Face min: 0.0 max: 16.42 mean: 3.03
-Hands min: 0.0 max: 109.30 mean: 7.57
-Body min: 0.0 max: 76.65 mean: 15.38
Video 1
-Face min: 0.0 max: 85.39 mean: 3.38
-Hands min: 0.0 max: 84.73 mean: 6.09
-Body min: 0.0 max: 62.36 mean: 13.96
Video 2
-Face min: 0.0 max: 15.86 mean: 2.80
-Hands min: 0.0 max: 117.11 mean: 8.50
-Body min: 0.0 max: 75.05 mean: 15.79
Video 3
-Face min: 0.0 max: 24.25 mean: 3.41
-Hands min: 0.0 max: 68.96 mean: 5.55
-Body min: 0.0 max: 56.05 mean: 12.78
Video 4
-Face min: 0.0 max: 15.84 mean: 2.48
-Hands min: 0.0 max: 115.78 mean: 6.73
-Body min: 0.0 max: 72.20 mean: 18.82
Video 5
-Face min: 0.0 max: 85.21 mean: 3.29
-Hands min: 0.0 max: 81.01 mean: 4.90
-Body min: 0.0 max: 58.89 mean: 12.91
Video 6
-Face min: 0.0 max: 34.88 mean: 3.63
-Hands min: 0.0 max: 102.50 mean: 7.42
-Body min: 0.0 max: 80.95 mean: 20.05
Video 7
-Face min: 0.0 max: 97.41 mean: 2.96
-Hands min: 0.0 max: 103.12 mean: 8.25
-Body min: 0.0 max: 108.17 mean: 12.36
Video 8
-Face min: 0.0 max

In [17]:
# Mean statistics of the first video.
# NOTE(review): stats() returns one [face, hands, body] list per video, so this
# prints three values; the scalar shown in the saved output, together with the
# out-of-order execution count, indicates this cell was last run against an
# earlier version of stats(). Re-run after "Restart & Run All".
print(means[0])

8.657289328643808
