# Creating HDF store
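This notebook creates an HDF5 file (`data.hdf5`) from the per-participant data aggregated in the previous steps and splits it into `train`, `test`, and `validation` groups. The HDF5 file enables training on a large data set.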

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import h5py
from pathlib import Path
from datetime import datetime

  from ._conv import register_converters as _register_converters


In [2]:
# Directory containing the per-participant pickles ("<id>.pkl") that are read when filling the store
aggregated_data_path = 'AggregatedData'
# Path of the HDF5 file this notebook creates, or appends to when it already exists
HDF5_PATH = "data.hdf5"

In [3]:
def log(s):
    """Append a timestamped message to ``create_hdf5_log.txt`` and echo it to stdout.

    Parameters
    ----------
    s : str
        Message text. It is concatenated to the timestamp prefix, so it must
        already be a string.
    """
    # Build the line once so the log file and the console show the same timestamp
    # (calling datetime.now() separately for each sink made them drift apart).
    line = "[" + str(datetime.now()) + "] " + s
    with open("create_hdf5_log.txt", "a") as myfile:
        myfile.write(line + "\n")
    print(line)

In [4]:
# Outlier limits used by getLabelCoordinates: a time stamp is dropped when any
# |XRot|, |YRot| or |ZRot| of its rows exceeds the matching limit.
# NOTE(review): the meaning/units of the 25 and 5 are not visible here - confirm.
gridSizeXh = 25.0 / 2
gridSizeYh = 25.0 / 2
gridSizeZo = 25 - 5

# Marker order within one time stamp: right hand first, then left hand; per hand
# the five fingers (four joints each) followed by the wrist -> 21 labels per hand.
# The "<hand>_R_Shape_<n>" labels are deliberately not part of the order.
_FINGERS = ("Thumb", "Index", "Middle", "Ring", "Little")
_JOINTS = ("Fn", "DIP", "PIP", "MCP")
_LABEL_ORDER = []
for _hand in ("R", "L"):
    for _finger in _FINGERS:
        for _joint in _JOINTS:
            _LABEL_ORDER.append(_hand + "_" + _finger + "_" + _joint)
    _LABEL_ORDER.append(_hand + "_Wrist")


def getLabelCoordinates(df):
    """Flatten the marker coordinates of every non-outlier time stamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``label``, ``time``, ``XRot``, ``YRot`` and ``ZRot``.
        The frame itself is not modified.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2 * kept_time_stamps, 63)``. Rows follow ascending
        ``time``; within a time stamp the values are ordered by the label order
        above, each label contributing ``XRot, YRot, ZRot``.

    Raises
    ------
    ValueError
        If the kept rows do not amount to exactly 42 rows (2 x 21 labels) per
        kept time stamp, because the final reshape then cannot succeed.
    """
    axes = ['XRot', 'YRot', 'ZRot']

    # remove all R_Shape data (labels whose second '_'-separated token is "R")
    df = df[df['label'].str.split('_').str[1] != "R"].copy(deep=True)
    df['label'] = pd.Categorical(df['label'], categories=_LABEL_ORDER, ordered=True)

    # filter outliers: largest absolute value per time stamp and axis vs. its limit
    limits = pd.Series([gridSizeXh, gridSizeYh, gridSizeZo], index=axes)
    peak = df[axes].abs().groupby(df['time'].values).max()
    within_limits = ~peak.gt(limits, axis='columns').any(axis=1)
    kept_times = peak.index[within_limits.values]

    # Sorting by (time, label) yields the same order as iterating the time groups
    # of a label-sorted frame, without touching rows one at a time.
    kept = df[df['time'].isin(kept_times)].sort_values(['time', 'label'])

    return kept[axes].values.reshape(len(kept_times) * 2, 63)

In [5]:
def prepareForGrid(coordinates):
    """Turn flat coordinate rows into int16 arrays of shape (n, 21, 3).

    The values are regrouped into 21 triples per row, multiplied by 1000
    (the notebook treats this as the conversion to mm), rounded to the nearest
    integer and cast to ``np.int16``.
    """
    scaled = 1000 * coordinates.reshape(-1, 21, 3)
    return np.around(scaled).astype(np.int16)

In [6]:
def _append_rows(group, name, rows, maxshape):
    """Create dataset ``name`` in ``group`` from ``rows``, or extend it along axis 0."""
    if name not in group:
        group.create_dataset(name, data=rows, maxshape=maxshape, chunks=True)
        return

    count = rows.shape[0]
    if count == 0:
        # Nothing to append. (A "[-0:]" slice would address the whole dataset
        # and the assignment of an empty array to it would fail.)
        return

    dataset = group[name]
    start = dataset.shape[0]
    dataset.resize(start + count, axis=0)
    # write from the explicit old end instead of relying on a negative slice
    dataset[start:] = rows


def add_to_hdf(hdf, df, set_name):
    """Convert ``df`` to samples and append them, shuffled, to group ``set_name``.

    Parameters
    ----------
    hdf : h5py.File
        Open, writable HDF5 file.
    df : pandas.DataFrame
        Frame accepted by ``getLabelCoordinates``.
    set_name : str
        Name of the HDF5 group that receives the samples. The group and its
        ``data`` (n, 21, 3) and ``labels`` (n, 63) datasets are created on first
        use and extended afterwards.
    """
    groupName = set_name
    pGroup = hdf.require_group(groupName)

    # get labels
    log("Computing Label Coordinates")
    # coordinates in m
    labelCoordinates = getLabelCoordinates(df)
    # coordinates in mm
    preparedData = prepareForGrid(labelCoordinates)

    # shuffle both arrays with the same permutation so rows stay paired
    # (named `order` so the imported sklearn `shuffle` is not shadowed)
    order = np.arange(preparedData.shape[0])
    np.random.shuffle(order)
    labelCoordinates = labelCoordinates[order]
    preparedData = preparedData[order]

    _append_rows(pGroup, "data", preparedData, (None, 21, 3))
    _append_rows(pGroup, "labels", labelCoordinates, (None, 63))

# Creating HDF5 file

In [7]:
def _add_participants(hdf, participants, set_name, purpose, mirror_y):
    """Load each participant's pickle and add it to the ``set_name`` group.

    ``purpose`` is only used in the log message. When ``mirror_y`` is true, a
    second copy of the participant's data with negated ``YRot`` is added as well.
    """
    for i in participants:
        log("P" + str(i) + ": Reading dataframe for " + purpose + ".")
        # NOTE: unpickling can execute arbitrary code - only load pickles that
        # were produced by this pipeline.
        df = pd.read_pickle(aggregated_data_path + "/" + str(i) + ".pkl")
        log("P" + str(i) + ": Adding to HDF.")
        add_to_hdf(hdf, df, set_name)
        if mirror_y:
            log("Inverting Y-Values")
            df['YRot'] = -df['YRot']
            add_to_hdf(hdf, df, set_name)


training_participants = ["03", "05", "09", "11", "12", "13", "15", "16", "18", "19", "21"]
test_participants = ["07", "06", "14"] # randomly picked from the list of participants
validation_participants = ["08", "02", "04", "10"] # randomly picked from the list of participants

if Path(HDF5_PATH).is_file():
    print("Open storage for appending.")
    file_mode = 'a'
else:
    print("Creating new storage for writing.")
    file_mode = 'w'

# The context manager closes the file even when a participant fails to load,
# so an aborted run does not leave the HDF5 file open.
with h5py.File(HDF5_PATH, mode=file_mode) as hdf:
    _add_participants(hdf, training_participants, "train", "training", mirror_y=True)
    _add_participants(hdf, test_participants, "test", "test", mirror_y=True)
    # NOTE(review): validation is stored without the Y-mirrored copy, unlike
    # train and test - confirm this is intentional.
    _add_participants(hdf, validation_participants, "validation", "validation", mirror_y=False)

log("Finished")

Creating new storage for writing.
[2018-11-20 23:54:43.870380] P07: Reading dataframe for test.
[2018-11-20 23:54:45.391459] P07: Adding to HDF.
[2018-11-20 23:54:45.392599] Computing Label Coordinates
[2018-11-21 00:02:16.646143] Inverting Y-Values
[2018-11-21 00:02:16.686890] Computing Label Coordinates
[2018-11-21 00:09:44.662316] Finished


In [8]:
# Reopen the store read-only; the following cells only read dataset shapes
hdf = h5py.File(HDF5_PATH, mode='r')

In [9]:
# Print the number of samples in each split, then the sum over all splits
total_samples = 0
for caption, dataset_path in (
    ("Training samples:", "train/data"),
    ("Test samples:", "test/data"),
    ("Validation samples:", "validation/data"),
):
    split_size = hdf[dataset_path].shape[0]
    print(caption, split_size)
    total_samples += split_size
print("Total samples:", total_samples)

Training samples: 2083884
Test samples: 534252
Validation samples: 291766
Total samples: 2909902


In [10]:
# Close the read-only handle opened for the sample-count report
hdf.close()