# load_data_transforms.ipynb

This is the common code that can be applied to all datasets after the conversion to the standard Intermediate Representation 1 (IR1) dataframe.

Set `interactive` to True to run the Jupyter Notebook version.  Note that most of the calls are set up to test the functions, not to process the entire dataset; to do that, set `interactive` to False and run all cells so that main executes.   This notebook can also be saved and run as a Python file.


<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.

[Lee B. Hinkle](https://userweb.cs.txstate.edu/~lbh31/), Texas State University, [IMICS Lab](https://imics.wp.txstate.edu/)  
TODO:
* This is in-progress - current focus is on the Gesture dataset so testing will need to be done with the others.
* Issue with !gdown not running in a function is a pain.
* assign_ints_ir1_labels() seems to still return an int64 instead of int8


In [1]:
import os
import shutil #https://docs.python.org/3/library/shutil.html
from shutil import unpack_archive # to unzip
import time
import pandas as pd
import numpy as np
from numpy import savetxt
from tabulate import tabulate # for verbose tables, showing data
from tensorflow.keras.utils import to_categorical # for one-hot encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from time import gmtime, strftime, localtime #for displaying Linux UTC timestamps in hh:mm:ss
from datetime import datetime, date
import urllib.request # to get files from web w/o !wget
import matplotlib.pyplot as plt

# Global Parameters

In [2]:
# --- Execution environment ---
# Base working directory; swap in an absolute path if desired.
my_dir = '.'
# Folder the dataset archive gets unzipped into.
dataset_dir = os.path.join(my_dir, 'gesture_phase_dataset')

# Run the per-function demo cells and let the functions print diagnostics.
interactive, verbose = True, True

# --- Sliding-window parameters ---
# time_steps: samples per window; stride: samples between window starts.
time_steps, stride = 32, 8

In [3]:
# Batch-mode override: skip this cell when exploring interactively.  The .py
# version always executes it, which turns off the demo cells and limits the
# output of the called functions.
interactive = verbose = False

# Get IR1 dataframes for interactive testing.

In [4]:
# Interactive-only setup for the demo cells that follow.
# NOTE(review): the later interactive cells read `ir1_df`, and the only code
# in this cell that creates it is the commented-out block below - re-enable it
# (or load an IR1 dataframe some other way) before running with
# interactive = True.
if interactive:
    print ("What?")
#Weird - gdown fails when called inside function.  Hack for now...

# Commented-out loader: download the zipped IR1 pickles, unpack them into
# my_dir and read one of them as ir1_df.
# !gdown "11OWxTejlTlR53s3RZbSNZdyMdFiN4dZl&confirm=t" # Gesture Phase Raw IR1s in zip
# shutil.unpack_archive('Gesture_Phase_Raw_IR1.zip', my_dir, 'zip')
# ir1_df = pd.read_pickle("a1_raw.pkl")
# # ir1_df.rename(columns={"phase": "label"}, inplace = True, errors="raise") # phase was GPS dataset specific
# # ir1_df.rename(columns={"subject": "sub"}, inplace = True, errors="raise") # subject versus sub too
# # ir1_df['sub'] = [ ord(x) - 96 for x in ir1_df['sub']] # ord is unicode char
# ir1_df.head()

What?
Downloading...
From: https://drive.google.com/uc?id=11OWxTejlTlR53s3RZbSNZdyMdFiN4dZl&confirm=t
To: /content/Gesture_Phase_Raw_IR1.zip
100% 644k/644k [00:00<00:00, 101MB/s]


Unnamed: 0_level_0,lhx,lhy,lhz,rhx,rhy,rhz,hx,hy,hz,sx,...,sz,lwx,lwy,lwz,rwx,rwy,rwz,label,sub,story
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-01 01:35:02.026,5.347435,4.363681,1.501913,5.258967,4.319263,1.488703,5.037871,1.618295,1.77835,5.062803,...,1.772577,4.972902,4.301065,1.564781,5.553945,4.370456,1.553521,Rest,1,1
1970-01-01 01:35:02.058,4.869622,4.25421,1.556133,5.240113,4.346338,1.554309,5.03761,1.61837,1.778573,5.06143,...,1.772859,4.974908,4.303656,1.565527,5.423875,4.303708,1.569942,Rest,1,1
1970-01-01 01:35:02.089,5.357447,4.364039,1.500969,5.238928,4.347924,1.55415,5.037514,1.618298,1.778774,5.059245,...,1.773568,4.981612,4.305363,1.563643,5.33217,4.438061,1.572841,Rest,1,1
1970-01-01 01:35:02.120,4.942886,4.281878,1.546513,5.111436,4.22966,1.527091,5.037526,1.618612,1.778855,5.056475,...,1.774519,4.987158,4.304063,1.565929,5.311104,4.396774,1.566368,Rest,1,1
1970-01-01 01:35:02.167,5.00316,4.27853,1.542866,4.985812,4.182155,1.52033,5.037557,1.619226,1.778925,5.052367,...,1.775536,4.983912,4.296833,1.569889,5.193762,4.335417,1.560144,Rest,1,1


# Shared transforms

In [8]:
def assign_ints_ir1_labels(df, label_mapping_dict):
    """Encode the string labels of an IR1 dataframe as int8 codes.

    The codes come from the passed dictionary rather than from the labels
    that happen to be present in df.  That keeps the encoding predictable:
    every IR1 gets the same integer for the same label even when some labels
    are absent from it (or from the whole dataset, as with PSG-Audio), and no
    strings ever reach the IR2 numpy arrays where they would waste memory.
    Credit to this nice writeup https://pbpython.com/categorical-encoding.html

    Args:
        df - an IR1 dataframe with a 'label' column (left unmodified)
        label_mapping_dict - dict of dicts, one entry per label column, e.g.
            {"label": {"Rest": 0, "Stroke": 2}}.  Entries naming a column
            that is not in df are ignored.
    Returns:
        df - an updated IR1+ dataframe (a copy) in which 'label' and every
            other mapped column has dtype int8
    Raises:
        ValueError - if a value is still not an integer after mapping (for
            example a label missing from the mapping) or a code does not fit
            in int8
    Global params used
        verbose: print the 'label' value counts before converting
    """
    if verbose:
        print("assign_ints_ir1_labels() converting categorical strings to ints")
        print("df['label'] value counts")
        print(df['label'].value_counts())
        if df['label'].dtype.name == 'category':
            # Print the mapping Pandas is using; a bare expression inside a
            # function is never displayed.
            print(dict(zip(df['label'].cat.codes, df['label'])))

    encoded = df.copy()  # the caller's dataframe is never modified
    columns = [col for col in label_mapping_dict if col in encoded.columns]
    if 'label' not in columns:
        columns.append('label')  # 'label' is always cast, mapped or not
    int8_limits = np.iinfo('int8')
    for col in columns:
        mapping = label_mapping_dict.get(col, {})
        # Values without a mapping entry pass through unchanged, so labels
        # that are already integers survive a second call.
        mapped = encoded[col].map(lambda value: mapping.get(value, value))
        try:
            as_int = mapped.astype('int64')
        except ValueError as err:
            raise ValueError(
                f"assign_ints_ir1_labels(): column '{col}' still holds "
                f"non-integer values after mapping: {err}") from err
        # Guard the int8 cast, which would otherwise wrap large codes silently.
        if len(as_int) and (as_int.min() < int8_limits.min
                            or as_int.max() > int8_limits.max):
            raise ValueError(
                f"assign_ints_ir1_labels(): codes in column '{col}' "
                f"do not fit in int8")
        encoded[col] = as_int.astype('int8')
    return encoded

if interactive:
    # Gesture-Phase-Segmentation phases, coded in the order the dataset's
    # readme.txt lists them.  The outer dict can carry an entry for a second
    # label column - see the url in assign_ints_ir1_labels().
    label_map_gps = {"label": dict(zip(
        ("Rest", "Preparation", "Stroke", "Hold", "Retraction"), range(5)))}
    ir1_df = assign_ints_ir1_labels(ir1_df, label_map_gps)

assign_ints_ir1_labels() converting categorical strings to ints
df['label'] value counts
Rest           698
Stroke         656
Retraction     191
Preparation    163
Hold            39
Name: label, dtype: Int64


In [10]:
def get_ir2_from_ir1(df):
    """Slice an IR1 dataframe into sliding-window IR2 arrays.

    Every column other than 'label' and 'sub' is treated as a channel.  The
    windows are numpy views built with sliding_window_view (no appending), so
    overlapping windows do not duplicate the data in RAM.
    If stride = time_steps there is no overlap of the sliding window.

    Args:
        df - pandas datetime indexed dataframe, columns - channel(s), label,
            sub.  'label' must already be numeric (use
            assign_ints_ir1_labels first).
    Global params used
        time_steps: number of samples in window, a partial final window is
            discarded
        stride: how far to move window, no overlap if equal to time_steps
        verbose: print the channel list and the array shapes
    Returns:
        X - float32 ndarray, shape (windows, time_steps, channels)
        y - int8 ndarray, shape (windows, time_steps)
        sub - int8 ndarray, shape (windows, time_steps)
        channel_list - the channel names, in the column order of X
        With fewer than time_steps rows there is no complete window and the
        three arrays are returned with zero windows.
    Raises:
        ValueError - if the 'label' or 'sub' column is missing
    """
    # this was copied from SHL with improved memory capabilities
    # TODO:  Update with multi-label version from PSG-Audio
    missing = [col for col in ('label', 'sub') if col not in df.columns]
    if missing:
        raise ValueError("get_ir2_from_ir1(): IR1 dataframe is missing "
                         "required column(s) " + ", ".join(missing))
    # the channel list is in dataframe but not in the numpy arrays
    channel_list = [col for col in df.columns if col not in ('label', 'sub')]
    if verbose:
        print('Channels in X:',channel_list)
    X = df[channel_list].to_numpy(dtype = 'float32')
    y = df['label'].to_numpy(dtype = 'int8') # doesn't work for strings
    sub = df['sub'].to_numpy(dtype = 'int8')
    if verbose:
        print('X,y,sub array shapes before sliding window', X.shape, y.shape, sub.shape)
    if X.shape[0] < time_steps:
        # Not even one complete window: return zero windows rather than
        # letting sliding_window_view raise on the oversized window.
        X = np.empty((0, time_steps, X.shape[1]), dtype = X.dtype)
        y = np.empty((0, time_steps), dtype = y.dtype)
        sub = np.empty((0, time_steps), dtype = sub.dtype)
    else:
        #https://numpy.org/devdocs/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html
        # The X window covers time_steps rows and all channels.  It slides
        # along both axes, but along the channel axis there is exactly one
        # position, which is the length-1 axis selected away by the 0 index.
        window_x = (time_steps, X.shape[1])
        X = np.lib.stride_tricks.sliding_window_view(X, window_x)[::stride, 0]
        y = np.lib.stride_tricks.sliding_window_view(y, (time_steps,))[::stride, :]
        sub = np.lib.stride_tricks.sliding_window_view(sub, (time_steps,))[::stride, :]
    if verbose:
        print('X,y,sub array shapes after sliding window', X.shape, y.shape, sub.shape)
    return X, y, sub, channel_list
if interactive:
    # Demo: window the IR1 dataframe, then tabulate what came back.
    my_X, my_y, my_sub, all_channel_list = get_ir2_from_ir1(ir1_df)
    headers = ("array","shape", "object type", "data type")
    mydata = [
        (tag, arr.shape, type(arr), arr.dtype)
        for tag, arr in zip(("my_X:", "my_y:", "my_sub:"),
                            (my_X, my_y, my_sub))
    ]
    print("IR2 array info")
    print(tabulate(mydata, headers=headers))
    print("Returned all_channel_list", all_channel_list)

Channels in X: ['lhx', 'lhy', 'lhz', 'rhx', 'rhy', 'rhz', 'hx', 'hy', 'hz', 'sx', 'sy', 'sz', 'lwx', 'lwy', 'lwz', 'rwx', 'rwy', 'rwz', 'story']
X,y,sub array shapes before sliding window (1747, 19) (1747,) (1747,)
X,y,sub array shapes after sliding window (215, 32, 19) (215, 32) (215, 32)
IR2 array info
array    shape          object type              data type
-------  -------------  -----------------------  -----------
my_X:    (215, 32, 19)  <class 'numpy.ndarray'>  float32
my_y:    (215, 32)      <class 'numpy.ndarray'>  int8
my_sub:  (215, 32)      <class 'numpy.ndarray'>  int8
Returned all_channel_list ['lhx', 'lhy', 'lhz', 'rhx', 'rhy', 'rhz', 'hx', 'hy', 'hz', 'sx', 'sy', 'sz', 'lwx', 'lwy', 'lwz', 'rwx', 'rwy', 'rwz', 'story']


In [11]:
def clean_ir2(X, y, sub):
    """Remove invalid sliding windows and collapse y, sub to column arrays.

    A window is discarded when it contains a NaN in any channel, when its
    samples carry more than one label, or when they carry more than one
    subject number.

    Args:
        X - ndarray, shape (windows, samples, channels)
        y - ndarray, shape (windows, samples), the label of each sample
        sub - ndarray, shape (windows, samples), the subject of each sample
    Global params used
        verbose: print how many windows contain NaN
    Returns:
        X, y, sub - cleaned ndarrays; y and sub have shape (windows, 1)
    """
    # any(axis=2) collapses the channels and any(axis=1) the samples, which
    # leaves one flag per window.
    # good axis explanation https://www.sharpsightlabs.com/blog/numpy-axes-explained/
    has_nan = np.isnan(X).any(axis=2).any(axis=1)
    if verbose:
        # the count is the number of windows that contain at least one NaN
        print(int(has_nan.sum()), "NaN entries found, removing")
    # A window is uniform when every sample equals the window's first sample.
    single_label = (y == y[:, :1]).all(axis=1)
    single_sub = (sub == sub[:, :1]).all(axis=1)
    keep = ~has_nan & single_label & single_sub
    X = X[keep]
    # Every kept window is uniform, so its first sample stands for the whole
    # window; indexing with [0] keeps the result a single column array.
    y = y[keep][:, [0]]
    sub = sub[keep][:, [0]]
    return X, y, sub
if interactive:
    # Demo: clean the windowed arrays, then report shapes and dtypes.
    my_X, my_y, my_sub = clean_ir2(my_X, my_y, my_sub)
    print('IR2 shapes after cleaning', my_X.shape, my_y.shape, my_sub.shape)
    headers = ("array","shape", "object type", "data type")
    mydata = [
        (tag, arr.shape, type(arr), arr.dtype)
        for tag, arr in zip(("my_X:", "my_y:", "my_sub:"),
                            (my_X, my_y, my_sub))
    ]
    print("IR2 array info")
    print(tabulate(mydata, headers=headers))

0 NaN entries found, removing
IR2 shapes after cleaning (101, 32, 19) (101, 1) (101, 1)
IR2 array info
array    shape          object type              data type
-------  -------------  -----------------------  -----------
my_X:    (101, 32, 19)  <class 'numpy.ndarray'>  float32
my_y:    (101, 1)       <class 'numpy.ndarray'>  int8
my_sub:  (101, 1)       <class 'numpy.ndarray'>  int8


In [12]:
def drop_label_ir2_ir3(X, y, sub, label_to_drop):
    """Remove the windows whose label equals label_to_drop.

    This is primarily used to remove invalid windows, such as an 'unknown'
    label.

    Args:
        X, y, sub - ndarrays with one entry per window along the first axis;
            y is normally the (windows, 1) label column from clean_ir2
        label_to_drop - label value to remove.  It has to be comparable with
            the entries of y: a string never matches integer codes, so in
            that case nothing is dropped.
    Returns:
        updated versions of X, y, sub
    """
    y_arr = np.asarray(y)
    matches = np.asarray(y_arr == label_to_drop)
    if matches.shape != y_arr.shape:
        # Some numpy versions answer a comparison between incompatible types
        # with a single False instead of an array; spread it over all windows.
        matches = np.broadcast_to(matches, y_arr.shape)
    if matches.ndim > 1:
        # One flag per window: drop when the whole label row matches.
        matches = matches.all(axis=tuple(range(1, matches.ndim)))
    keep = ~matches
    return X[keep], y[keep], sub[keep]
if interactive:
    # Demo: label histogram and array shapes on either side of the drop.
    # NOTE(review): my_y holds integer codes at this point, so the string
    # 'Undefined' is expected to match nothing - confirm the intended label.
    print("Label counts before drop")
    unique, counts = np.unique(my_y, return_counts=True)
    print (np.column_stack((unique, counts)))
    print('X, y, sub array shapes before label drop', my_X.shape, my_y.shape, my_sub.shape)
    my_X, my_y, my_sub = drop_label_ir2_ir3(my_X, my_y, my_sub, 'Undefined')
    print("Label counts after drop")
    unique, counts = np.unique(my_y, return_counts=True)
    print (np.column_stack((unique, counts)))
    print('IR2 shapes after label drop', my_X.shape, my_y.shape, my_sub.shape)
    headers = ("array","shape", "object type", "data type")
    mydata = [
        (tag, arr.shape, type(arr), arr.dtype)
        for tag, arr in zip(("my_X:", "my_y:", "my_sub:"),
                            (my_X, my_y, my_sub))
    ]
    print("IR2 array info after label drop")
    print(tabulate(mydata, headers=headers))

Label counts before drop
[[ 0 56]
 [ 2 40]
 [ 4  5]]
X, y, sub array shapes before label drop (101, 32, 19) (101, 1) (101, 1)
Label counts after drop
[[ 0 56]
 [ 2 40]
 [ 4  5]]
IR2 shapes after label drop (101, 32, 19) (101, 1) (101, 1)
IR2 array info after label drop
array    shape          object type              data type
-------  -------------  -----------------------  -----------
my_X:    (101, 32, 19)  <class 'numpy.ndarray'>  float32
my_y:    (101, 1)       <class 'numpy.ndarray'>  int8
my_sub:  (101, 1)       <class 'numpy.ndarray'>  int8


  if (y[i] == label_to_drop):


In [8]:
def limit_channel_ir3(ir3_X,
                      all_channel_list = ['accel_x', 'accel_y', 'accel_z', 'accel_ttl', 'bvp', 'eda', 'p_temp'],
                      keep_channel_list = ["accel_ttl"]):
    """Keep only the named channels of an IR3 X array.

    This would have been much easier at IR1 but that would have precluded
    channel experiments and by channel feature representations.

    Args:
        ir3_X - ndarray with all channels on its third axis
        all_channel_list - name of every channel of ir3_X in column order, as
            extracted from the IR1 dataframe column names.  The default names
            belong to one particular dataset; pass the list that matches
            ir3_X.
        keep_channel_list - names of the channels to keep; the output columns
            follow the order of this list
    Global params used
        verbose: print the kept column indices
    Returns:
        new_X - ir3_X restricted to the kept channels
    Raises:
        ValueError - if a channel to keep is not in all_channel_list
    """
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless.
    missing = [ch for ch in keep_channel_list if ch not in all_channel_list]
    if missing:
        raise ValueError("limit_channel_ir3(): channel(s) " + str(missing)
                         + " not in all_channel_list " + str(list(all_channel_list)))
    if ir3_X.shape[2] != len(all_channel_list):
        # The names cannot line up with the columns, so the wrong channels
        # may be selected.
        print("WARNING! limit_channel_ir3(): ir3_X has", ir3_X.shape[2],
              "channels but all_channel_list names", len(all_channel_list))
    ch_idx = [all_channel_list.index(ch) for ch in keep_channel_list]
    if verbose:
        print("Keeping X columns at index", ch_idx)
    new_X = ir3_X[:,:,ch_idx]
    return new_X
if interactive:
    print("all_channel_list", all_channel_list)
    print("starting X shape", my_X.shape)
    print("first row", my_X[0,0,:])
    # Pass the channel names that belong to my_X.  The function's default
    # list names the channels of a different dataset, so relying on it would
    # select columns by the wrong names.  The first two channels are kept
    # only to demonstrate the call.
    my_new_X = limit_channel_ir3(my_X,
                                 all_channel_list = all_channel_list,
                                 keep_channel_list = all_channel_list[:2])
    print("ending X shape", my_new_X.shape)
    print("first row", my_new_X[0,0,:])