# leotta_2021_load_dataset.ipynb
This dataset loader uses the leotta_2021_get_X_y_sub.py file, which is generated by downloading the Python (.py) version of the Jupyter notebook of the same name.

It performs a train/test (and optional validation) split by subject number and one-hot encodes the activity labels. It returns x/y_train and x/y_test (plus x/y_validation when the validation group is requested) numpy arrays that may be fed directly into a neural net model.

Example usage:

    x_train, y_train, x_test, y_test = leotta_2021_load_dataset()
  

Developed and tested using colab.research.google.com
IMPORTANT: a high-RAM runtime is required. Select runtime > change type > shape = high RAM.  
To save as .py version use File > Download .py

Author:  [Lee B. Hinkle](https://userweb.cs.txstate.edu/~lbh31/), [IMICS Lab](https://imics.wp.txstate.edu/), Texas State University, 2021

<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.

TODOs:
* Very early version derived from e4_load_data.
* Removing component accel depends on the exact column order of X (hard-coded indices) - use numpy column names instead?


In [3]:
# Google Drive must already be mounted in the Colab session before running
# this notebook; this cell only records a path and does not mount anything.
# Set my_path to the directory where the git repo was cloned. Later cells
# build the location of the get_X_y_sub helper script relative to it.
my_path = '/content/drive/My Drive/Colab Notebooks/imics_lab_repositories/load_data_time_series_dev'

In [1]:
import os
import shutil #https://docs.python.org/3/library/shutil.html
from shutil import unpack_archive # to unzip
import requests #for downloading zip file
import numpy as np
from tabulate import tabulate # for verbose tables, showing data
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical # for one-hot encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Use get_X_y_sub to get partially processed numpy arrays.
# The helper script lives inside the cloned repo (under my_path). It is copied
# into the current working directory first - presumably so that the plain
# `from ... import` below can resolve it; confirm if the notebook is run
# outside of Colab.
# Build the path from real components: os.path.join with a single,
# pre-concatenated argument is a no-op.
full_filename = os.path.join(my_path, 'ADL', 'Leotta_2021', 'leotta_2021_get_x_y_sub.py')
shutil.copy(full_filename,'leotta_2021_get_x_y_sub.py')
from leotta_2021_get_x_y_sub import get_X_y_sub

In [15]:
def leotta_2021_load_dataset(
    verbose = True,
    incl_xyz_accel = False, # include component accel_x/y/z in ____X data
    incl_rms_accel = True, # add rms value (total accel) of accel_x/y/z in ____X data
    incl_val_group = False, # split train into train and validate
    split_subj = dict
                (train_subj = [1,2,7,8],
                validation_subj = [3,6],
                test_subj = [4,5]),
    one_hot_encode = True # make y into multi-column one-hot, one for each activity
    ):
    """Load the Leotta 2021 arrays and split them by subject into train/test sets.

    Calls get_X_y_sub on the dataset zip file and processes the returned
    X, y and sub arrays: optionally drops feature columns, optionally
    one-hot encodes the labels, then selects rows by subject number according
    to the split_subj dictionary.

    Args:
        verbose: when True, print a table with the shape, object type and
            dtype of every returned array.
        incl_xyz_accel: keep the component accel_x/y/z columns in X.
        incl_rms_accel: keep the total accel (``*_accel_ttl``) columns in X.
        incl_val_group: when True the validation subjects are returned as a
            separate x/y_validation pair; when False they are merged into the
            training set.
        split_subj: dict with the keys 'train_subj', 'validation_subj' and
            'test_subj', each a list of subject numbers. It is only read,
            never modified.
        one_hot_encode: when True, y is replaced by a one-hot matrix with one
            column per distinct label; when False, y is returned as produced
            by get_X_y_sub.

    Returns:
        (x_train, y_train, x_test, y_test) when incl_val_group is False, or
        (x_train, y_train, x_validation, y_validation, x_test, y_test) when
        incl_val_group is True.

    Raises:
        ValueError: if both incl_xyz_accel and incl_rms_accel are False, since
            that would remove every feature column from X.
    """
    # Fail fast with a clear message. Previously this combination failed with
    # an out-of-bounds index inside np.delete, after the data had been loaded.
    if (not incl_xyz_accel) and (not incl_rms_accel):
        raise ValueError("incl_xyz_accel and incl_rms_accel cannot both be False:"
                         " no accel features would remain in X")

    orig_zipfile = '/content/drive/My Drive/Datasets/ADL_Leotta_2021.zip'
    X, y, sub, xys_info = get_X_y_sub(orig_zipfile=orig_zipfile)

    # Column indices on axis 2 of X are order specific. They assume the 12
    # feature layout reported by get_X_y_sub: x, y, z, ttl for each of ankle,
    # hip and wrist. All indices refer to the original layout and are removed
    # in a single np.delete call so they cannot shift between deletions.
    drop_cols = []
    if (not incl_xyz_accel):
        print("Removing component accel")
        drop_cols += [0,1,2,4,5,6,8,9,10]
    if (not incl_rms_accel):
        print("Removing total accel")
        drop_cols += [3,7,11]
    if drop_cols:
        X = np.delete(X, drop_cols, 2)

    #One-Hot-Encode y...there must be a better way when starting with strings
    #https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
    if (one_hot_encode):
        # integer encode
        y_vector = np.ravel(y) #encoder won't take column vector
        le = LabelEncoder()
        integer_encoded = le.fit_transform(y_vector) #convert from string to int
        name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print("One-hot-encoding: category names -> int -> one-hot")
        print(name_mapping) # seems risky as interim step before one-hot
        # Use the encoder's default sparse output and densify it explicitly.
        # The `sparse=False` keyword was renamed to `sparse_output` in newer
        # scikit-learn releases, so passing either name ties the code to a
        # version range; .toarray() works with both.
        onehot_encoder = OneHotEncoder()
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        y = onehot_encoder.fit_transform(integer_encoded).toarray()
        print("One-hot-encoding",onehot_encoder.categories_)

    # split by subject number pass in dictionary
    sub_num = np.ravel(sub[ : , 0] ) # 1-D vector of subject numbers, one per row of X
    if (not incl_val_group):
        # no separate validation set: validation subjects are used for training
        train_index = np.nonzero(np.isin(sub_num, split_subj['train_subj'] +
                                        split_subj['validation_subj']))
        x_train = X[train_index]
        y_train = y[train_index]
    else:
        train_index = np.nonzero(np.isin(sub_num, split_subj['train_subj']))
        x_train = X[train_index]
        y_train = y[train_index]

        validation_index = np.nonzero(np.isin(sub_num, split_subj['validation_subj']))
        x_validation = X[validation_index]
        y_validation = y[validation_index]

    test_index = np.nonzero(np.isin(sub_num, split_subj['test_subj']))
    x_test = X[test_index]
    y_test = y[test_index]

    # Summary table. This used to sit after the return statements and was
    # therefore never executed; it now runs before returning.
    if (verbose):
        headers = ("Reshaped data","shape", "object type", "data type")
        mydata = [("x_train:", x_train.shape, type(x_train), x_train.dtype),
                ("y_train:", y_train.shape ,type(y_train), y_train.dtype)]
        if (incl_val_group):
            mydata += [("x_validation:", x_validation.shape, type(x_validation), x_validation.dtype),
                    ("y_validation:", y_validation.shape ,type(y_validation), y_validation.dtype)]
        mydata += [("x_test:", x_test.shape, type(x_test), x_test.dtype),
                ("y_test:", y_test.shape ,type(y_test), y_test.dtype)]
        print(tabulate(mydata, headers=headers))

    if (incl_val_group):
        return x_train, y_train, x_validation, y_validation, x_test, y_test
    return x_train, y_train, x_test, y_test

In [16]:
if __name__ == "__main__":
    # Smoke test: run the loader in both modes and report the array shapes.
    print("Downloading and processing Leotta 2021 dataset")

    # Default mode - validation subjects are folded into the training set.
    x_train, y_train, x_test, y_test = leotta_2021_load_dataset()
    print("\nreturned arrays without validation group:")
    for x_label, x_arr, y_label, y_arr in (
            ("x_train shape ", x_train, " y_train shape ", y_train),
            ("x_test shape  ", x_test, " y_test shape  ", y_test)):
        print(x_label, x_arr.shape, y_label, y_arr.shape)

    # Validation mode - six arrays come back instead of four.
    (x_train, y_train,
     x_validation, y_validation,
     x_test, y_test) = leotta_2021_load_dataset(incl_val_group=True)
    print("\nreturned arrays with validation group:")
    for x_label, x_arr, y_label, y_arr in (
            ("x_train shape ", x_train, " y_train shape ", y_train),
            ("x_validation shape ", x_validation, " y_validation shape ", y_validation),
            ("x_test shape  ", x_test, " y_test shape  ", y_test)):
        print(x_label, x_arr.shape, y_label, y_arr.shape)

Downloading and processing Leotta 2021 dataset
Unzipping Leotta 2021 dataset
Using source file /content/drive/My Drive/Datasets/ADL_Leotta_2021.zip
Processing subject number 1
Processing:  /content/dataset/ankle/ankle_X_01.csv /content/dataset/ankle/ankle_Y_01.csv
Processing:  /content/dataset/hip/hip_X_01.csv /content/dataset/hip/hip_Y_01.csv
Processing:  /content/dataset/wrist/wrist_X_01.csv /content/dataset/wrist/wrist_Y_01.csv
Resample: Original/New # rows =  780800 305000
confirmed label and sub match - dropping from ankle and hip
Using 12 features ['ankle_accel_x', 'ankle_accel_y', 'ankle_accel_z', 'ankle_accel_ttl', 'hip_accel_x', 'hip_accel_y', 'hip_accel_z', 'hip_accel_ttl', 'wrist_accel_x', 'wrist_accel_y', 'wrist_accel_z', 'wrist_accel_ttl']
No NaN entries found
None
Processing subject number 2
Processing:  /content/dataset/ankle/ankle_X_02.csv /content/dataset/ankle/ankle_Y_02.csv
Processing:  /content/dataset/hip/hip_X_02.csv /content/dataset/hip/hip_Y_02.csv
Processing:  