In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

from tensorflow import set_random_seed
import pickle
import tensorflow as tf
import pandas as pd
import random
from keras.utils import np_utils, to_categorical
from collections import Counter
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle
import keras

###### Reading datafiles. 
This function creates DataFrames for the train, validation and test datasets used for training the machine learning model. The read_data function takes two arguments: the name of the test participant (e.g. 'GOTOV05') and the folder containing the accelerometer data. The validation set contains 2 participants selected at random, while the participant whose name is passed as an argument is used as the test subject. The remaining participants are used for training the model. 

In [None]:
def read_data(name, folder = '/data/gotov_data/geneActive/'):
    """Load the per-participant CSV files and split them into train/val/test.

    Parameters
    ----------
    name : str
        Participant id (file name without ``.csv``) used as the test subject.
    folder : str
        Directory holding one ``<participant>.csv`` file per participant.
        A trailing path separator is optional.

    Returns
    -------
    (df_train, df_val, df_test) : tuple of pandas.DataFrame
        Each frame holds the nine sensor columns plus ``time``, ``labels``
        and ``participant``, in that order. Rows with any missing value are
        dropped. Up to two randomly chosen participants from a fixed
        candidate pool form the validation set, ``name`` forms the test set
        and every other non-excluded participant goes to training.
    """
    cols = ['ankle_x', 'ankle_y', 'ankle_z', 'wrist_x', 'wrist_y', 'wrist_z',
            'chest_x', 'chest_y', 'chest_z','time', 'labels', 'participant']
    # The test subject and a fixed list of participants never enter train/val.
    excluded = {name, 'GOTOV03','GOTOV16', 'GOTOV23','GOTOV04',
                'GOTOV02','GOTOV19', 'GOTOV12'}
    validation_pool = ['GOTOV30','GOTOV08', 'GOTOV33','GOTOV17',
                       'GOTOV35','GOTOV31', 'GOTOV10','GOTOV21','GOTOV28',
                       'GOTOV09','GOTOV18','GOTOV07','GOTOV20', 'GOTOV11',
                       'GOTOV29', 'GOTOV27']

    def _load(participant):
        # Read one participant file, drop incomplete rows and tag the rows.
        df = pd.read_csv(os.path.join(folder, participant + ".csv"), header = 0,
                         index_col = None,
                         low_memory=False).dropna(axis=0, how='any')
        df['participant'] = participant
        return df

    def _combine(frames):
        # pd.concat refuses an empty list, so fall back to an empty frame.
        if not frames:
            return pd.DataFrame(columns=cols)
        return pd.concat(frames)[cols]

    # Sorted so that the row order of the result does not depend on the
    # arbitrary order in which glob lists the directory.
    on_disk = sorted(os.path.splitext(os.path.basename(path))[0]
                     for path in glob.glob(os.path.join(folder, '*.csv')))
    data_to_select_random = [p for p in on_disk if p not in excluded]

    # Only sample from candidates that are actually going to be loaded;
    # otherwise picking the test subject (or a missing file) would silently
    # shrink the validation set.
    candidates = [p for p in validation_pool if p in data_to_select_random]
    validation_data = random.sample(candidates, min(2, len(candidates)))

    train_frames, val_frames = [], []
    for patient in data_to_select_random:
        if patient in validation_data:
            print('loading val file', patient)
            val_frames.append(_load(patient))
        else:
            print('loading training file', patient)
            train_frames.append(_load(patient))

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_train = _combine(train_frames)
    df_val = _combine(val_frames)
    df_test = _load(name)[cols]
    print('Done creating DataFrame for all files')

    return df_train, df_val, df_test 

#### Standardization of the datasets. 
Here we standardize all the feature values to have zero mean and a standard deviation of 1 using the StandardScaler from scikit-learn. 

In [None]:
def standardizing_data(Xtrain, Xval, Xtest): 
    """Standardize the feature columns of the three splits.

    The scaler is fitted on the training features only and then applied to
    the training, test and validation features. The ``time``, ``labels`` and
    ``participant`` columns are passed through unscaled and re-attached
    after the scaled features.

    Returns the scaled (train, val, test) DataFrames; each has a fresh
    default index and the twelve column names listed in ``cols``.
    """
    print('Scaling data....')
    passthrough = ['time','labels', 'participant']
    cols = ['ankle_x', 'ankle_y', 'ankle_z', 'wrist_x', 'wrist_y', 'wrist_z', 
            'chest_x', 'chest_y', 'chest_z','time', 'labels', 'participant']

    def _split(frame):
        # Features as a DataFrame, pass-through columns as a raw array.
        return frame.drop(passthrough, axis=1), frame[passthrough].values

    def _as_float32(features):
        return features.values.astype('float32')

    def _rebuild(scaled, extras):
        # Scaled features first, then the untouched columns, renamed in bulk.
        merged = pd.concat([pd.DataFrame(scaled), pd.DataFrame(extras)], axis=1)
        merged.columns = cols
        return merged

    train_features, train_extras = _split(Xtrain)
    test_features, test_extras = _split(Xtest)
    val_features, val_extras = _split(Xval)

    scaler = StandardScaler()

    train_array = _as_float32(train_features)
    scaler.fit(train_array)
    train_scaled = scaler.transform(train_array)

    test_scaled = scaler.transform(_as_float32(test_features))
    val_scaled = scaler.transform(_as_float32(val_features))

    X_train = _rebuild(train_scaled, train_extras)
    X_val = _rebuild(val_scaled, val_extras)
    X_test = _rebuild(test_scaled, test_extras)

    print('Done scaling data....')
    return X_train, X_val, X_test

### Build final sequences
The sequence_builder function accepts a DataFrame returned by the standardizing_data function above and transforms the inputs into the sequences required for training the machine learning model. The to_sequences function is called within the sequence_builder function. It accepts four required arguments, i.e., the sequence size, the input features (X_values), the target variable (y_label) and delta, plus an optional deltamax threshold. 

In [1]:
def to_sequences(SEQUENCE_SIZE, X_values, y_label, delta, deltamax = 1):
    """Cut a block of consecutive rows into fixed-length sequences.

    Parameters
    ----------
    SEQUENCE_SIZE : int
        Number of consecutive rows per sequence.
    X_values : numpy.ndarray
        2-D array (rows x features). Trailing rows that do not fill a
        complete sequence are discarded.
    y_label : sequence or pandas.Series
        Labels belonging to the rows of ``X_values``. Only the first element
        is used; it is repeated once per kept sequence.
    delta : array-like
        Per-row gap to the previous row (the caller in this notebook passes
        seconds), aligned with ``X_values``.
    deltamax : number
        A sequence is kept only when its largest gap is strictly below this.

    Returns
    -------
    X_values : numpy.ndarray of shape (n_kept, SEQUENCE_SIZE, n_features)
    y_label : list of length n_kept
    """
    n_features = X_values.shape[1]
    # Largest row count that is a whole multiple of the sequence size.
    idx_max = (X_values.shape[0] // SEQUENCE_SIZE) * SEQUENCE_SIZE
    # Make sequences
    X_values = X_values[:idx_max].reshape(-1, SEQUENCE_SIZE, n_features)

    # Keep only the sequences whose largest gap stays below deltamax.
    largest_gap = np.asarray(delta[:idx_max]).reshape((-1, SEQUENCE_SIZE)).max(axis=1)
    keep = largest_gap < deltamax
    X_values = X_values[keep, :, :]

    n_sequences = X_values.shape[0]
    if n_sequences == 0:
        # Nothing survived, so there is no need to look at the labels.
        return X_values, []

    # Take the first label by position: y_label[0] on a pandas Series is a
    # label lookup and fails (or is deprecated) when the index is not 0..n-1.
    first_label = y_label.iloc[0] if hasattr(y_label, 'iloc') else y_label[0]

    return X_values, [first_label] * n_sequences

def sequence_builder(df_f, sequence_size = 200, deltamax = 1):
    """Turn a standardized DataFrame into fixed-length labelled sequences.

    For every (participant, label) combination present in ``df_f`` the
    matching rows are cut into sequences of ``sequence_size`` rows by
    ``to_sequences``; sequences containing a time gap of ``deltamax``
    seconds or more are discarded there.

    Parameters
    ----------
    df_f : pandas.DataFrame
        Must contain the columns ``time``, ``labels`` and ``participant``;
        all remaining columns are used as features. The index is interpreted
        as milliseconds. The caller's frame is left unmodified.
        NOTE(review): standardizing_data returns frames with a fresh default
        index, so the gaps computed here reflect row positions rather than
        the ``time`` column - confirm this is intended.
    sequence_size : int
        Rows per sequence (default 200, the value previously hard-coded).
    deltamax : number
        Gap threshold in seconds forwarded to ``to_sequences``.

    Returns
    -------
    xv : numpy.ndarray of shape (n_sequences, sequence_size, n_features)
    yv : list with one label per sequence

    Raises
    ------
    ValueError
        If ``df_f`` yields no (participant, label) block at all.
    """
    print('Building sequences.....')
    seqX, seqY = [], []
    # Shallow copy so that replacing the index does not alter the caller's frame.
    frame = df_f.copy(deep=False)
    frame.index = pd.to_datetime(frame.index, unit = "ms")

    all_labels = frame['labels'].unique()
    for participant in frame['participant'].unique():
        of_participant = frame['participant'] == participant
        for label in all_labels:
            # Boolean masks instead of a string-built query: safe for labels
            # containing quotes and for non-string labels.
            block = frame[of_participant & (frame['labels'] == label)]
            if block.shape[0] == 0:
                continue
            # total_seconds() keeps whole-day gaps that .dt.seconds would drop;
            # np.array makes a writable copy so the first entry can be set.
            gaps = block.index.to_series().diff().dt.total_seconds()
            deltaT = np.array(gaps.values, dtype=float)
            deltaT[0] = 0  # the first row has no predecessor (diff gives NaN)
            y_label = block['labels']
            X_values = block.drop(['time', 'labels','participant'], axis=1).values
            x, y = to_sequences(sequence_size, X_values, y_label, deltaT,
                                deltamax = deltamax)
            seqX.append(x)
            seqY.append(y)

    if not seqX:
        raise ValueError('sequence_builder: no rows to build sequences from')
    xv = np.vstack(seqX)
    yv = [item for sublist in seqY for item in sublist]
    print('Done building sequences....')
    return xv, yv


def encode_permute_data(X_train, y_train, X_val, y_val, X_test, y_test):
    """Encode the string labels of the three splits as class indices.

    A LabelEncoder is fitted on the union of the training and validation
    labels. The validation and test labels are additionally one-hot encoded
    with one column per class known to the encoder, so both matrices always
    have the same width. The feature arrays are returned untouched.

    NOTE(review): y_train is returned as integer class codes, not one-hot,
    unlike y_val and y_test - confirm downstream code expects this.
    NOTE(review): despite the function name, no permutation/shuffling is
    performed here.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test, encoder

    Raises
    ------
    ValueError
        Raised by the encoder when y_test contains a label that occurs in
        neither y_train nor y_val.
    """
    print('Encoding data....')
    
    y_train = pd.Series(y_train)
    y_val= pd.Series(y_val)
    all_labels = pd.concat([y_train, y_val])
    
    encoder = LabelEncoder()
    encoder.fit(all_labels)
    n_classes = len(encoder.classes_)
    
    y_train = encoder.transform(y_train)
    y_val = encoder.transform(y_val)
    y_test = encoder.transform(y_test)
    
    # Fix the width explicitly: without num_classes the one-hot width is
    # max(code) + 1, which shrinks when a split lacks the highest class.
    y_test = np_utils.to_categorical(y_test, num_classes=n_classes)
    y_val = np_utils.to_categorical(y_val, num_classes=n_classes)

    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
    return X_train, y_train, X_val, y_val, X_test, y_test, encoder

In [3]:
# Leave-one-participant-out: every remaining participant id becomes the test
# subject once, and the resulting arrays are pickled to one file per subject.

# Ids skipped as test subjects; the same participants are excluded in read_data.
skipped_ids = [12, 19, 23, 16]

# NOTE(review): the original cell referenced a variable `folder` that is not
# defined anywhere in the notebook (NameError). 'sequences' is a placeholder
# for the sub-directory of /home/s1931628/ that should receive the pickles -
# TODO confirm the intended name.
folder = 'sequences'
output_dir = os.path.join('/home/s1931628', folder)
os.makedirs(output_dir, exist_ok=True)

# read_data picks the validation participants with random.sample; seed it so
# that a re-run produces the same splits.
random.seed(42)

for i in range(5,37):
    if i in skipped_ids:
        continue
    name = 'GOTOV{:02d}'.format(i)  # zero-padded, e.g. GOTOV05
    print('Build sequence for test patient', name)
    df_train, df_val, df_test = read_data(name)
    Xtrain, Xval, Xtest = standardizing_data(df_train, df_val, df_test)
    X_train, y_train = sequence_builder(Xtrain)
    X_val, y_val = sequence_builder(Xval)
    X_test, y_test = sequence_builder(Xtest)
    X_train, y_train, X_val, y_val, X_test, y_test, encoder = encode_permute_data(X_train, y_train, X_val, y_val, X_test, y_test)
    with open(os.path.join(output_dir, name + '.pkl'), 'wb') as f: 
        pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test, encoder), f)