# Load X, y, sub numpy arrays, process, and train a 1D CNN
Loads X, y, sub numpy arrays. 

Processes into train, test, and validation if needed.

This code was used to run the frequency, kernel size, and
total versus component accel data experiments.

Author:  Lee B. Hinkle, [IMICS Lab](https://imics.wp.txstate.edu/), Texas State University, 2021

<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.

TODO:
* Timing should be inside model but must be passed back out

In [2]:
import os
import shutil #https://docs.python.org/3/library/shutil.html
from shutil import unpack_archive # to unzip
#from shutil import make_archive # to create zip for storage
import requests #for downloading zip file
import glob # to generate lists of files in directory - unix style pathnames
#from scipy import io #for loadmat, matlab conversion
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt # for plotting - pandas uses matplotlib
from tabulate import tabulate # for verbose tables
from keras.utils import to_categorical # for one-hot encoding
import matplotlib.pyplot as plt # for plotting training curves
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from tensorflow import keras #added to save model
from tensorflow.keras import layers #format matches MNIST example
# to measure and display training time
import time
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder # for one-hot encoding
from sklearn.preprocessing import OneHotEncoder # for one-hot encoding
#imports for computing and displaying output metrics
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

In [3]:
verbose = False #@param {type:"boolean"}


In [4]:
#create function to toggle print level.  Note: python logging may be better
#credit https://stackoverflow.com/users/416467/kindall
#https://stackoverflow.com/questions/5980042/how-to-implement-the-verbose-or-v-option-into-a-script
if verbose:
    print("Verbose mode on")
    def vprint(*args):
        # Print each argument separately so caller doesn't need to
        # stuff everything to be printed into a single string
        for arg in args:
           print (arg, end=" ")
        print()
else: 
    print("Verbose mode off")  
    vprint = lambda *a: None      # do-nothing function

Verbose mode off


In [5]:
#Helpful functions especially inside colab
from requests import get
def what_is_my_name():
    """returns the name of the running colab ipynb file"""
    #code is readily available on web - not original
    my_name = get('http://172.28.0.2:9000/api/sessions').json()[0]['name']
    return my_name
#start output file with unique name - important for colab runs
def start_logfile(base_name = "unnamed", first_line = "nothing here"):
    """creates file in result_dir, writes first_line, environment info.
    Returns full filename concatenation of path, base_name, current UTC time"""
    result_dir = '/content/drive/My Drive/Colab_Run_Results'
    timestamp = time.strftime('%b-%d-%Y_%H%M', time.localtime()) #UTC time
    log_fname = base_name +'_'+timestamp
    full_log_fname = result_dir+'/'+log_fname+'.txt'
    print("Starting text logfile ",full_log_fname)
    with open(full_log_fname, "w") as file_object:
        header = first_line + '\n'
        header += full_log_fname +'\n'
        header += 'Generated by ' + what_is_my_name() + '\n'
        cpu_model = !grep 'model name' /proc/cpuinfo
        header += 'CPU1: ' + cpu_model[0] + '\n'
        header += 'CPU2: ' + cpu_model[1] + '\n'
        gpu_info = !nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv
        header += 'GPU: ' + str(gpu_info[1]) + '\n'
        file_object.write(header)
    return full_log_fname
#Helper function since frequently checking and logging shapes
#credit https://stackoverflow.com/users/4944093/george-petrov for name method
def namestr(obj, namespace):
    return [name for name in namespace if namespace[name] is obj]
def get_shapes(np_arr_list):
    """Returns text, each line is shape and dtype for numpy array in list
       example: print(get_shapes([X_train, X_test, y_train, y_test]))"""
    shapes = ""
    for i in np_arr_list:
        my_name = namestr(i,globals())
        shapes += (my_name[0] + " shape is " + str(i.shape) \
            + " data type is " + str(i.dtype) + "\n")
    return shapes

In [6]:
def process_xysub (input_dir = 'not_set',use_xyz = False):
    """inputs X,y,sub numpy arrays from input_dir, use_xyz = true returns
    component acceleration, false returns total acceleration,
    one-hot encodes y"""
    process_info = 'arrays loaded from'
    process_info += input_dir + '\n'
    X = np.load(input_dir + '/' + 'X.npy')
    y = np.load(input_dir + '/' +'y.npy')
    sub = np.load(input_dir + '/' +'sub.npy')
    #TODO Load info file as well
    #No clue what I cannot call get_shapes inside method
    #generates list index out of range error, works fine in scratch cell
    #shapes = get_shapes([X,y,sub])
    #print(shapes)
    #process_info += "Starting shape of loaded files\n" + shapes
    # Drop either the three component accel or the total_accel
    # TODO should name numpy columns and drop by name for versatility
    # this assumes ['accel_x','accel_y','accel_z','total_accel']
    if (use_xyz):
        process_info += "Using xyz component accel, deleting total_accel from X\n"
        X = np.delete(X, 3, 2) # delete column 4 along axis 2
    else:
        process_info += "Using total component accel, deleting accel_x/y/z from X\n"
        X = np.delete(X, [0,1,2],2) # delete columns 1-3 along axis 2
    #One-Hot-Encode y...there must be a better way when starting with strings
    #https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

    if (y.shape[1]==1):
        # integer encode
        y_vector = np.ravel(y) #encoder won't take column vector
        le = LabelEncoder()
        integer_encoded = le.fit_transform(y_vector) #convert from string to int
        name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        process_info += "One Hot Encoding:" + str(name_mapping) +"\n"
        onehot_encoder = OneHotEncoder(sparse=False)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
        y=onehot_encoded
    else:
        print ("y.shape[1] is not one, appears to be encoded already. Skipping")
    return X,y,sub,process_info

In [7]:
def split_sub_dict (X,y,sub):
    """inputs X,y,sub numpy arrays and splits into train, validate, and test
    based on the split_sub dict which is internal for now"""
    split_info = "manual split using pre-selected subjects\n"
    split_subj = {'train_subj':[2,4,5,9,10,16,18,20,23,24,26,27,28,32,34,35,
                                    36,38,42,45,46,47,48,49,50,51,52,53,54,57],
                    'validation_subj':[3,6,8,11,12,22,37,40,43,56],
                    'test_subj':[7,19,21,25,29,33,39,41,44,55]}

    train_index = np.empty([1],dtype=int) #empty list
    #vprint won't work with the end arg to suppress return
    #print("\nTrain Group adding subjects: ", end =" ")
    split_info += "\nTrain Group subjects: "
    for my_sub in split_subj['train_subj']:
        #print(my_sub, end =" ")
        split_info += str(my_sub) + ","
        #print(np.argwhere(sub == my_sub)[:,0])
        train_index = np.concatenate((train_index, np.argwhere(sub == my_sub)[:,0]))

    validation_index = np.empty([1],dtype=int) #empty list
    #print("\nValidation Group adding subjects:", end =" ")
    split_info += "\nValidation Group subjects: "
    for my_sub in split_subj['validation_subj']:
        #print(my_sub, end =" ")
        split_info += str(my_sub) + ","
        validation_index = np.concatenate((validation_index, np.argwhere(sub == my_sub)[:,0]))

    test_index = np.empty([1],dtype=int) #empty list
    #print("\nTest Group adding subjects:", end =" ")
    split_info += "\nTest Group subjects: "
    for my_sub in split_subj['test_subj']:
        #print(my_sub, end =" ")
        split_info += str(my_sub) + ","
        test_index = np.concatenate((test_index, np.argwhere(sub == my_sub)[:,0]))

    #print("\n")
    split_info += "\n"
    #delete first row placeholders
    train_index = np.delete(train_index, (0), axis=0) 
    validation_index = np.delete(validation_index, (0), axis=0) 
    test_index = np.delete(test_index, (0), axis=0)

    X_train, X_test, X_validation = X[train_index], X[test_index], X[validation_index]
    y_train, y_test, y_validation = y[train_index], y[test_index], y[validation_index]

    return  X_train, y_train, X_validation, y_validation, X_test, y_test, split_info

In [8]:
#def evaluate_model(trainX, trainy, validateX, validatey, testX, testy):
#different topology and good article here
#https://blog.goodaudience.com/introduction-to-1d-convolutional-neural-networks-in-keras-for-time-sequences-3a7ff801a2cf
def evaluate_model(trainX,trainy, validationX, validationy, 
                    batch_size=32, num_epochs=200, kernel_size = 20):
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
    model = keras.Sequential(
        [
        keras.Input(shape=(n_timesteps,n_features)),
        layers.Conv1D(filters=50, kernel_size = kernel_size, activation='relu'),
        layers.Conv1D(filters=50, kernel_size = kernel_size, activation='relu'),
        layers.Dropout(0.5),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(100, activation='relu'),
        layers.Dense(n_outputs, activation='softmax')
        ]
    )
    #model.summary()
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    start_time = time.time()
    history = model.fit(
        trainX,trainy,
        steps_per_epoch=trainX.shape[1]//batch_size,
        epochs=num_epochs,
        verbose = 0, #0 = silent, 1 = progress bar, 2 = one line per epoch.       
        validation_data=(validationX,validationy),
        validation_steps=validationX.shape[1]//batch_size)
    end_time = time.time()
    train_time = timedelta(seconds=(end_time - start_time))
    #print('Training time =',(np.str(train_time).split(".")[0]), 'HH:MM:SS')
    model.save('my_1D_CNN_model')
    return history

In [9]:
def run_model(testX, testy,batch_size = 32):
    model = keras.models.load_model('my_1D_CNN_model')
    start_time = time.time()
    predictions = model.predict(testX, verbose=0,batch_size=32)
    end_time = time.time()
    eval_time = timedelta(seconds=(end_time - start_time))
    #print('Eval time =',(np.str(eval_time).split(".")[0]), 'HH:MM:SS')

    #must use values not one-hot encoding, use argmax to convert
    y_pred = np.argmax(predictions, axis=-1) # axis=-1 means last axis
    y_test = np.argmax(testy, axis=-1)

    #print(classification_report(y_test, y_pred, target_names=ACT))
    #cm = confusion_matrix(y_test, y_pred)
    #print(cm)
    return (accuracy_score(y_test, y_pred))

In [11]:
#Just parking this here processing full list takes a several days on CPU instance
base_dir = '/content/drive/My Drive/Processed_Datasets/mobiact_xys/'
# filename, kernel size (1 second worth), batch size
worklist = [[base_dir + 'mobiact_xys_5Hz_3s', 5, 8],
          [base_dir + 'mobiact_xys_5Hz_3s', 5, 8], # first run has odd time
          [base_dir + 'mobiact_xys_10Hz_3s', 10, 8],
          [base_dir + 'mobiact_xys_15Hz_3s', 15, 8],
          [base_dir + 'mobiact_xys_20Hz_3s', 20, 8],
          [base_dir + 'mobiact_xys_25Hz_3s', 25, 8],
          [base_dir + 'mobiact_xys_30Hz_3s', 30, 8],
          [base_dir + 'mobiact_xys_40Hz_3s', 40, 8],
          [base_dir + 'mobiact_xys_60Hz_3s', 60, 8],
          [base_dir + 'mobiact_xys_80Hz_3s', 80, 8],
          [base_dir + 'mobiact_xys_100Hz_3s', 100, 8],
          [base_dir + 'mobiact_xys_no_resample_3s', 100, 8]]

worklist = [[base_dir + 'mobiact_xys_80Hz_3s', 80, 8],
          [base_dir + 'mobiact_xys_100Hz_3s', 100, 8],
          [base_dir + 'mobiact_xys_no_resample_3s', 100, 8]]

#The big loop

In [14]:
full_log_fname = start_logfile(base_name = "MobiAct_Resample_CPU", 
                                first_line = "resample batch size experiments, CPU instance, uniform batch size of 8")
base_dir = '/content/drive/My Drive/Processed_Datasets/mobiact_xys/'
ACT = ['JOG','JUM','STD','STN','STU','WAL'] #MobiAct specific
# filename, kernel size (1 second worth), batch size
worklist = [[base_dir + 'mobiact_xys_20Hz_3s', 20, 16]]

gpu_info = !nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv
print('GPU: ' + str(gpu_info[1]) + '\n')
for x in worklist:
    kernel_size = x[1]
    batch_size = x[2]
    num_epochs = 200
    pass_info = "--- processing " + str(x[0]) + "\n"
    pass_info += "--- batch_size = " + str(batch_size)
    pass_info += " kernel_size = " + str(kernel_size)
    pass_info += " num_epochs = " + str(num_epochs) + "\n"
    print (pass_info)
    with open(full_log_fname, "a") as file_object:
            file_object.write(pass_info)
    X,y,sub,src_info = process_xysub (input_dir = x[0], use_xyz = False)
    X_train, y_train, X_validation, y_validation, X_test, y_test, split_info = split_sub_dict (X,y,sub)
    #print(get_shapes([X_train, y_train, X_validation, y_validation, X_test, y_test]))

    repeats = 5;
    #for my_split in my_full_list:
    for i in range(1):
        for repeat_num in range(repeats):  #rerun current config
            # train model
            # timing should be in evaluate - but having issues passing back
            start_time = time.time()
            history = evaluate_model(X_train, y_train, X_validation, y_validation,
                                    batch_size, num_epochs, kernel_size)
            end_time = time.time()
            ttime = str(timedelta(seconds=(end_time - start_time)).total_seconds())
            # run model
            acc = run_model(X_test, y_test)
            if (repeat_num == 0):
                acc_string = "acc = ["+'{0:.3f}'.format(acc)
                ttime_string = "ttime = [" + ttime
            else:
                acc_string += ',' + '{0:.3f}'.format(acc)
                ttime_string += ',' + ttime
        acc_string += ']\n'
        ttime_string += ']\n'
        print(x[0])
        print (ttime_string)
        print (acc_string)
        with open(full_log_fname, "a") as file_object:
            file_object.write(ttime_string)
            file_object.write(acc_string) 

Starting text logfile  /content/drive/My Drive/Colab_Run_Results/MobiAct_Resample_CPU_Apr-16-2021_2024.txt
GPU: Tesla V100-SXM2-16GB, 460.32.03, 16160 MiB

--- processing /content/drive/My Drive/Processed_Datasets/mobiact_xys/mobiact_xys_20Hz_3s
--- batch_size = 16 kernel_size = 20 num_epochs = 200

INFO:tensorflow:Assets written to: my_1D_CNN_model/assets
INFO:tensorflow:Assets written to: my_1D_CNN_model/assets
INFO:tensorflow:Assets written to: my_1D_CNN_model/assets
INFO:tensorflow:Assets written to: my_1D_CNN_model/assets
INFO:tensorflow:Assets written to: my_1D_CNN_model/assets
/content/drive/My Drive/Processed_Datasets/mobiact_xys/mobiact_xys_20Hz_3s
ttime = [14.431182,13.847657,13.894559,13.920548,13.804549]

acc = [0.980,0.976,0.979,0.961,0.980]


Final Validation Accuracy: 0.993
