In [None]:
import os
import shutil #https://docs.python.org/3/library/shutil.html
from shutil import unpack_archive # to unzip
#from shutil import make_archive # to create zip for storage
import requests #for downloading zip file
from scipy import io #for loadmat, matlab conversion
import pandas as pd
import numpy as np
import torch
#import matplotlib.pyplot as plt # for plotting - pandas uses matplotlib
from tabulate import tabulate # for verbose tables
from tensorflow.keras.utils import to_categorical # for one-hot encoding

In [None]:
#credit https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
#many other methods I tried failed to download the file properly
def download_url(url, save_path, chunk_size=128, timeout=60):
    """Stream the resource at ``url`` to the local file ``save_path``.

    The body is first written to ``<save_path>.part`` and only moved onto
    ``save_path`` once the whole response has been received, so an interrupted
    or failed download never leaves a truncated file at ``save_path``.

    Args:
        url: Address to fetch with an HTTP GET.
        save_path: Destination file path (str or path-like).
        chunk_size: Number of bytes requested per iteration of the stream.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = os.fspath(save_path) + '.part'
    try:
        # Using the response as a context manager releases the connection
        # even when writing fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as the archive.
            response.raise_for_status()
            with open(partial_path, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        fd.write(chunk)
        os.replace(partial_path, save_path)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # on any failure this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
def unimib_load_dataset(
    verbose = True,
    incl_xyz_accel = False, #include component accel_x/y/z in ____X data
    incl_rms_accel = True, #add rms value (total accel) of accel_x/y/z in ____X data
    incl_val_group = False, #True => returns x/y_train, x/y_validation, x/y_test
                           #False => validation subjects are merged into the train group
    split_subj = dict
                (train_subj = [4,5,6,7,8,10,11,12,14,15,19,20,21,22,24,26,27,29],
                validation_subj = [1,9,16,23,25,28],
                test_subj = [2,3,13,17,18,30]),
    one_hot_encode = True):
    """Download (if needed), load and split the ADL portion of UniMiB SHAR.

    The archive is fetched to ``./UniMiB-SHAR.zip`` and unpacked into
    ``./UniMiB-SHAR`` when those paths do not exist yet. The ``.mat`` files
    under ``./UniMiB-SHAR/data`` are then loaded, reshaped to
    ``(windows, 151, channels)`` and split by subject id.

    Args:
        verbose: Print tables/shapes describing the arrays at each stage.
        incl_xyz_accel: Keep the three component acceleration channels.
        incl_rms_accel: Append a total-acceleration channel computed as
            ``sqrt(x**2 + y**2 + z**2)``.
        incl_val_group: If True return a separate validation split; if False
            the validation subjects are added to the training split.
        split_subj: Mapping with keys ``'train_subj'``, ``'validation_subj'``
            and ``'test_subj'``, each a sequence of subject ids. The default
            is never modified by this function.
        one_hot_encode: If True the label arrays are one-hot encoded with
            9 classes via ``to_categorical``; otherwise they are 0-based
            integer activity ids.

    Returns:
        ``(x_train, y_train, x_validation, y_validation, x_test, y_test)``
        when ``incl_val_group`` is True, otherwise
        ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If both ``incl_xyz_accel`` and ``incl_rms_accel`` are
            False, since the feature arrays would then have zero channels.
    """
    if not (incl_xyz_accel or incl_rms_accel):
        raise ValueError(
            "At least one of incl_xyz_accel / incl_rms_accel must be True; "
            "otherwise the returned x arrays would contain no channels.")

    def _print_table(title, data, labels, names):
        # One table row per array: shape, Python type and dtype.
        rows = [("adl_data:", data.shape, type(data), data.dtype),
                ("adl_labels:", labels.shape ,type(labels), labels.dtype),
                ("adl_names:", names.shape, type(names), names.dtype)]
        print(tabulate(rows, headers=(title,"shape", "object type", "data type")))

    #Download and unzip original dataset
    if (not os.path.isfile('./UniMiB-SHAR.zip')):
        print("Downloading UniMiB-SHAR.zip file")
        #invoking the shell command fails when exported to .py file
        #redirect link https://www.dropbox.com/s/raw/x2fpfqj0bpf8ep6/UniMiB-SHAR.zip
        #!wget https://www.dropbox.com/s/x2fpfqj0bpf8ep6/UniMiB-SHAR.zip
        download_url('https://www.dropbox.com/s/raw/x2fpfqj0bpf8ep6/UniMiB-SHAR.zip','./UniMiB-SHAR.zip')
    if (not os.path.isdir('./UniMiB-SHAR')):
        shutil.unpack_archive('./UniMiB-SHAR.zip','.','zip')

    #Convert .mat files to numpy ndarrays
    path_in = './UniMiB-SHAR/data'
    #loadmat loads matlab files as dictionary, keys: header, version, globals, data
    adl_data = io.loadmat(path_in + '/adl_data.mat')['adl_data']
    adl_names = io.loadmat(path_in + '/adl_names.mat', chars_as_strings=True)['adl_names']
    adl_labels = io.loadmat(path_in + '/adl_labels.mat')['adl_labels']
    if (verbose):
        _print_table("Raw data", adl_data, adl_labels, adl_names)

    #Reshape data and compute total (rms) acceleration
    num_samples = 151
    #UniMiB SHAR has fixed size of 453 which is 151 accelX, 151 accely, 151 accelz
    #Fortran order puts the three consecutive 151-sample runs on the last axis
    adl_data = np.reshape(adl_data,(-1,num_samples,3), order='F')
    if (incl_rms_accel):
        #magnitude of the acceleration vector at every time step
        rms_accel = np.sqrt((adl_data[:,:,0]**2) + (adl_data[:,:,1]**2) + (adl_data[:,:,2]**2))
        adl_data = np.dstack((adl_data,rms_accel))
    #remove component accel if needed
    if (not incl_xyz_accel):
        adl_data = np.delete(adl_data, [0,1,2], 2)
    if (verbose):
        _print_table("Reshaped data", adl_data, adl_labels, adl_names)

    #Split train/test sets, combine or make separate validation set
    #https://numpy.org/doc/stable/reference/generated/numpy.isin.html
    act_num = (adl_labels[:,0])-1 #matlab source was 1 indexed, change to 0 indexed
    sub_num = (adl_labels[:,1]) #subject numbers are in column 1 of labels

    #list() lets callers pass tuples/arrays without '+' turning into
    #element-wise addition, and leaves the default lists untouched
    train_subjects = list(split_subj['train_subj'])
    validation_subjects = list(split_subj['validation_subj'])
    test_subjects = list(split_subj['test_subj'])

    if (incl_val_group):
        train_mask = np.isin(sub_num, train_subjects)
        validation_mask = np.isin(sub_num, validation_subjects)
        x_validation = adl_data[validation_mask]
        y_validation = act_num[validation_mask]
    else:
        train_mask = np.isin(sub_num, train_subjects + validation_subjects)
    x_train = adl_data[train_mask]
    y_train = act_num[train_mask]

    test_mask = np.isin(sub_num, test_subjects)
    x_test = adl_data[test_mask]
    y_test = act_num[test_mask]

    if (verbose):
        print("x/y_train shape ",x_train.shape,y_train.shape)
        if (incl_val_group):
            print("x/y_validation shape ",x_validation.shape,y_validation.shape)
        print("x/y_test shape  ",x_test.shape,y_test.shape)

    #If selected one-hot encode y_* using keras to_categorical, reference:
    #https://keras.io/api/utils/python_utils/#to_categorical-function and
    #https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
    if (one_hot_encode):
        y_train = to_categorical(y_train, num_classes=9)
        if (incl_val_group):
            y_validation = to_categorical(y_validation, num_classes=9)
        y_test = to_categorical(y_test, num_classes=9)
        if (verbose):
            print("After one-hot encoding")
            print("x/y_train shape ",x_train.shape,y_train.shape)
            if (incl_val_group):
                print("x/y_validation shape ",x_validation.shape,y_validation.shape)
            print("x/y_test shape  ",x_test.shape,y_test.shape)

    if (incl_val_group):
        return x_train, y_train, x_validation, y_validation, x_test, y_test
    return x_train, y_train, x_test, y_test

In [None]:
if __name__ == "__main__":
    # Demo run: load the dataset with a separate validation split and
    # integer (not one-hot) labels, then report the train/test array shapes.
    print("Downloading and processing UniMiB SHAR dataset, ADL Portion")
    (x_train, y_train,
     x_val, y_val,
     x_test, y_test) = unimib_load_dataset(one_hot_encode=False, incl_val_group=True)
    print("\nUniMiB SHAR returned arrays:")
    print("x_train shape ", x_train.shape, " y_train shape ", y_train.shape)
    print("x_test shape  ", x_test.shape, " y_test shape  ", y_test.shape)

Downloading and processing UniMiB SHAR dataset, ADL Portion
Raw data     shape        object type              data type
-----------  -----------  -----------------------  -----------
adl_data:    (7579, 453)  <class 'numpy.ndarray'>  float64
adl_labels:  (7579, 3)    <class 'numpy.ndarray'>  uint8
adl_names:   (9, 1)       <class 'numpy.ndarray'>  object
Reshaped data    shape           object type              data type
---------------  --------------  -----------------------  -----------
adl_data:        (7579, 151, 1)  <class 'numpy.ndarray'>  float64
adl_labels:      (7579, 3)       <class 'numpy.ndarray'>  uint8
adl_names:       (9, 1)          <class 'numpy.ndarray'>  object
x/y_train shape  (4601, 151, 1) (4601,)
x/y_validation shape  (1454, 151, 1) (1454,)
x/y_test shape   (1524, 151, 1) (1524,)

UniMiB SHAR returned arrays:
x_train shape  (4601, 151, 1)  y_train shape  (4601,)
x_test shape   (1524, 151, 1)  y_test shape   (1524,)


In [None]:
# Collapse each feature array to its first two dimensions (this reshape is
# only valid while the last axis holds a single channel) and wrap it in a
# DataFrame; the label arrays are wrapped in DataFrames under their own names.
x_train_df = pd.DataFrame(x_train.reshape(x_train.shape[:2]))
x_val_df = pd.DataFrame(x_val.reshape(x_val.shape[:2]))
x_test_df = pd.DataFrame(x_test.reshape(x_test.shape[:2]))
y_train, y_val, y_test = (pd.DataFrame(labels) for labels in (y_train, y_val, y_test))

## <span style='font-family:"Times New Roman"'><font color='Blue'> **For the Embedding** </font></span>

Any of the functions discussed in the paper can be used here.

In [None]:
# Now you can import the script
import scripts.embd_fgit as embd
import scripts.clasfy_p1 as clasfy
import scripts.clasfy_p2 as clasfy2
import pandas as pd
import numpy as np


In [None]:
# Scale the train/validation/test feature frames with the project helper embd.std_scaling.
# NOTE(review): presumably a standard scaler fitted on the training split only — confirm in scripts/embd_fgit.py.
train_sct, val_sct, test_sct = embd.std_scaling(x_train_df, x_val_df, x_test_df)

## For the PCA Embedding

In [None]:
# Embed the scaled splits with embd.pca_embedding.
# NOTE(review): the positional 96 is presumably the number of principal components — confirm in scripts/embd_fgit.py.
train_emb, val_emb, test_emb = embd.pca_embedding(train_sct, val_sct, test_sct,96)

In [None]:
# Save each split's embedding so the work can be reloaded later (np.loadtxt).
# Every split gets its own file: np.savetxt overwrites an existing file, so a
# shared filename would keep only the array written last.
np.savetxt('um_pca_embeddings_train.txt', train_emb)
np.savetxt('um_pca_embeddings_val.txt', val_emb)
np.savetxt('um_pca_embeddings_test.txt', test_emb)

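## Another example: the TDA embedding method

Note that the cells below reassign `train_sct`/`val_sct`/`test_sct` and `train_emb`/`val_emb`/`test_emb`, so the classification further down uses whichever embedding was computed last.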

In [None]:
# Re-scale the raw feature frames with embd.minmax_scaling; this rebinds
# train_sct/val_sct/test_sct, replacing the values produced by embd.std_scaling.
train_sct, val_sct, test_sct = embd.minmax_scaling(x_train_df, x_val_df, x_test_df)

In [None]:
# Compute the TDA embedding of the scaled splits; this rebinds
# train_emb/val_emb/test_emb, so later cells see this embedding.
train_emb, val_emb, test_emb = embd.TDA_embedding(train_sct, val_sct, test_sct)

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Sanity check: move a small tensor to the selected device.
tensor = torch.tensor([1, 2, 3]).to(device)

# Print the device the tensor actually lives on
print("Device:", tensor.device)

Device: cuda:0


In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): presumably done to keep the tuning logs readable; this also hides
# deprecation and convergence warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

## <span style='font-family:"Times New Roman"'><font color='Blue'> **For the classification** </font></span>

For this purpose we use the classification script, which runs 100 Optuna trials to tune the hyperparameters based on validation accuracy. It returns the best parameters and the running time, alongside the classification accuracy, and saves a classification report as a `.txt` file named with the system time.

In [None]:
# Shift the 0-based activity ids in the label DataFrames to 1-based values for the classifier call.
# NOTE(review): why the classifier expects 1-based labels is not visible here — confirm in scripts/clasfy_p1.py.
ny_train=y_train+1
ny_val=y_val+1
ny_test=y_test+1

In [None]:
# Run the project's clasfy.optimize_RF on the current embeddings and the shifted labels,
# then print the two values it returns.
# NOTE(review): "RF" presumably means random forest — confirm in scripts/clasfy_p1.py.
best_params, best_score = clasfy.optimize_RF(train_emb, val_emb, test_emb,ny_train, ny_val, ny_test)
print(best_params, best_score)

[I 2024-08-24 03:20:42,047] A new study created in memory with name: no-name-091ef487-20c1-4067-bce7-33db69b2615e
[I 2024-08-24 03:20:44,157] Trial 0 finished with value: 0.6746905089408528 and parameters: {'n_estimators': 169, 'max_depth': 6}. Best is trial 0 with value: 0.6746905089408528.
[I 2024-08-24 03:20:45,043] Trial 1 finished with value: 0.6939477303988996 and parameters: {'n_estimators': 40, 'max_depth': 31}. Best is trial 1 with value: 0.6939477303988996.
[I 2024-08-24 03:20:48,732] Trial 2 finished with value: 0.7028885832187071 and parameters: {'n_estimators': 176, 'max_depth': 14}. Best is trial 2 with value: 0.7028885832187071.
[I 2024-08-24 03:20:50,057] Trial 3 finished with value: 0.6499312242090785 and parameters: {'n_estimators': 124, 'max_depth': 5}. Best is trial 2 with value: 0.7028885832187071.
[I 2024-08-24 03:20:51,836] Trial 4 finished with value: 0.702200825309491 and parameters: {'n_estimators': 80, 'max_depth': 29}. Best is trial 2 with value: 0.702888583

3.0554051399230957  seconds
Classification report saved as classification_report_2024-08-24_03-26-03.txt
{'n_estimators': 138, 'max_depth': 23} 0.681758530183727
