In [1]:
GYRO = False
SEQUENCE_LENGTH = 4
SEQUENCE_OVERLAP = 3
BATCH_SIZE = 5
EPOCHS = 40
POSITION = "sitting_or_standing"
MODEL_NAME = f"position:{POSITION}_epochs:{EPOCHS}_batch:{BATCH_SIZE}_gyro:{GYRO}_window:{SEQUENCE_LENGTH}_overlap:{SEQUENCE_OVERLAP}"
DEV_SIZE = 7
TEST_SIZE = 7
LEAVE_ONE_OUT = False # true to test leave one out
SAVE_MODEL = False
NORMALIZE = False # normalize the data
RANDOM_TEST = False # test set is different every time
TEST_DEV_OVERLAP = True # overlap in sequences for test and dev set

### This code takes in a position, eg "sitting_or_standing", "lying_down_left" etc and trains a model for just classifying activities of that position

In [2]:
import file_tagger
import sequence_genrator
import tensorflow as tf
import numpy as np
from keras import layers, Sequential, models, Model, regularizers, callbacks
import split_by_student
import time
from keras.preprocessing.image import ImageDataGenerator



POSITIONS = [
    "sitting_or_standing",
    "lying_down_back",
    "lying_down_stomach",
    "lying_down_right",
    "lying_down_left"
]

STANDING_OR_SITTING_ACTIVITIES = [
    "sitting&coughing",
    "sitting&hyperventilating",
    "sitting&normal_breathing",
    
    "standing&coughing",
    "standing&hyperventilating",
    "standing&normal_breathing",
    
    "sitting&talking",
    "sitting&eating",
    "sitting&singing",
    "sitting&laughing",
    
    "standing&talking",
    "standing&eating",
    "standing&singing",
    "standing&laughing",

]

STANDING_OR_SITTING_OUTCOMES= [
    "sitting_or_standing&normal_breathing",
    "sitting_or_standing&coughing",
    "sitting_or_standing&hyperventilating",
    "sitting_or_standing&other"
    


]


LYING_DOWN_LEFT_ACTIVITIES = [
    "lying_down_left&coughing",
    "lying_down_left&hyperventilating",
    "lying_down_left&talking",
    "lying_down_left&singing",
    "lying_down_left&laughing",
    "lying_down_left&normal_breathing"

]

LYING_DOWN_LEFT_OUTCOMES = [
    "lying_down_left&normal_breathing",
    "lying_down_left&coughing",
    "lying_down_left&hyperventilating",
    "lying_down_left&other"

]



LYING_DOWN_RIGHT_ACTIVITIES = [
    "lying_down_right&normal_breathing",
    "lying_down_right&coughing",
    "lying_down_right&hyperventilating",
    "lying_down_right&talking",
    "lying_down_right&singing",
    "lying_down_right&laughing"
]

LYING_DOWN_RIGHT_OUTCOMES = [
    "lying_down_right&normal_breathing",
    "lying_down_right&coughing",
    "lying_down_right&hyperventilating",
    "lying_down_right&other"
]

LYING_DOWN_BACK_ACTIVITIES = [
    "lying_down_back&normal_breathing",
    "lying_down_back&coughing",
    "lying_down_back&hyperventilating",
    "lying_down_back&talking",
    "lying_down_back&singing",
    "lying_down_back&laughing",
]

LYING_DOWN_BACK_OUTCOMES = [
    "lying_down_back&normal_breathing",
    "lying_down_back&coughing",
    "lying_down_back&hyperventilating",
    "lying_down_back&other",
]

LYING_DOWN_STOMACH_ACTIVITIES = [
    "lying_down_stomach&normal_breathing",
    "lying_down_stomach&coughing",
    "lying_down_stomach&hyperventilating",
    "lying_down_stomach&talking",
    "lying_down_stomach&singing",
    "lying_down_stomach&laughing",
]

LYING_DOWN_STOMACH_OUTCOMES= [
    "lying_down_stomach&normal_breathing",
    "lying_down_stomach&coughing",
    "lying_down_stomach&hyperventilating",
    "lying_down_stomach&other",
]

DATA_DIRECTORY = "./all_respeck"

position_activities = {"sitting_or_standing": STANDING_OR_SITTING_ACTIVITIES,
                       "lying_down_left": LYING_DOWN_LEFT_ACTIVITIES,
                       "lying_down_right": LYING_DOWN_RIGHT_ACTIVITIES,
                       "lying_down_back": LYING_DOWN_BACK_ACTIVITIES,
                       "lying_down_stomach": LYING_DOWN_STOMACH_ACTIVITIES}

position_outcomes = {"sitting_or_standing": STANDING_OR_SITTING_OUTCOMES,
                        "lying_down_left": LYING_DOWN_LEFT_OUTCOMES,
                        "lying_down_right": LYING_DOWN_RIGHT_OUTCOMES,
                        "lying_down_back": LYING_DOWN_BACK_OUTCOMES,
                        "lying_down_stomach": LYING_DOWN_STOMACH_OUTCOMES}


POSSIBLE_ACTIVITIES = position_activities[POSITION]
POSSIBLE_OUTCOMES = position_outcomes[POSITION]
LABEL_TO_INDEX = {label: idx for idx, label in enumerate(POSSIBLE_OUTCOMES)}

if TEST_DEV_OVERLAP:
    TEST_OVERLAP_SIZE = 0
else:
    TEST_OVERLAP_SIZE = SEQUENCE_OVERLAP

2023-11-16 22:00:21.475213: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-16 22:00:21.519226: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-16 22:00:21.706305: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 22:00:21.706345: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 22:00:21.707307: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [3]:
def generate_training_data(directory, sequence_length, overlap, file_names, gyro = GYRO, normalise = NORMALIZE): # if gyro is false, only accelerometer data is used

    tagged_data = []

    # group each csv file into their respective areas
    csv_dictionary = file_tagger.tag_directory(directory)

    # iterates through each activity
    for key in POSSIBLE_ACTIVITIES:

        # iterates through each csv file for the activity 
        for csv_file in csv_dictionary[key]:
            if csv_file in file_names:
                if gyro:
                    sequences = sequence_genrator.generate_sequences_from_file_with_gyroscope(directory + "/" + csv_file, sequence_length, overlap, normalise)
                else:
                    sequences = sequence_genrator.generate_sequences_from_file_without_gyroscope(directory + "/" + csv_file, sequence_length, overlap, normalise)

                # iterate through each generated sequence
                for sequence in sequences:
                    position = key.split("&")[0]
                    activity = key.split("&")[1]

                    if activity == "talking" or activity == "singing" or activity == "laughing" or activity == "eating":
                        activity = "other"

                    if position == "standing" or position == "sitting":
                        position = "sitting_or_standing"
                        
                    tagged_data.append((position + "&" + activity, sequence))

    print ("there are " + str(len(tagged_data)) + " tagged sequences in the dataset")

    return tagged_data

In [4]:
def augment_data(input_data, labels, num_augmentations):
    """
    Augments the input data by applying random transformations to each sequence.
    Returns the augmented data and labels.
    """
    augmented_data = []
    augmented_labels = []
    
    # iterate through each sequence in the input data
    for i in range(len(input_data)):
        sequence = input_data[i]
        label = labels[i]
        
        # apply random transformations to each sequence
        for j in range(num_augmentations):
            # Randomly shift the sequence left or right
            shift = np.random.randint(-10, 10)
            shifted_sequence = np.roll(sequence, shift, axis=0)
            
            # Randomly flip the sequence horizontally
            if np.random.random() < 0.5:
                flipped_sequence = np.flip(shifted_sequence, axis=1)
            else:
                flipped_sequence = shifted_sequence
            
            # Randomly add noise to the sequence
            noise = np.random.normal(0, 0.1, size=sequence.shape)
            noisy_sequence = flipped_sequence + noise
            
            # add the augmented sequence and label to the output lists
            augmented_data.append(noisy_sequence)
            augmented_labels.append(label)
            
            # print the progress bar
            progress = (i * num_augmentations + j + 1) / (len(input_data) * num_augmentations) * 100
            print(f"[{'=' * int(progress // 5)}{' ' * (20 - int(progress // 5))}] {progress:.2f}%\r", end="")
    
    # convert the output lists to numpy arrays and return them
    return np.array(augmented_data), np.array(augmented_labels)



In [5]:
def train_model_CNN(input_data, labels_encoded, unique_labels, epochs, batch_size, validation_data):

    augmented_data, augmented_labels = augment_data(input_data, labels_encoded, 1)
    input_data = np.concatenate((input_data, augmented_data))
    labels_encoded = np.concatenate((labels_encoded, augmented_labels))
    
    if GYRO:
        width = 6
    else:
        width = 3
    # Define the CNN model for your specific input shape
    model = Sequential([
        layers.Conv1D(32, 3, activation='relu', input_shape=(SEQUENCE_LENGTH*25, width*2)),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        #layers.Dropout(0.5),
        layers.Conv1D(128, 3, activation='relu'),
        layers.Dropout(0.5),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu'),
        layers.MaxPooling1D(2),
        #layers.Conv1D(512, 3, activation='relu'),
        #layers.MaxPooling1D(2),
        layers.Dropout(0.5),
        layers.Flatten(),
        #layers.Dense(256, activation='relu'),
        #layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu', activity_regularizer=regularizers.l2(0.01)),
        layers.Dense(len(unique_labels), activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    # Train the CNN model
    if len(validation_data[0]) == 0:
        model.fit(input_data, labels_encoded, epochs=epochs, batch_size=batch_size)
    else:
        model.fit(input_data, labels_encoded, epochs=epochs, batch_size=batch_size, validation_data=validation_data, callbacks=[early_stopping])

    return model

In [6]:
def fft(data):

    # Extract x, y, and z data
    x_data = data[:, 0]
    y_data = data[:, 1]
    z_data = data[:, 2]

    # Apply FFT to each axis
    x_fft = np.fft.fft(x_data)
    y_fft = np.fft.fft(y_data)
    z_fft = np.fft.fft(z_data)

    # The result is complex numbers, so you may want to take the magnitude
    x_magnitude = np.abs(x_fft)
    y_magnitude = np.abs(y_fft)
    z_magnitude = np.abs(z_fft)

    # If needed, you can also compute the corresponding frequencies
    # The frequencies are in cycles per time unit (usually, Hz if your time unit is seconds)
    x_frequencies = np.fft.fftfreq(len(x_data))
    y_frequencies = np.fft.fftfreq(len(y_data))
    z_frequencies = np.fft.fftfreq(len(z_data))

    representation = []
    for i in range(len(x_magnitude)):
        representation.append([x_magnitude[i], y_magnitude[i], z_magnitude[i]])

    return representation

In [7]:
def extract_features(train_data, dev_data, test_data):
    train_features = [fft(sequence) for sequence in train_data]
    dev_features = [fft(sequence) for sequence in dev_data]
    test_features = [fft(sequence) for sequence in test_data]

    return train_features, dev_features, test_features


In [8]:
def create_sequence_label_lists(tagged_sequences):
    sequences = [sequence for _, sequence in tagged_sequences]
    labels = [label for label, _ in tagged_sequences]
    sequences = np.array(sequences, dtype=np.float32)
    labels_encoded = [LABEL_TO_INDEX[label] for label in labels]
    labels = np.array(labels_encoded)

    return sequences, labels
    

def create_data_sets(dev_size, test_size):

    training_files, dev_files, test_files = split_by_student.split_data(students_in_dev_set= dev_size, students_in_test_set=test_size,randomise=RANDOM_TEST)

    tagged_training_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, SEQUENCE_OVERLAP, file_names=training_files)
    tagged_dev_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, TEST_OVERLAP_SIZE, file_names=dev_files)
    tagged_test_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, TEST_OVERLAP_SIZE, file_names=test_files)

    train_data, train_labels = create_sequence_label_lists(tagged_training_sequences)
    dev_data, dev_labels = create_sequence_label_lists(tagged_dev_sequences)
    test_data, test_labels = create_sequence_label_lists(tagged_test_sequences)


    #print(len(train_data), len(train_labels), len(dev_data), len(dev_labels), len(test_data), len(test_labels))

    return train_data, train_labels, dev_data, dev_labels, test_data, test_labels

In [9]:
def merge_arrays(arr1, arr2):
    return np.concatenate((arr1, arr2), axis=1)


In [10]:
"""def leave_one_out():
    students = split_by_student.get_list_of_stutents()
    test_accuracies = []
    for test_student in students:
        print("testing student: " + str(test_student))
        print("training students: " + str([student for student in students if student != test_student]))
        
        test_files, training_files = split_by_student.get_list_of_files(test_student)

        tagged_training_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, SEQUENCE_OVERLAP, file_names=training_files)
        tagged_test_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, SEQUENCE_OVERLAP, file_names=test_files)

        train_data, train_labels = create_sequence_label_lists(tagged_training_sequences)
        test_data, test_labels = create_sequence_label_lists(tagged_test_sequences)

        model = train_model_CNN(train_data, train_labels, POSSIBLE_OUTCOMES, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=([], [])) #batch_size, epochs
        test_loss, test_accuracy = model.evaluate(test_data, test_labels)
        test_accuracies.append(test_accuracy)
        print("for student " + str(test_student) + " the accuracy is " + str(test_accuracy))
        print("average accuracy so far: " + str(sum(test_accuracies)/len(test_accuracies)))
        print("number of students tested so far: " + str(len(test_accuracies)))
        time.sleep(3)

        
    print("Accuracy for each student:")
    print(", ".join([f"{student}: {accuracy}" for student, accuracy in zip(students, test_accuracies)]))
    print("Average overall accuracy:", sum(test_accuracies)/len(test_accuracies))
"""

'def leave_one_out():\n    students = split_by_student.get_list_of_stutents()\n    test_accuracies = []\n    for test_student in students:\n        print("testing student: " + str(test_student))\n        print("training students: " + str([student for student in students if student != test_student]))\n        \n        test_files, training_files = split_by_student.get_list_of_files(test_student)\n\n        tagged_training_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, SEQUENCE_OVERLAP, file_names=training_files)\n        tagged_test_sequences = generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, SEQUENCE_OVERLAP, file_names=test_files)\n\n        train_data, train_labels = create_sequence_label_lists(tagged_training_sequences)\n        test_data, test_labels = create_sequence_label_lists(tagged_test_sequences)\n\n        model = train_model_CNN(train_data, train_labels, POSSIBLE_OUTCOMES, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=([], [])) #batch_siz

In [11]:

def normalise(sequence):
    """
    Normalizes a matrix of accelerometer values.
    """
    norm = np.linalg.norm(sequence, axis=1)
    norm[norm == 0] = 1
    return sequence / norm[:, np.newaxis]
    

In [12]:
#if LEAVE_ONE_OUT:
#    leave_one_out()
#    exit()


train_data, train_labels, dev_data, dev_labels, test_data, test_labels = create_data_sets(dev_size=DEV_SIZE, test_size=TEST_SIZE)

train_transform, dev_transform, test_transform = extract_features(train_data, dev_data, test_data)

train_data = [normalise(sequence) for sequence in train_data]
dev_data = [normalise(sequence) for sequence in dev_data]
test_data = [normalise(sequence) for sequence in test_data]

train_data = np.array(train_data)
train_transform = np.array(train_transform)
dev_data = np.array(dev_data)
dev_transform = np.array(dev_transform)
test_data = np.array(test_data)
test_transform = np.array(test_transform)




train_merged = np.array([merge_arrays(train_data[i], train_transform[i]) for i in range(len(train_data))])
dev_merged = np.array([merge_arrays(dev_data[i], dev_transform[i]) for i in range(len(dev_data))])
test_merged = np.array([merge_arrays(test_data[i], test_transform[i]) for i in range(len(test_data))])



print("Shape of train_data:", np.shape(train_data))
print("Shape of train_labels:", np.shape(train_labels))
print("Shape of train_transform:", np.shape(train_transform))
print("Shape of dev_data:", np.shape(dev_data))
print("Shape of dev_labels:", np.shape(dev_labels))
print("Shape of dev_transform:", np.shape(dev_transform))
print("Shape of test_data:", np.shape(test_data))
print("Shape of test_labels:", np.shape(test_labels))
print("Shape of test_transform:", np.shape(test_transform))





# train and save model (CHOOSE BETWEEN CNN AND LSTM)
model = train_model_CNN(train_merged, train_labels, POSSIBLE_OUTCOMES, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(dev_merged, dev_labels)) #batch_size, epochs


# Evaluate the model on the test set
if TEST_SIZE >0:
       test_loss, test_accuracy = model.evaluate(test_merged, test_labels)
       print (f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

print("labels by index:")
for label, index in LABEL_TO_INDEX.items():
    print(f"{index}: {label}")

#if SAVE_MODEL:
# Save the trained model
#    model.save(f"models/demo_models/lying_down_back_model.keras")

Train Set: s17, s84, s46, s96, s92, s45, s39, s72, s3, s9, s8, s71, s97, s80, s102, s35, s42, s11, s98, s12, s36, s7, s52, s100, s88, s33, s77, s34, s67, s1, s66, s57, s51, s27, s54, s38, s63, s59, s91, s30, s55, s74, s15, s56, s87, s21, s5, s86, s70, s43, s95, s13, s23, s82, s48, s65, s22
Dev Set: s40, s64, s29, s93, s44, s32, s79
Test Set: s18, s60, s83, s16, s61, s50, s75


there are 21410 tagged sequences in the dataset
there are 686 tagged sequences in the dataset
there are 686 tagged sequences in the dataset
