In [85]:
# --- Run configuration --------------------------------------------------------
# When False only accelerometer data is used; when True the gyroscope-aware
# sequence generator is called instead.
GYRO = False
# Window length handed to the sequence generator for every split.
# NOTE(review): units (samples vs. seconds) are defined by helpers.sequence_generator — confirm.
SEQUENCE_LENGTH = 5
# Overlap between consecutive windows of the training split (dev/test use
# TEST_SEQUENCE_OVERLAP, derived from OVERLAP_ON_TEST_SET).
SEQUENCE_OVERLAP = 3
# NOTE(review): BATCH_SIZE and EPOCHS only feed MODEL_NAME in the code visible
# here; the Random Forest does not use them.
BATCH_SIZE = 10
EPOCHS = 30
# Selects which position's activity/outcome lists are used (key into
# position_activities / position_outcomes).
POSITION = "sitting_or_standing"
# Descriptive identifier assembled from the settings above.
MODEL_NAME = f"position:{POSITION}_epochs:{EPOCHS}_batch:{BATCH_SIZE}_gyro:{GYRO}_window:{SEQUENCE_LENGTH}_overlap:{SEQUENCE_OVERLAP}"
# Number of students placed in the dev and test splits
# (passed to split_by_student.split_data).
DEV_SIZE = 0
TEST_SIZE = 15
# NOTE(review): LEAVE_ONE_OUT and SAVE_MODEL are not referenced in the code visible here.
LEAVE_ONE_OUT = False
SAVE_MODEL = False
# Forwarded as `normalise=` to the sequence generator.
NORMALIZE = True
# False -> dev/test windows are generated with overlap 0; True -> they reuse SEQUENCE_OVERLAP.
OVERLAP_ON_TEST_SET = False
# Feature toggles: append FFT magnitudes, first differences (np.diff) and
# np.gradient estimates to every sequence.
TRANSFORM = True
DIFFERENTIALS = True
GRADIENTS = True
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

### This notebook takes a position (e.g. "sitting_or_standing", "lying_down_left") and trains a model that classifies only the activities recorded in that position.

In [86]:
import helpers.file_tagger as file_tagger
import helpers.sequence_generator as sequence_generator
import helpers.split_by_student as split_by_student

import tensorflow as tf
import numpy as np
from keras import layers, Sequential, models, regularizers
import time


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


# Positions a per-position model can be trained for.
POSITIONS = [
    "sitting_or_standing",
    "lying_down_back",
    "lying_down_stomach",
    "lying_down_right",
    "lying_down_left",
]

# For each position:
#   *_ACTIVITIES - "position&activity" keys whose recordings are loaded.
#   *_OUTCOMES   - labels the classifier predicts; list order defines the
#                  integer class index, so do not reorder.

# Sitting and standing recordings are loaded separately but share one set of
# "sitting_or_standing" outcome labels.
STANDING_OR_SITTING_ACTIVITIES = [
    "sitting&coughing",
    "sitting&hyperventilating",
    "sitting&normal_breathing",
    "standing&coughing",
    "standing&hyperventilating",
    "standing&normal_breathing",
]
STANDING_OR_SITTING_OUTCOMES = [
    "sitting_or_standing&normal_breathing",
    "sitting_or_standing&coughing",
    "sitting_or_standing&hyperventilating",
]

LYING_DOWN_LEFT_ACTIVITIES = [
    "lying_down_left&coughing",
    "lying_down_left&hyperventilating",
    "lying_down_left&normal_breathing",
]
LYING_DOWN_LEFT_OUTCOMES = [
    "lying_down_left&normal_breathing",
    "lying_down_left&coughing",
    "lying_down_left&hyperventilating",
]

LYING_DOWN_RIGHT_ACTIVITIES = [
    "lying_down_right&normal_breathing",
    "lying_down_right&coughing",
    "lying_down_right&hyperventilating",
]
LYING_DOWN_RIGHT_OUTCOMES = [
    "lying_down_right&normal_breathing",
    "lying_down_right&coughing",
    "lying_down_right&hyperventilating",
]

LYING_DOWN_BACK_ACTIVITIES = [
    "lying_down_back&normal_breathing",
    "lying_down_back&coughing",
    "lying_down_back&hyperventilating",
]
LYING_DOWN_BACK_OUTCOMES = [
    "lying_down_back&normal_breathing",
    "lying_down_back&coughing",
    "lying_down_back&hyperventilating",
]

LYING_DOWN_STOMACH_ACTIVITIES = [
    "lying_down_stomach&normal_breathing",
    "lying_down_stomach&coughing",
    "lying_down_stomach&hyperventilating",
]
LYING_DOWN_STOMACH_OUTCOMES = [
    "lying_down_stomach&normal_breathing",
    "lying_down_stomach&coughing",
    "lying_down_stomach&hyperventilating",
]

# Directory (relative to the notebook) that is handed to the file tagger and
# prefixed to every CSV file name.
DATA_DIRECTORY = "./all_respeck"

# Position name -> activity keys whose recordings are loaded for that position.
position_activities = {
    "sitting_or_standing": STANDING_OR_SITTING_ACTIVITIES,
    "lying_down_left": LYING_DOWN_LEFT_ACTIVITIES,
    "lying_down_right": LYING_DOWN_RIGHT_ACTIVITIES,
    "lying_down_back": LYING_DOWN_BACK_ACTIVITIES,
    "lying_down_stomach": LYING_DOWN_STOMACH_ACTIVITIES,
}

# Position name -> labels the classifier can output for that position.
position_outcomes = {
    "sitting_or_standing": STANDING_OR_SITTING_OUTCOMES,
    "lying_down_left": LYING_DOWN_LEFT_OUTCOMES,
    "lying_down_right": LYING_DOWN_RIGHT_OUTCOMES,
    "lying_down_back": LYING_DOWN_BACK_OUTCOMES,
    "lying_down_stomach": LYING_DOWN_STOMACH_OUTCOMES,
}

# Narrow everything down to the configured POSITION (KeyError if it is unknown).
POSSIBLE_ACTIVITIES = position_activities[POSITION]
POSSIBLE_OUTCOMES = position_outcomes[POSITION]

# Outcome label -> integer class index, following the order of POSSIBLE_OUTCOMES.
LABEL_TO_INDEX = dict(zip(POSSIBLE_OUTCOMES, range(len(POSSIBLE_OUTCOMES))))

# Dev/test windows only overlap when explicitly requested.
TEST_SEQUENCE_OVERLAP = SEQUENCE_OVERLAP if OVERLAP_ON_TEST_SET else 0

In [87]:
def generate_training_data(directory, sequence_length, overlap, file_names, gyro = GYRO): # if gyro is false, only accelerometer data is used
    """Build labelled sequences for the activities in POSSIBLE_ACTIVITIES.

    Args:
        directory: folder passed to ``file_tagger.tag_directory`` and prefixed
            (with "/") to each CSV file name.
        sequence_length: forwarded unchanged to the sequence generator.
        overlap: forwarded unchanged to the sequence generator.
        file_names: collection of CSV file names to include; files tagged for
            an activity but absent from this collection are skipped.
        gyro: choose the with-gyroscope generator when truthy, otherwise the
            without-gyroscope one. The default is the value GYRO had when this
            ``def`` was executed, so re-run this cell after changing GYRO.

    Returns:
        A list of ``(label, sequence)`` tuples where ``label`` is
        ``"<position>&<activity>"``. "standing"/"sitting" are merged into
        "sitting_or_standing"; "talking", "singing", "laughing" and "eating"
        are merged into "other".

    Side effects:
        Prints the number of tagged sequences produced.
    """
    # group each csv file into their respective areas
    # NOTE(review): only indexing by activity key and iteration over the
    # result are relied on here; the exact return type lives in helpers.file_tagger.
    csv_dictionary = file_tagger.tag_directory(directory)

    # The generator depends only on `gyro`, so resolve it once instead of per file.
    if gyro:
        make_sequences = sequence_generator.generate_sequences_from_file_with_gyroscope
    else:
        make_sequences = sequence_generator.generate_sequences_from_file_without_gyroscope

    # Set membership keeps the per-file check O(1) instead of scanning a list.
    wanted_files = set(file_names)

    tagged_data = []

    # iterates through each activity
    for key in POSSIBLE_ACTIVITIES:
        # The label depends only on the activity key, so derive it once per
        # key rather than once per generated sequence.
        parts = key.split("&")
        position, activity = parts[0], parts[1]

        if activity in ("talking", "singing", "laughing", "eating"):
            activity = "other"

        if position in ("standing", "sitting"):
            position = "sitting_or_standing"

        label = position + "&" + activity

        # iterates through each csv file for the activity
        for csv_file in csv_dictionary[key]:
            if csv_file not in wanted_files:
                continue

            sequences = make_sequences(directory + "/" + csv_file, sequence_length, overlap, normalise=NORMALIZE)
            tagged_data.extend((label, sequence) for sequence in sequences)

    print ("there are " + str(len(tagged_data)) + " tagged sequences in the dataset")

    return tagged_data

In [88]:
def create_sequence_label_lists(tagged_sequences):
    """Split ``(label, sequence)`` pairs into a data array and a label array.

    Returns:
        ``(sequences, labels)`` - the sequences stacked into a float32 numpy
        array, and the labels mapped through LABEL_TO_INDEX into a numpy array
        of class indices (same order as the input).
    """
    windows = np.array(
        [window for _tag, window in tagged_sequences],
        dtype=np.float32,
    )
    class_ids = [LABEL_TO_INDEX[tag] for tag, _window in tagged_sequences]

    return windows, np.array(class_ids)
    

def create_data_sets(dev_size, test_size):
    """Split the recordings by student and build arrays for each split.

    Args:
        dev_size: forwarded to ``split_by_student.split_data`` as ``students_in_dev_set``.
        test_size: forwarded to ``split_by_student.split_data`` as ``students_in_test_set``.

    Returns:
        ``(train_data, train_labels, dev_data, dev_labels, test_data, test_labels)``.
        Training windows use SEQUENCE_OVERLAP; dev and test windows use
        TEST_SEQUENCE_OVERLAP.
    """
    training_files, dev_files, test_files = split_by_student.split_data(
        students_in_dev_set= dev_size,
        students_in_test_set=test_size,
    )

    # (files, window overlap) for each split, in train / dev / test order.
    split_plan = (
        (training_files, SEQUENCE_OVERLAP),
        (dev_files, TEST_SEQUENCE_OVERLAP),
        (test_files, TEST_SEQUENCE_OVERLAP),
    )

    # Generate all three tagged splits first, then convert them to arrays.
    tagged_splits = [
        generate_training_data(DATA_DIRECTORY, SEQUENCE_LENGTH, split_overlap, file_names=split_files)
        for split_files, split_overlap in split_plan
    ]

    flattened = []
    for tagged in tagged_splits:
        data, labels = create_sequence_label_lists(tagged)
        flattened.append(data)
        flattened.append(labels)

    return tuple(flattened)

In [89]:
def fft(data):
    """Return the FFT magnitude of the first three columns of one sequence.

    Args:
        data: 2-D array-like (rows are consecutive samples) with at least three
            columns. Columns 0, 1 and 2 are treated as x, y and z; any further
            columns are ignored.

    Returns:
        A list with one ``[x_magnitude, y_magnitude, z_magnitude]`` entry per
        input row (i.e. per frequency bin of the transform along the rows).

    Raises:
        IndexError: if ``data`` is not 2-D or has fewer than three columns.
        ValueError: raised by ``np.fft.fft`` for a sequence with no rows.
    """
    # asarray lets nested lists through as well as ndarrays (ndarrays are not copied).
    samples = np.asarray(data)

    # Select x, y, z explicitly so a missing column still fails loudly, then
    # transform all three along the row axis in one call. The spectrum is
    # complex, so keep only its magnitude.
    xyz = samples[:, [0, 1, 2]]
    magnitudes = np.abs(np.fft.fft(xyz, axis=0))

    return [list(row) for row in magnitudes]

def extract_fft(train_data, dev_data, test_data):
    """Apply ``fft`` to every sequence of the train, dev and test splits.

    Returns:
        A 3-tuple of lists ``(train, dev, test)`` holding one ``fft`` result
        per input sequence, in input order.
    """
    spectra = []
    for split in (train_data, dev_data, test_data):
        spectra.append([fft(window) for window in split])
    return tuple(spectra)

def merge_arrays(arr1, arr2):
    """Join two arrays side by side along axis 1, with ``arr1``'s columns first."""
    side_by_side = (arr1, arr2)
    return np.concatenate(side_by_side, axis=1)

def normalise(sequence):
    """
    Scale each row of a matrix to unit Euclidean length.

    Rows whose norm is zero are divided by 1, i.e. returned unchanged.
    """
    # keepdims leaves the norms as a column so they broadcast across each row.
    row_lengths = np.linalg.norm(sequence, axis=1, keepdims=True)
    row_lengths[row_lengths == 0] = 1
    return sequence / row_lengths

def differential(data):
    """First differences of the x, y, z columns (columns 0-2) of one sequence.

    Each column is differenced with ``np.diff`` and padded with a trailing 0
    so the result has one ``[dx, dy, dz]`` entry per input row.
    """
    # Pull out the three axes first, then difference and pad each one.
    axis_values = [data[:, column] for column in range(3)]
    steps = [np.diff(values) for values in axis_values]
    padded = [np.pad(step, (0, 1), mode='constant') for step in steps]

    # Re-assemble per-row triples from the three per-axis arrays.
    return [list(triple) for triple in zip(*padded)]

def extract_differentials(train_data, dev_data, test_data):
    """Apply ``differential`` to every sequence of each split.

    Returns the per-sequence results as three lists: train, dev, test.
    """
    train_steps = list(map(differential, train_data))
    dev_steps = list(map(differential, dev_data))
    test_steps = list(map(differential, test_data))
    return train_steps, dev_steps, test_steps

def derivative(data):
    """``np.gradient`` of the x, y, z columns (columns 0-2) of one sequence.

    Returns a list with one ``[dx, dy, dz]`` entry per input row.
    """
    # Slice the three axes, then estimate each axis' gradient independently.
    x_axis, y_axis, z_axis = (data[:, column] for column in range(3))
    x_rate = np.gradient(x_axis)
    y_rate = np.gradient(y_axis)
    z_rate = np.gradient(z_axis)

    return [
        [x_rate[row], y_rate[row], z_rate[row]]
        for row in range(len(x_rate))
    ]

def extract_gradients(train_data, dev_data, test_data):
    """Apply ``derivative`` to every sequence of each split.

    Returns three lists (train, dev, test) with one result per sequence.
    """
    train_rates, dev_rates, test_rates = (
        [derivative(window) for window in split]
        for split in (train_data, dev_data, test_data)
    )
    return train_rates, dev_rates, test_rates


In [90]:



# Seed for the Random Forest. Without it every run draws different bootstrap
# samples / feature subsets, so the reported accuracy is not reproducible.
RANDOM_SEED = 42


def _append_feature_columns(data, features):
    """Return ``data`` with ``features`` concatenated onto each sequence's columns.

    ``data`` and ``features`` hold one 2-D entry per sequence, in the same
    order. An empty split yields an empty array.
    """
    data = np.asarray(data)
    features = np.asarray(features)
    return np.array([merge_arrays(data[i], features[i]) for i in range(len(data))])


train_data, train_labels, dev_data, dev_labels, test_data, test_labels = create_data_sets(dev_size=DEV_SIZE, test_size=TEST_SIZE)

# Each enabled feature family is computed from columns 0-2 of every sequence
# (the extractors ignore columns added by an earlier family) and appended as
# extra columns. The dev split gets the same columns as train/test so all
# three splits keep an identical feature layout.
if TRANSFORM:
    train_transform, dev_transform, test_transform = extract_fft(train_data, dev_data, test_data)
    train_data = _append_feature_columns(train_data, train_transform)
    dev_data = _append_feature_columns(dev_data, dev_transform)
    test_data = _append_feature_columns(test_data, test_transform)

if DIFFERENTIALS:
    train_differential, dev_differential, test_differential = extract_differentials(train_data, dev_data, test_data)
    train_data = _append_feature_columns(train_data, train_differential)
    dev_data = _append_feature_columns(dev_data, dev_differential)
    test_data = _append_feature_columns(test_data, test_differential)

if GRADIENTS:
    train_derivatives, dev_derivatives, test_derivatives = extract_gradients(train_data, dev_data, test_data)
    train_data = _append_feature_columns(train_data, train_derivatives)
    dev_data = _append_feature_columns(dev_data, dev_derivatives)
    test_data = _append_feature_columns(test_data, test_derivatives)

# The forest expects one flat feature vector per sample, so each
# (time step x feature) window is flattened.
X_train = [sequence.flatten() for sequence in train_data]
y_train = train_labels

X_test = [sequence.flatten() for sequence in test_data]
y_test = test_labels

# Create a Random Forest classifier with 100 trees (you can adjust the parameters as needed)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Fit the Random Forest to the training data
rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out test students.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")

Train Set: s77, s32, s86, s64, s40, s18, s59, s29, s17, s51, s75, s43, s82, s54, s22, s67, s100, s71, s56, s9, s95, s36, s98, s80, s50, s15, s96, s102, s79, s38, s52, s87, s63, s5, s34, s13, s44, s48, s84, s3, s74, s7, s1, s30, s55, s70, s92, s39, s27, s33, s21, s72, s57, s61, s23, s35
Dev Set: 
Test Set: s11, s46, s16, s42, s60, s45, s91, s65, s83, s12, s97, s66, s8, s93, s88
there are 4376 tagged sequences in the dataset
there are 0 tagged sequences in the dataset
there are 515 tagged sequences in the dataset
Random Forest Accuracy: 0.8640776699029126
