In [31]:
import helpers.file_tagger as file_tagger
import helpers.sequence_generator as sequence_generator
import helpers.split_by_student as split_by_student


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


In [32]:
# --- Tunable settings -------------------------------------------------------
GYRO = False            # when False, only accelerometer data is used
SEQUENCE_LENGTH = 4     # passed to sequence_generator as the sequence length
SEQUENCE_OVERLAP = 2    # passed to sequence_generator as the overlap
TEST_SIZE = 7           # number of students held out for the test set

DATA_DIRECTORY = "./all_respeck"

# --- Recording categories ---------------------------------------------------
# Every key has the form "<posture>&<activity>"; the part after "&" is what
# ends up as the (possibly collapsed) class label.
ALL_ACTIVITIES = [
    # sitting / standing: respiratory activities
    "sitting&coughing",
    "sitting&hyperventilating",
    "sitting&normal_breathing",
    "standing&coughing",
    "standing&hyperventilating",
    "standing&normal_breathing",
    # sitting / standing: everything else
    "sitting&talking",
    "sitting&eating",
    "sitting&singing",
    "sitting&laughing",
    "standing&talking",
    "standing&eating",
    "standing&singing",
    "standing&laughing",
    # lying on the left side
    "lying_down_left&coughing",
    "lying_down_left&hyperventilating",
    "lying_down_left&talking",
    "lying_down_left&singing",
    "lying_down_left&laughing",
    "lying_down_left&normal_breathing",
    # lying on the right side
    "lying_down_right&normal_breathing",
    "lying_down_right&coughing",
    "lying_down_right&hyperventilating",
    "lying_down_right&talking",
    "lying_down_right&singing",
    "lying_down_right&laughing",
    # lying on the back
    "lying_down_back&normal_breathing",
    "lying_down_back&coughing",
    "lying_down_back&hyperventilating",
    "lying_down_back&talking",
    "lying_down_back&singing",
    "lying_down_back&laughing",
    # lying on the stomach
    "lying_down_stomach&normal_breathing",
    "lying_down_stomach&coughing",
    "lying_down_stomach&hyperventilating",
    "lying_down_stomach&talking",
    "lying_down_stomach&singing",
    "lying_down_stomach&laughing",
]

# --- Target classes ---------------------------------------------------------
# The classifier predicts one of these four labels.
ACTIVITIES = [
    "normal_breathing",
    "coughing",
    "hyperventilating",
    "other",
]

# Class label -> integer index, in the order the labels appear in ACTIVITIES.
LABEL_TO_INDEX = dict(zip(ACTIVITIES, range(len(ACTIVITIES))))




In [33]:
# generate training data
def generate_training_data(directory, sequence_length, overlap, file_names, gyro = GYRO): # if gyro is false, only accelerometer data is used
    """Build the list of labelled sequences for a set of recording files.

    Each key of ALL_ACTIVITIES ("<posture>&<activity>") is looked up in the
    mapping returned by ``file_tagger.tag_directory(directory)``. Every file
    listed there that also appears in ``file_names`` is cut into sequences by
    ``sequence_generator``, and each sequence is paired with its class label.

    The label is the part of the key after "&"; "talking", "singing",
    "laughing" and "eating" are all collapsed into the single label "other".

    Args:
        directory: folder that holds the csv recordings.
        sequence_length: forwarded unchanged to the sequence generator.
        overlap: forwarded unchanged to the sequence generator.
        file_names: collection of csv file names to include (e.g. the files of
            one data split). Entries are assumed to be hashable strings.
        gyro: when true the "with_gyroscope" generator is used, otherwise the
            "without_gyroscope" one.

    Returns:
        A list of ``(label, sequence)`` tuples.

    Side effects:
        Prints the number of tagged sequences that were produced.
    """
    # Activities that are not modelled individually.
    other_activities = {"talking", "singing", "laughing", "eating"}

    # Set membership keeps the per-file check cheap for large splits.
    wanted_files = set(file_names)

    # Choose the generator once instead of re-testing `gyro` for every file.
    if gyro:
        make_sequences = sequence_generator.generate_sequences_from_file_with_gyroscope
    else:
        make_sequences = sequence_generator.generate_sequences_from_file_without_gyroscope

    tagged_data = []

    # group each csv file into their respective areas
    csv_dictionary = file_tagger.tag_directory(directory)

    # Iterate over ALL_ACTIVITIES, the list defined in the configuration cell.
    # The previous name, POSSIBLE_ACTIVITIES, is not defined anywhere in the
    # notebook, so the function raised NameError on a fresh kernel.
    for key in ALL_ACTIVITIES:
        # The label depends only on the key, so compute it once per activity.
        activity = key.split("&")[1]
        if activity in other_activities:
            activity = "other"

        # NOTE(review): assumes tag_directory returns an entry for every key
        # in ALL_ACTIVITIES; a missing key raises here, as it did before.
        for csv_file in csv_dictionary[key]:
            if csv_file not in wanted_files:
                continue

            sequences = make_sequences(directory + "/" + csv_file, sequence_length, overlap)
            tagged_data.extend((activity, sequence) for sequence in sequences)

    print ("there are " + str(len(tagged_data)) + " tagged sequences in the dataset")

    return tagged_data

In [34]:
def create_sequence_label_lists(tagged_sequences):
    """Split ``(label, sequence)`` pairs into two parallel numpy arrays.

    Args:
        tagged_sequences: iterable of ``(label, sequence)`` tuples; it is
            traversed twice, so it should be a list rather than a generator.

    Returns:
        ``(sequences, labels)`` where ``sequences`` is a float32 array built
        from all sequences and ``labels`` is an array holding the
        LABEL_TO_INDEX value of each label, in the same order.

    Raises:
        KeyError: if a label is not a key of LABEL_TO_INDEX.
    """
    # First pass: collect the raw sequences and stack them into one array.
    raw_sequences = []
    for _label, sequence in tagged_sequences:
        raw_sequences.append(sequence)
    sequence_array = np.array(raw_sequences, dtype=np.float32)

    # Second pass: translate each textual label into its integer class index.
    label_indices = []
    for label, _sequence in tagged_sequences:
        label_indices.append(LABEL_TO_INDEX[label])
    label_array = np.array(label_indices)

    return sequence_array, label_array
    

def create_data_sets(dev_size, test_size):
    """Create the training and test arrays, split by student.

    Args:
        dev_size: accepted for interface compatibility but currently unused;
            the split is always requested with zero students in the dev set.
        test_size: number of students placed in the test set.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)`` as produced by
        ``create_sequence_label_lists`` for each split.
    """
    # The dev split is requested with 0 students and its file list is ignored.
    training_files, _dev_files, test_files = split_by_student.split_data(
        students_in_dev_set=0,
        students_in_test_set=test_size,
    )

    # Generate the tagged sequences for both splits first (training, then
    # test) so the progress messages appear in that order.
    tagged_by_split = [
        generate_training_data(
            DATA_DIRECTORY,
            SEQUENCE_LENGTH,
            SEQUENCE_OVERLAP,
            file_names=split_files,
        )
        for split_files in (training_files, test_files)
    ]
    tagged_training_sequences, tagged_test_sequences = tagged_by_split

    # Convert each split into (data, labels) arrays.
    train_data, train_labels = create_sequence_label_lists(tagged_training_sequences)
    test_data, test_labels = create_sequence_label_lists(tagged_test_sequences)

    return train_data, train_labels, test_data, test_labels

In [35]:
# Fixed seed so the reported accuracy is reproducible for a given data split.
RANDOM_SEED = 42

train_data, train_labels, test_data, test_labels = create_data_sets(0, TEST_SIZE)

# A random forest expects one flat feature vector per sample, so collapse every
# sequence into a single row: (n_samples, ...) -> (n_samples, n_features).
X_train = train_data.reshape(len(train_data), -1)
y_train = train_labels

X_test = test_data.reshape(len(test_data), -1)
y_test = test_labels

# Create a Random Forest classifier with 1000 trees (you can adjust the parameters as needed)
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_SEED)

# Fit the Random Forest to the training data
rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out students
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")


Train Set: s60, s27, s83, s92, s16, s12, s84, s82, s102, s51, s98, s39, s15, s66, s88, s22, s86, s8, s44, s34, s93, s11, s79, s7, s59, s63, s52, s42, s35, s65, s9, s17, s80, s45, s77, s95, s57, s40, s87, s29, s56, s61, s5, s67, s64, s36, s72, s48, s96, s71, s13, s23, s21, s18, s100, s97, s75, s46, s50, s30, s38, s32, s55, s54
Dev Set: 
Test Set: s3, s74, s33, s1, s70, s43, s91
there are 5355 tagged sequences in the dataset
there are 541 tagged sequences in the dataset
Random Forest Accuracy: 0.45286506469500926
