# TDOST-Based HAR Model using Aruba Dataset

This notebook implements a TDOST-based Human Activity Recognition (HAR) model. We will use the Aruba dataset from the CASAS repository as a starting point.

Steps:
1. Load and preprocess the dataset
2. Generate TDOST descriptions
3. Encode descriptions and labels
4. Train the HAR model
5. Test the model on new embeddings.

## Remove all occurrences of 2 or more spaces from the data file

In [5]:
import re

# Location of the raw Aruba event log. The cleanup calls in this cell rewrite
# this file in place.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing this line.
aruba_data_path = '/Users/harrisonkirstein/Desktop/CSCI-4380-Honors-Option-Repo/CSCI 4380 Honors Option Project/Datasets/aruba/data'

def replace_two_spaces(file_path):
    """Collapse every run of two or more spaces in a text file to one space.

    The file is rewritten in place: cleaned lines are streamed to
    ``file_path + '.tmp'`` and that temporary file then replaces the original
    via ``os.replace``.

    Args:
        file_path: Path of the text file to clean.

    Raises:
        OSError: If the file cannot be read, or the temporary file cannot be
            written or moved into place.
    """
    import os

    temp_file_path = file_path + '.tmp'
    try:
        with open(file_path, 'r') as infile, open(temp_file_path, 'w') as outfile:
            for line in infile:
                # ' {2,}' consumes a whole run at once. The literal two-space
                # pattern only halved runs, so "a    b" came out as "a  b".
                outfile.write(re.sub(r' {2,}', ' ', line))

        # Replace the original file with the modified file
        os.replace(temp_file_path, file_path)
    except BaseException:
        # Do not leave a half-written temporary file behind on failure.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise

def replace_three_spaces(file_path):
    """Rewrite ``file_path`` with each three-space sequence replaced by one space.

    Matching runs left to right without overlap on every line, so a run whose
    length is not a multiple of three keeps its leftover spaces. The cleaned
    text is written to ``file_path + '.tmp'``, which then takes the place of
    the original file.
    """
    import os

    triple_space = re.compile(r'   ')
    scratch_path = file_path + '.tmp'

    with open(file_path, 'r') as source, open(scratch_path, 'w') as sink:
        sink.writelines(triple_space.sub(' ', text) for text in source)

    # Swap the rewritten copy in for the original file
    os.replace(scratch_path, file_path)

def replace_four_spaces(file_path):
    """Rewrite ``file_path`` with each four-space sequence replaced by one space.

    Sequences are matched left to right without overlap, line by line; shorter
    runs are left untouched. Output goes to ``file_path + '.tmp'`` first and
    that file is then moved over the original.
    """
    import os

    staged_path = file_path + '.tmp'

    with open(file_path, 'r') as reader:
        with open(staged_path, 'w') as writer:
            for raw_line in reader:
                writer.write(re.sub(r'    ', ' ', raw_line))

    # Move the staged copy over the original file
    os.replace(staged_path, file_path)

# Run the three cleanup passes over the data file in order, reporting each one
for cleanup_pass, done_message in (
    (replace_two_spaces, "Replaces two spaces"),
    (replace_three_spaces, "Replaced three spaces"),
    (replace_four_spaces, "Replaced four spaces"),
):
    cleanup_pass(aruba_data_path)
    print(done_message)


Replaces two spaces
Replaced three spaces
Replaced four spaces


In [8]:
# Step 1: Load and Preprocess Dataset
import pandas as pd
import numpy as np

# Read the raw Aruba event log: space-delimited with no header row, so the six
# column names are supplied explicitly.
aruba_data_path = '/Users/harrisonkirstein/Desktop/CSCI-4380-Honors-Option-Repo/CSCI 4380 Honors Option Project/Datasets/aruba/data'
aruba_columns = ['Date', 'Time', 'Sensor', 'Value', 'Activity_Type', 'Begin_Or_End']

aruba_data = pd.read_csv(
    aruba_data_path,
    sep=' ',
    header=None,
    names=aruba_columns,
)
# No 'Timestamp' column is created here; 'Date' and 'Time' stay separate columns.
aruba_data.head()  # not rendered unless it is the cell's final expression

# Raw Aruba annotation -> label in the common activity set.
# Anything not listed here falls back to 'Other' in map_activities.
_ACTIVITY_TO_COMMON = {
    'Meal_Preparation': 'Cook',
    'Relax': 'Relax',
    'Eating': 'Eat',
    'Work': 'Work',
    'Sleeping': 'Sleep',
    'Wash_Dishes': 'Other',
    'Bed_to_Toilet': 'Bed_to_Toilet',
    'Enter_Home': 'Enter_Home',
    'Leave_Home': 'Leave_Home',
    'Housekeeping': 'Other',
    'Resperate': 'Other',
}


def map_activities(activity):
    """Translate a raw activity annotation into the common label set.

    Args:
        activity: Value from the ``Activity_Type`` column.

    Returns:
        str: The mapped label, or ``'Other'`` when ``activity`` has no entry
        in the lookup table.
    """
    return _ACTIVITY_TO_COMMON.get(activity, 'Other')

# Derive the common-set label for every row and store it beside the raw annotation
mapped_labels = aruba_data['Activity_Type'].apply(map_activities)
aruba_data['Mapped_Activity'] = mapped_labels
aruba_data.head()


  aruba_data = pd.read_csv(aruba_data_path, header=None, names=['Date', 'Time', 'Sensor', 'Value', 'Activity_Type', 'Begin_Or_End'], sep=' ')


Unnamed: 0,Date,Time,Sensor,Value,Activity_Type,Begin_Or_End,Mapped_Activity
0,2010-11-04,00:03:50.209589,M003,ON,Sleeping,begin,Sleep
1,2010-11-04,00:03:57.399391,M003,OFF,,,Other
2,2010-11-04,00:15:08.984841,T002,21.5,,,Other
3,2010-11-04,00:30:19.185547,T003,21,,,Other
4,2010-11-04,00:30:19.385336,T004,21,,,Other


In [9]:
# Inspect the parsed 'Sensor' column.
# NOTE(review): the recorded output ends with NaN entries and the value 'end',
# which is not a sensor id — looks like some rows were split into the wrong
# columns during parsing; confirm against the raw file.
aruba_data['Sensor']

0          M003
1          M003
2          T002
3          T003
4          T004
           ... 
1719553     NaN
1719554     NaN
1719555     NaN
1719556     NaN
1719557     end
Name: Sensor, Length: 1719558, dtype: object

In [None]:
# Step 2: Generate TDOST Descriptions
# Generate textual descriptions for sensor events
def generate_tdost(row):
    """Render one sensor event as a TDOST sentence.

    Args:
        row: Mapping-like event (for example a DataFrame row) providing
            ``'Sensor'`` and ``'Value'``, plus either a datetime-like
            ``'Timestamp'`` or a ``'Time'`` string such as
            ``'00:03:50.209589'``.

    Returns:
        str: The filled-in template, e.g. ``"Motion sensor in Location_M003
        fired with value ON at 00:03:50."``. ``Unknown`` is substituted for a
        sensor id or time that is missing.
    """
    template = "<Sensor_Type> sensor in <Location> fired with value <Value> at <Time>."
    sensor_type_by_prefix = {'M': 'Motion', 'D': 'Door', 'T': 'Temperature'}

    # Rows without a sensor id hold NaN (a float); indexing it would raise.
    sensor = row['Sensor']
    if isinstance(sensor, str) and sensor:
        sensor_type = sensor_type_by_prefix.get(sensor[0], 'Unknown')
    else:
        sensor = 'Unknown'
        sensor_type = 'Unknown'

    # Prefer a real datetime when the row has one; otherwise fall back to the
    # raw 'Time' string, dropping the fractional seconds.
    timestamp = row.get('Timestamp')
    if hasattr(timestamp, 'strftime') and not pd.isna(timestamp):
        time_text = timestamp.strftime("%H:%M:%S")
    else:
        raw_time = row.get('Time')
        if isinstance(raw_time, str) and raw_time:
            time_text = raw_time.split('.')[0]
        else:
            time_text = 'Unknown'

    description = template.replace("<Sensor_Type>", sensor_type)
    description = description.replace("<Location>", f"Location_{sensor}")
    description = description.replace("<Value>", str(row['Value']))
    description = description.replace("<Time>", time_text)
    return description

# Produce one sentence per event row, then attach the sentences as a new column
tdost_column = aruba_data.apply(generate_tdost, axis=1)
aruba_data['TDOST_Description'] = tdost_column
aruba_data[['TDOST_Description', 'Mapped_Activity']].head()


In [None]:
# Step 3: Encode Descriptions and Labels
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

# Pre-trained sentence-embedding model used to vectorise the descriptions.
# NOTE(review): the name `model` is rebound to the BiLSTM classifier in the
# training cell.
model = SentenceTransformer('all-distilroberta-v1')

# One embedding per description, in DataFrame row order
description_texts = aruba_data['TDOST_Description'].tolist()
tdost_embeddings = model.encode(description_texts)

# Integer-encode the mapped activity labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(aruba_data['Mapped_Activity'])

# Hold out 20% of the events; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    tdost_embeddings,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Step 4: Train HAR Model
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Define BiLSTM model
class BiLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear classification layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Hidden size of the LSTM in each direction.
        output_dim: Number of logits produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Either a ``(batch, seq_len, input_dim)`` tensor, or a
                ``(batch, input_dim)`` tensor of single vectors, which is
                treated as a batch of length-1 sequences.
        """
        if x.dim() == 2:
            # nn.LSTM would read a 2-D tensor as one unbatched sequence, and the
            # three-index slice below would then fail; add the sequence axis.
            x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        # Classify from the output at the last time step.
        logits = self.fc(lstm_out[:, -1, :])
        return logits

# Parameters
# Take the feature width from the embeddings themselves so the network always
# matches the encoder output instead of relying on a hard-coded 768.
input_dim = X_train.shape[1]
hidden_dim = 64
output_dim = len(label_encoder.classes_)
lr = 0.001
epochs = 10

# Seed torch before building the model and the shuffling loader so weight
# initialisation and batch order are repeatable (42 matches the split's random_state).
torch.manual_seed(42)

# Model, Loss, Optimizer
model = BiLSTM(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# DataLoader
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")


In [None]:
# Step 5: Test HAR Model
from sklearn.metrics import classification_report

model.eval()
y_true, y_pred = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, preds = torch.max(outputs, 1)
        y_true.extend(y_batch.tolist())
        y_pred.extend(preds.tolist())

# Print classification report.
# Pass every encoded class id explicitly: without `labels`, the report derives
# the label set from y_true/y_pred and raises if a class is absent from both,
# because the set no longer matches the length of target_names.
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(y_true, y_pred, labels=class_ids, target_names=label_encoder.classes_))
