In [1]:
import pandas as pd

In [2]:
# Read the tab-separated hh101 annotation log into a DataFrame.
# Column names are passed explicitly, so pandas treats every line as data.
# NOTE(review): the path below is absolute and machine-specific.
ann_data = pd.read_csv(
    '/Users/harrisonkirstein/Documents/GitHub/CSCI-4380-Honors-Option-Repo/CSCI 4380 Honors Option Project/hh101/hh101.ann.txt',
    sep='\t',
    names=['Timestamp', 'Sensor_ID', 'Location', 'Zone', 'State', 'Sensor_Type', 'Activity'],
)


In [3]:
# Drop every event whose Location or Zone is marked 'Ignore'
ann_data_filtered = ann_data.loc[
    ~((ann_data['Location'] == 'Ignore') | (ann_data['Zone'] == 'Ignore'))
]
ann_data_filtered

Unnamed: 0,Timestamp,Sensor_ID,Location,Zone,State,Sensor_Type,Activity
0,2012-07-20 10:38:54.512364,M001,OutsideDoor,Entry,ON,Control4-Motion,Step_Out
5,2012-07-20 10:38:59.541365,M001,OutsideDoor,Entry,OFF,Control4-Motion,Step_Out
7,2012-07-20 10:39:02.429671,M001,OutsideDoor,Entry,ON,Control4-Motion,Step_Out
10,2012-07-20 10:39:04.120443,M001,OutsideDoor,Entry,OFF,Control4-Motion,Step_Out
13,2012-07-20 10:39:36.167078,M001,OutsideDoor,Entry,ON,Control4-Motion,Step_Out
...,...,...,...,...,...,...,...
321452,2012-09-17 23:52:18.639950,M012,Bedroom,Bed,OFF,Control4-Motion,Other_Activity
321453,2012-09-17 23:53:10.229896,M012,Bedroom,Bed,ON,Control4-Motion,Other_Activity
321454,2012-09-17 23:53:12.127975,M012,Bedroom,Bed,OFF,Control4-Motion,Other_Activity
321455,2012-09-17 23:53:14.730646,M012,Bedroom,Bed,ON,Control4-Motion,Other_Activity


In [4]:

# Helper function to classify time of day
def get_time_period(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Args:
        hour: Hour value; labels are defined for 0 <= hour < 24.

    Returns:
        One of "Night", "Early Morning", "Morning", "Afternoon", "Evening",
        "Late Night", or "Unknown" when the hour is outside 0 <= hour < 24.
    """
    if not (0 <= hour < 24):
        return "Unknown"

    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that exceeds the hour decides the label.
    upper_bounds = (
        (5, "Night"),
        (8, "Early Morning"),
        (12, "Morning"),
        (17, "Afternoon"),
        (21, "Evening"),
        (24, "Late Night"),
    )
    for upper, label in upper_bounds:
        if hour < upper:
            return label
    return "Unknown"


In [5]:
from datetime import datetime

def timestamp_to_words(timestamp):
    """Spell out the 12-hour clock time of a timestamp string in words.

    Args:
        timestamp: String formatted as "%Y-%m-%d %H:%M:%S", optionally
            followed by a fractional-seconds part (".%f").

    Returns:
        A string of the form "<hour> hours <minute> minutes <AM|PM>", with
        hour and minute converted by num_to_words.

    Raises:
        ValueError: If the string matches neither accepted format.
    """
    # Try the fractional-seconds layout first, then the plain one.
    try:
        parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")

    period = "PM" if parsed.hour >= 12 else "AM"

    # 12-hour clock: both 0 and 12 are spoken as twelve.
    hour_text = num_to_words(parsed.hour % 12 or 12)
    minute_text = num_to_words(parsed.minute)

    return f"{hour_text} hours {minute_text} minutes {period}"

# Number-to-word lookup for 0-59, built once at definition time rather than
# on every call (num_to_words is invoked twice per processed timestamp).
_NUMBER_WORDS = {
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five",
    6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten",
    11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen",
    19: "nineteen", 20: "twenty", 21: "twenty-one", 22: "twenty-two",
    23: "twenty-three", 24: "twenty-four", 25: "twenty-five",
    26: "twenty-six", 27: "twenty-seven", 28: "twenty-eight",
    29: "twenty-nine", 30: "thirty", 31: "thirty-one", 32: "thirty-two",
    33: "thirty-three", 34: "thirty-four", 35: "thirty-five",
    36: "thirty-six", 37: "thirty-seven", 38: "thirty-eight",
    39: "thirty-nine", 40: "forty", 41: "forty-one", 42: "forty-two",
    43: "forty-three", 44: "forty-four", 45: "forty-five",
    46: "forty-six", 47: "forty-seven", 48: "forty-eight",
    49: "forty-nine", 50: "fifty", 51: "fifty-one", 52: "fifty-two",
    53: "fifty-three", 54: "fifty-four", 55: "fifty-five",
    56: "fifty-six", 57: "fifty-seven", 58: "fifty-eight",
    59: "fifty-nine"
}


def num_to_words(n):
    """Return the English word(s) for a number in the range 0-59.

    Args:
        n: Number to convert.

    Returns:
        The lowercase word form (e.g. "twenty-one"), or an empty string when
        n is not one of the keys 0-59.
    """
    return _NUMBER_WORDS.get(n, "")

In [6]:
def get_sensor_type(sensor_name):
    """Return the part of a sensor name that follows its last '-'.

    Names without a '-' are returned unchanged.
    """
    _prefix, _dash, suffix = sensor_name.rpartition('-')
    return suffix

In [7]:
# Number of rows that survived the 'Ignore' filter
ann_data_filtered.shape[0]

222858

In [8]:
# Requested number of rows, capped at the number of rows actually available:
# sampling without replacement cannot return more rows than the frame holds,
# so a bare constant would fail as soon as the filtered data shrinks.
sample_size = min(222858, len(ann_data_filtered))

# A fixed random_state keeps the shuffled sample reproducible across runs.
ann_data_small_sample = ann_data_filtered.sample(n=sample_size, random_state=42)
ann_data_small_sample

Unnamed: 0,Timestamp,Sensor_ID,Location,Zone,State,Sensor_Type,Activity
269417,2012-09-10 07:25:46.488266,MA013,LivingRoom,LivingRoom,OFF,Control4-MotionArea,Other_Activity
53209,2012-07-29 00:43:47.322359,M009,Bedroom,Bedroom,OFF,Control4-Motion,Dress
152162,2012-08-19 12:17:05.704754,M009,Bedroom,Bedroom,OFF,Control4-Motion,Other_Activity
14527,2012-07-22 15:23:51.121630,M001,OutsideDoor,Entry,OFF,Control4-Motion,Leave_Home
174050,2012-08-24 03:03:01.944014,M012,Bedroom,Bed,ON,Control4-Motion,Sleep
...,...,...,...,...,...,...,...
174727,2012-08-24 05:51:34.499277,M012,Bedroom,Bed,ON,Control4-Motion,Dress
151004,2012-08-19 09:49:06.056731,M008,LivingRoom,Chair,OFF,Control4-Motion,Watch_TV
192514,2012-08-27 17:47:36.224039,MA016,Kitchen,Kitchen,OFF,Control4-MotionArea,Cook_Dinner
214063,2012-09-01 10:30:14.239079,M008,LivingRoom,Chair,ON,Control4-Motion,Watch_TV


In [9]:
def get_activities_list(row):
    """Return the value stored under the 'Activity' key of a single row."""
    return row['Activity']

# Read the labels straight from the column instead of iterating row by row;
# the resulting list has one entry per sampled row, in the sample's order.
activity_labels_list = ann_data_small_sample['Activity'].tolist()

activity_labels_list

['Other_Activity',
 'Dress',
 'Other_Activity',
 'Leave_Home',
 'Sleep',
 'Sleep',
 'Personal_Hygiene',
 'Other_Activity',
 'Cook_Breakfast',
 'Toilet',
 'Other_Activity',
 'Sleep_Out_Of_Bed',
 'Other_Activity',
 'Bathe',
 'Leave_Home',
 'Watch_TV',
 'Personal_Hygiene',
 'Other_Activity',
 'Other_Activity',
 'Cook_Breakfast',
 'Read',
 'Watch_TV',
 'Watch_TV',
 'Leave_Home',
 'Personal_Hygiene',
 'Watch_TV',
 'Other_Activity',
 'Personal_Hygiene',
 'Other_Activity',
 'Sleep',
 'Other_Activity',
 'Other_Activity',
 'Other_Activity',
 'Watch_TV',
 'Dress',
 'Other_Activity',
 'Other_Activity',
 'Other_Activity',
 'Cook_Breakfast',
 'Dress',
 'Other_Activity',
 'Personal_Hygiene',
 'Other_Activity',
 'Other_Activity',
 'Other_Activity',
 'Cook',
 'Watch_TV',
 'Watch_TV',
 'Bathe',
 'Other_Activity',
 'Watch_TV',
 'Toilet',
 'Eat_Breakfast',
 'Cook_Breakfast',
 'Cook_Breakfast',
 'Watch_TV',
 'Toilet',
 'Other_Activity',
 'Other_Activity',
 'Watch_TV',
 'Toilet',
 'Other_Activity',
 'Other

In [10]:

# Generate TDOST descriptions
def generate_tdost(row):
    """Build a one-sentence description of a single sensor event.

    Args:
        row: Mapping-like record providing 'Timestamp', 'Sensor_Type',
            'Location' and 'State'.

    Returns:
        A sentence naming the weekday, the time-of-day bucket, the sensor
        type, the location and the recorded state.
    """
    event_time = pd.to_datetime(row['Timestamp'])
    day_of_week, time_period = event_time.strftime('%A'), get_time_period(event_time.hour)
    sensor_type = get_sensor_type(row['Sensor_Type'])
    location, state = row['Location'], row['State']
    return f"On {day_of_week} during the {time_period}, the {sensor_type} sensor in the {location} recorded a state of {state}."

# Build one basic TDOST sentence per sampled row, keeping the sample's row
# order so the sentences stay aligned with the activity labels.

tdost_basic_descriptions = [
    generate_tdost(row) for _, row in ann_data_small_sample.iterrows()
]

tdost_basic_descriptions

['On Monday during the Early Morning, the MotionArea sensor in the LivingRoom recorded a state of OFF.',
 'On Sunday during the Night, the Motion sensor in the Bedroom recorded a state of OFF.',
 'On Sunday during the Afternoon, the Motion sensor in the Bedroom recorded a state of OFF.',
 'On Sunday during the Afternoon, the Motion sensor in the OutsideDoor recorded a state of OFF.',
 'On Friday during the Night, the Motion sensor in the Bedroom recorded a state of ON.',
 'On Saturday during the Early Morning, the Motion sensor in the Bedroom recorded a state of ON.',
 'On Tuesday during the Late Night, the MotionArea sensor in the Bathroom recorded a state of OFF.',
 'On Friday during the Afternoon, the Motion sensor in the Kitchen recorded a state of ON.',
 'On Monday during the Morning, the Motion sensor in the Kitchen recorded a state of ON.',
 'On Monday during the Morning, the MotionArea sensor in the Bathroom recorded a state of ON.',
 'On Thursday during the Morning, the Motion

In [81]:
# Generate TDOST temporal descriptions
def generate_tdost_temporal(row):
    """Build a one-sentence description of a sensor event with its clock time in words.

    Args:
        row: Mapping-like record providing 'Timestamp', 'Sensor_Type',
            'Location' and 'State'.

    Returns:
        A sentence naming the sensor type, the location, the state and the
        time of day as produced by timestamp_to_words.
    """
    time_in_words = timestamp_to_words(row['Timestamp'])
    sensor_type = get_sensor_type(row['Sensor_Type'])
    location, state = row['Location'], row['State']
    return f"{sensor_type} sensor in the {location} fired with value {state} at {time_in_words}."

# Apply TDOST generation to each row
# axis=1 hands every row of the sample to generate_tdost_temporal in turn.
# NOTE(review): the recorded output of this cell reports "Length: 50000" and
# its execution count is out of sequence, while the sample is drawn with
# sample_size = 222858 - the output looks stale; re-run to confirm.

tdost_temporal_descriptions = ann_data_small_sample.apply(generate_tdost_temporal, axis=1)
tdost_temporal_descriptions

269417    MotionArea sensor in the LivingRoom fired with...
53209     Motion sensor in the Bedroom fired with value ...
152162    Motion sensor in the Bedroom fired with value ...
14527     Motion sensor in the OutsideDoor fired with va...
174050    Motion sensor in the Bedroom fired with value ...
                                ...                        
185137    Motion sensor in the Kitchen fired with value ...
319604    Motion sensor in the LivingRoom fired with val...
188602    MotionArea sensor in the Bathroom fired with v...
50773     Motion sensor in the Bedroom fired with value ...
238901    MotionArea sensor in the Bathroom fired with v...
Length: 50000, dtype: object

In [11]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model for generating embeddings
# Alternative checkpoint, kept for reference:
# model = SentenceTransformer('all-mpnet-base-v2')
# NOTE(review): the name `model` is rebound to the BiLSTM classifier further
# down in this notebook, so calling model.encode(...) after that cell has run
# would hit the wrong object; re-run this cell first, or give the two models
# distinct names.
model = SentenceTransformer('all-distilroberta-v1')


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:

# Encode TDOST descriptions into sentence embeddings (one vector per description)
tdost_descriptions_embeddings = model.encode(tdost_basic_descriptions, batch_size=64, show_progress_bar=True)


# Fit the label encoder on the activity labels and convert them to integer
# class ids. The encoder is created here so the cell works on a fresh kernel;
# label_encoder.classes_ is read later to size the classifier output.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(activity_labels_list)


# Every description needs exactly one label; both lengths should equal the
# number of sampled rows.
assert len(tdost_descriptions_embeddings) == len(encoded_labels), \
    "embeddings and labels are misaligned"
print("Length of TDOST Descriptions Embeddings:", len(tdost_descriptions_embeddings))
print("Length of Encoded Labels:", len(encoded_labels))


Batches:   0%|          | 0/3483 [00:00<?, ?it/s]

In [105]:
import torch


# Convert to PyTorch tensors
# The embeddings become the model inputs and the encoded labels the targets;
# both are passed to the train/validation split that follows.
tdost_embeddings_tensor = torch.tensor(tdost_descriptions_embeddings)
encoded_labels_tensor = torch.tensor(encoded_labels)


In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state makes
# the partition repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    tdost_embeddings_tensor,
    encoded_labels_tensor,
    test_size=0.2,
    random_state=42,
)

# Report how many samples landed in each partition
print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))


Training set size: 178286
Validation set size: 44572


In [107]:
# Add a length-1 sequence dimension so each embedding is fed to the LSTM as a
# one-step sequence. The rank check makes the cell idempotent: re-running it
# no longer stacks an extra dimension on tensors that were already reshaped.
if X_train.dim() == 2:
    X_train = X_train.unsqueeze(1)  # Shape: (num_samples, 1, input_size)
if X_val.dim() == 2:
    X_val = X_val.unsqueeze(1)      # Shape: (num_samples, 1, input_size)

In [108]:
class BiLSTM(torch.nn.Module):
    """Single-layer bidirectional LSTM followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction.
        output_size: Number of logits produced per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        # The head consumes the forward and backward states side by side.
        self.fc = torch.nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size); batch comes first
                because the LSTM is built with batch_first=True.

        Returns:
            Tensor of shape (batch, output_size) with unnormalised logits.
        """
        _, (h_n, _) = self.lstm(x)
        # h_n holds each direction's final hidden state. Slicing the output at
        # the last time step would instead pick the backward direction after
        # it has seen only one element; for seq_len == 1 both forms agree.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)

# Initialize the model
input_size = tdost_embeddings_tensor.shape[1]  # Width of one sentence embedding
hidden_size = 64  # Hidden units per LSTM direction; tunable
output_size = len(label_encoder.classes_)  # Number of unique activities

# Seed the RNG so the weight initialisation (and therefore the training run)
# is repeatable; 42 matches the random_state used for sampling and splitting.
torch.manual_seed(42)

# NOTE(review): this rebinds `model`, which previously referred to the
# SentenceTransformer used for encoding.
model = BiLSTM(input_size, hidden_size, output_size)


In [109]:
# Cross-entropy loss over the class logits, optimised with Adam across all
# parameters of `model`.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [112]:
num_epochs = 650  # Number of epochs

# Loop-invariant setup: switch to training mode and cast the inputs once
# instead of repeating both on every epoch.
model.train()
train_inputs = X_train.float()

# Full-batch training: each epoch is a single forward/backward pass over the
# whole training set.
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Clear gradients left over from the previous step

    # Forward pass
    outputs = model(train_inputs)
    loss = criterion(outputs, y_train)

    # Backward pass and optimization
    loss.backward()  # Compute gradients
    optimizer.step()  # Update model parameters

    # Report the loss after every epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/650], Loss: 1.9572
Epoch [2/650], Loss: 1.9444
Epoch [3/650], Loss: 1.9319
Epoch [4/650], Loss: 1.9195
Epoch [5/650], Loss: 1.9073
Epoch [6/650], Loss: 1.8953
Epoch [7/650], Loss: 1.8835
Epoch [8/650], Loss: 1.8720
Epoch [9/650], Loss: 1.8606
Epoch [10/650], Loss: 1.8496
Epoch [11/650], Loss: 1.8387
Epoch [12/650], Loss: 1.8281
Epoch [13/650], Loss: 1.8178
Epoch [14/650], Loss: 1.8077
Epoch [15/650], Loss: 1.7979
Epoch [16/650], Loss: 1.7883
Epoch [17/650], Loss: 1.7790
Epoch [18/650], Loss: 1.7700
Epoch [19/650], Loss: 1.7612
Epoch [20/650], Loss: 1.7527
Epoch [21/650], Loss: 1.7445
Epoch [22/650], Loss: 1.7365
Epoch [23/650], Loss: 1.7287
Epoch [24/650], Loss: 1.7212
Epoch [25/650], Loss: 1.7139
Epoch [26/650], Loss: 1.7069
Epoch [27/650], Loss: 1.7000
Epoch [28/650], Loss: 1.6934
Epoch [29/650], Loss: 1.6870
Epoch [30/650], Loss: 1.6807
Epoch [31/650], Loss: 1.6747
Epoch [32/650], Loss: 1.6688
Epoch [33/650], Loss: 1.6632
Epoch [34/650], Loss: 1.6576
Epoch [35/650], Loss: 1

In [113]:
model.eval()  # Switch to evaluation mode
with torch.no_grad():  # No gradients are needed for scoring
    val_outputs = model(X_val.float())
    # The predicted class is the index of the largest logit in each row
    predicted = val_outputs.argmax(dim=1)

    # Fraction of validation samples whose prediction matches the label
    accuracy = (predicted == y_val).sum().item() / len(y_val)
    print(f'Validation Accuracy: {accuracy:.4f}')


Validation Accuracy: 0.6173
