# Step 1: Load and Preprocess Data

In [1]:
%pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.17,>=2.16 (from tf-keras)
  Downloading tensorflow-2.16.2-cp312-cp312-macosx_10_15_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow<2.17,>=2.16->tf-keras)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow<2.17,>=2.16->tf-keras)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow<2.17,>=2.16->tf-keras)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow<2.17,>=2.16->tf-keras)
 

In [5]:
import pandas as pd
from datetime import datetime, timedelta
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [4]:
# IMAC
aruba_data_path = '/Users/harrisonkirstein/Documents/GitHub/CSCI-4380-Honors-Option-Repo/CSCI 4380 Honors Option Project/Datasets/aruba/data'


# LAPTOP
# aruba_data_path = '/Users/harrisonkirstein/Desktop/CSCI-4380-Honors-Option-Repo/CSCI 4380 Honors Option Project/Datasets/aruba/data'


# Load dataset with variable columns
aruba_data = pd.read_csv(
    aruba_data_path, 
    header=None, 
    names=['Date', 'Time', 'Sensor', 'Value', 'Activity', 'Begin_End'], 
    delim_whitespace=True,
    engine='python'
)

# Combine Date and Time into a single timestamp column
aruba_data['Date_Time'] = pd.to_datetime(aruba_data['Date'] + ' ' + aruba_data['Time'], errors='coerce')
aruba_data.drop(columns=['Date', 'Time'], inplace=True)

# Fill missing columns with NaN for rows without activity labels
aruba_data.fillna('', inplace=True)

# Preview the dataset
aruba_data.head()

  aruba_data = pd.read_csv(


Unnamed: 0,Sensor,Value,Activity,Begin_End,Date_Time
0,M003,ON,Sleeping,begin,2010-11-04 00:03:50.209589
1,M003,OFF,,,2010-11-04 00:03:57.399391
2,T002,21.5,,,2010-11-04 00:15:08.984841
3,T003,21,,,2010-11-04 00:30:19.185547
4,T004,21,,,2010-11-04 00:30:19.385336


In [5]:
aruba_data[aruba_data['Sensor'] == 'c']

Unnamed: 0,Sensor,Value,Activity,Begin_End,Date_Time
1476693,c,OFF,,,2011-05-10 18:42:45.169231


# Step 2: Create Activity Windows

In [6]:
# Group sensor data into activity windows
def create_activity_windows(data):
    windows = []
    current_window = []
    current_activity = None

    for _, row in data.iterrows():
        if 'begin' in row['Begin_End']:
            current_activity = row['Activity']
            current_window = []
        
        if current_activity:
            current_window.append(row)
        
        if 'end' in row['Begin_End']:
            if current_window:
                windows.append((current_activity, pd.DataFrame(current_window)))
            current_activity = None
            current_window = []

    return windows

activity_windows = create_activity_windows(aruba_data)
print(f"Number of activity windows: {len(activity_windows)}")

Number of activity windows: 6441


## Helper Functions for Generating TDOST Descriptions

In [7]:
def get_sensor_type(sensor_id):
    sensor_mapping = {
        'M': 'Motion',
        'D': 'Door',
        'T': 'Temperature',
        'L': 'Light',
        'I': 'Item'
    }
    # Extract the first character of the sensor ID to determine the type
    sensor_type_code = sensor_id[0]
    # Return the corresponding sensor type, or 'Unknown' if not mapped
    return sensor_mapping.get(sensor_type_code, 'Unknown')

In [8]:
def get_sensor_location(sensor_id):
    # Dictionary mapping sensor numbers to locations
    location_mapping = {
        'M001': 'Bedroom Near Closet',
        'M002': 'Bedroom',
        'M003': 'Bedroom',
        'M004': 'Bedroom near bathroom',
        'M005': 'Bedroom Hallway',
        'M006': 'Bedroom Door',
        'M007': 'Bedroom',
        'M008': 'Hallway',
        'M009': 'Living Room Couch',
        'M010': 'Living Room Couch',
        'M011': 'Front Door',
        'M012': 'Living Room',
        'M013': 'Living Room',
        'M014': 'Dining Room',
        'M015': 'Kitchen Stove',
        'M016': 'Back Door',
        'M017': 'Back Door',
        'M018': 'Kitchen Entry',
        'M019': 'Kitchen',
        'M021': 'Hallway 2',
        'M022': 'Hallway 3',
        'M023': 'Bedroom 2 Door',
        'M024': 'Bedroom 2',
        'M025': 'Office Tabke',
        'M026': 'Office Desk',
        'M027': 'Office',
        'M029': 'Bathroom Door',
        'M030': 'Garage Door',
        'M031': 'Housekeeping Closet',
        'D001': 'Front Door',
        'D002': 'Back Door',
        'D003': 'Closet Door',
        'D004': 'Garage Door',
        'T001': 'Bedroom',
        'T002': 'Living Room',
        'T003': 'Kitchen',
        'T004': 'Hallway'
    }
    
    
    # Get the location based on the numeric part, or default to "Other"
    return location_mapping.get(sensor_id, 'Other')


In [9]:
from datetime import datetime

def timestamp_to_words(timestamp):
    # Parse the timestamp

    # Ensure the input is a datetime object
    if isinstance(timestamp, pd.Timestamp):
        dt = timestamp.to_pydatetime()
    elif isinstance(timestamp, str):
        # Attempt to parse with milliseconds first
        try:
            dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")
        except ValueError:
            # Fall back to parsing without milliseconds
            dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    else:
        raise TypeError("Unsupported timestamp type")
    
    # Extract hour, minute, and determine AM/PM
    hour = dt.hour
    minute = dt.minute
    period = "AM" if hour < 12 else "PM"
    
    # Adjust hour for 12-hour format
    hour = hour % 12 or 12  # 0 becomes 12 for AM/PM format

    # Convert hour and minute to words
    hour_text = num_to_words(hour)
    minute_text = num_to_words(minute)

    # Form the final text
    return f"{hour_text} hours {minute_text} minutes {period}"

def num_to_words(n):
    # Dictionary to convert numbers to words for 0-59
    words = {
        0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five",
        6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten",
        11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
        15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen",
        19: "nineteen", 20: "twenty", 21: "twenty-one", 22: "twenty-two",
        23: "twenty-three", 24: "twenty-four", 25: "twenty-five",
        26: "twenty-six", 27: "twenty-seven", 28: "twenty-eight",
        29: "twenty-nine", 30: "thirty", 31: "thirty-one", 32: "thirty-two",
        33: "thirty-three", 34: "thirty-four", 35: "thirty-five",
        36: "thirty-six", 37: "thirty-seven", 38: "thirty-eight",
        39: "thirty-nine", 40: "forty", 41: "forty-one", 42: "forty-two",
        43: "forty-three", 44: "forty-four", 45: "forty-five",
        46: "forty-six", 47: "forty-seven", 48: "forty-eight",
        49: "forty-nine", 50: "fifty", 51: "fifty-one", 52: "fifty-two",
        53: "fifty-three", 54: "fifty-four", 55: "fifty-five",
        56: "fifty-six", 57: "fifty-seven", 58: "fifty-eight",
        59: "fifty-nine"
    }
    return words.get(n, "")


# Step 3: Generate TDOST Descriptions for Each Event

In [10]:
# Generate TDOST for each event
def generate_tdost_basic(row):
    curr_sensor_time = timestamp_to_words(row['Date_Time'])
    # timestamp = row['Date_Time'].strftime('%H:%M:%S')
    curr_sensor_type = get_sensor_type(row['Sensor'])
    curr_sensor_location = get_sensor_location(row['Sensor'])

    curr_sensor_value = row['Sensor']
    if curr_sensor_value != 'ON' or curr_sensor_value != 'OFF':
        curr_sensor_value = num_to_words(curr_sensor_value)

    description = f"At approximately {curr_sensor_time}, {curr_sensor_type} sensor in {curr_sensor_location} fired with value {curr_sensor_value}."
    return description

def process_windows(windows):
    processed = []
    for activity, window in windows:
        window['TDOST_Basic'] = window.apply(generate_tdost_basic, axis=1)
        processed.append((activity, window))
    return processed

processed_windows = process_windows(activity_windows)
processed_windows[0][1].head()
# Show a sample processed window

Unnamed: 0,Sensor,Value,Activity,Begin_End,Date_Time,TDOST_Basic
0,M003,ON,Sleeping,begin,2010-11-04 00:03:50.209589,At approximately twelve hours three minutes AM...
1,M003,OFF,,,2010-11-04 00:03:57.399391,At approximately twelve hours three minutes AM...
2,T002,21.5,,,2010-11-04 00:15:08.984841,At approximately twelve hours fifteen minutes ...
3,T003,21,,,2010-11-04 00:30:19.185547,At approximately twelve hours thirty minutes A...
4,T004,21,,,2010-11-04 00:30:19.385336,At approximately twelve hours thirty minutes A...


In [11]:
processed_windows[0][1]['TDOST_Basic'].iloc[:5]

0    At approximately twelve hours three minutes AM...
1    At approximately twelve hours three minutes AM...
2    At approximately twelve hours fifteen minutes ...
3    At approximately twelve hours thirty minutes A...
4    At approximately twelve hours thirty minutes A...
Name: TDOST_Basic, dtype: object

# Step 4: Encode TDOST Descriptions and Labels

In [12]:
# Load pre-trained Sentence Transformer
sentence_model = SentenceTransformer('all-distilroberta-v1')

# Encode descriptions and labels
X, y = [], []
label_encoder = LabelEncoder()

windows_len = len(processed_windows)
count = 0
for activity, window in processed_windows:
    tdost_embeddings = sentence_model.encode(window['TDOST_Basic'].tolist())
    X.append(tdost_embeddings)
    y.append(activity)
    count = count + 1
    print(f"{count} / {windows_len}")

y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)
print(f"Encoded {len(X)} windows with {num_classes} activity classes.")

1 / 6441
2 / 6441
3 / 6441
4 / 6441
5 / 6441
6 / 6441
7 / 6441
8 / 6441
9 / 6441
10 / 6441
11 / 6441
12 / 6441
13 / 6441
14 / 6441
15 / 6441
16 / 6441
17 / 6441
18 / 6441
19 / 6441
20 / 6441
21 / 6441
22 / 6441
23 / 6441
24 / 6441
25 / 6441
26 / 6441
27 / 6441
28 / 6441
29 / 6441
30 / 6441
31 / 6441
32 / 6441
33 / 6441
34 / 6441
35 / 6441
36 / 6441
37 / 6441
38 / 6441
39 / 6441
40 / 6441
41 / 6441
42 / 6441
43 / 6441
44 / 6441
45 / 6441
46 / 6441
47 / 6441
48 / 6441
49 / 6441
50 / 6441
51 / 6441
52 / 6441
53 / 6441
54 / 6441
55 / 6441
56 / 6441
57 / 6441
58 / 6441
59 / 6441
60 / 6441
61 / 6441
62 / 6441
63 / 6441
64 / 6441
65 / 6441
66 / 6441
67 / 6441
68 / 6441
69 / 6441
70 / 6441
71 / 6441
72 / 6441
73 / 6441
74 / 6441
75 / 6441
76 / 6441
77 / 6441
78 / 6441
79 / 6441
80 / 6441
81 / 6441
82 / 6441
83 / 6441
84 / 6441
85 / 6441
86 / 6441
87 / 6441
88 / 6441
89 / 6441
90 / 6441
91 / 6441
92 / 6441
93 / 6441
94 / 6441
95 / 6441
96 / 6441
97 / 6441
98 / 6441
99 / 6441
100 / 6441
101 / 64

In [14]:
X

[array([[-0.04054836, -0.02934076,  0.01642328, ...,  0.03683609,
          0.05173673, -0.02706542],
        [-0.04054836, -0.02934076,  0.01642328, ...,  0.03683609,
          0.05173673, -0.02706542],
        [-0.04490937, -0.07860418, -0.02047596, ...,  0.0761327 ,
          0.0381832 , -0.0164044 ],
        ...,
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952],
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952],
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952]], dtype=float32),
 array([[-0.02423953, -0.03504221,  0.00394174, ...,  0.03727894,
          0.04680564, -0.03333819],
        [-0.02466748, -0.03122601,  0.01941117, ...,  0.03855982,
          0.04740404, -0.03506247],
        [-0.04076015, -0.01877285,  0.0168035 , ...,  0.04539965,
          0.05319783, -0.02392951],
        ...,
        [-0.04639434, -0.0122307 ,  0.0

In [15]:
from joblib import dump

# Save variables
dump({'X': X, 'y': y, 'y_encoded': y_encoded, 'num_classes': num_classes}, 'processed_data.joblib')

print("Data saved successfully!")


Data saved successfully!


In [2]:
from joblib import load

# Load variables
data = load('processed_data.joblib')

X = data['X']
y = data['y']
y_encoded = data['y_encoded']
num_classes = data['num_classes']

print("Data loaded successfully!")


Data loaded successfully!


In [3]:
X

[array([[-0.04054836, -0.02934076,  0.01642328, ...,  0.03683609,
          0.05173673, -0.02706542],
        [-0.04054836, -0.02934076,  0.01642328, ...,  0.03683609,
          0.05173673, -0.02706542],
        [-0.04490937, -0.07860418, -0.02047596, ...,  0.0761327 ,
          0.0381832 , -0.0164044 ],
        ...,
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952],
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952],
        [-0.04076015, -0.01877285,  0.01680348, ...,  0.04539964,
          0.05319785, -0.02392952]], dtype=float32),
 array([[-0.02423953, -0.03504221,  0.00394174, ...,  0.03727894,
          0.04680564, -0.03333819],
        [-0.02466748, -0.03122601,  0.01941117, ...,  0.03855982,
          0.04740404, -0.03506247],
        [-0.04076015, -0.01877285,  0.0168035 , ...,  0.04539965,
          0.05319783, -0.02392951],
        ...,
        [-0.04639434, -0.0122307 ,  0.0

# Step 5: Pad Sequences and Split Data

In [2]:
%pip install tensorflow-macos

[31mERROR: Could not find a version that satisfies the requirement tensorflow-macos (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-macos[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to make them uniform length
max_length = max(len(seq) for seq in X)
X_padded = pad_sequences(X, maxlen=max_length, dtype='float32', padding='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

In [1]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("Is GPU available:", tf.config.list_physical_devices('GPU'))

2024-11-20 21:01:16.291120: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.16.2
Is GPU available: []


In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.utils import to_categorical

In [8]:
# Convert labels to categorical
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Step 6: Build and Train the LSTM Model

In [9]:

# Build model
model = Sequential([
    Bidirectional(LSTM(64, return_sequences=False), input_shape=(max_length, X_padded.shape[2])),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])


  super().__init__(**kwargs)


In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
# Train model
history = model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), epochs=20, batch_size=32)

Epoch 1/20
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6666 - loss: 1.1058

: 

# Step 7: Evaluate and Test the Model

In [None]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Test on new window (replace with real test window)
sample_window = X_test[0].reshape(1, max_length, -1)
prediction = model.predict(sample_window)
predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
print(f"Predicted Activity: {predicted_label[0]}")

In [1]:
X

NameError: name 'X' is not defined

In [2]:
X_padded

NameError: name 'X_padded' is not defined