<a href="https://colab.research.google.com/github/kjspring/stress-detection-wearable-devices/blob/main/modeling_WESAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load pickled data

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os

#! cd /content/drive/MyDrive/stress-prediction/ # Main file directory for this notebook on Google Drive
os.chdir('/content/drive/MyDrive/stress-prediction')
# absolute path of data directory
PATH = os.path.join(os.path.abspath(os.getcwd()), 'data') # Path of data folder on Google Drive

import joblib
data = joblib.load(f"{PATH}/pickle/WESAD_data_model.pickle") # read pickle file
labels = joblib.load(f"{PATH}/pickle/WESAD_labels_model.pickle") # read pickle file

Mounted at /content/drive


In [2]:
# Subsample the data and features
Hz_chest = 700
Hz_EDA = 4
subsample_rate = int(Hz_chest / Hz_EDA)  # subsample rate, e.g. to reduce from 700Hz to 4Hz

subsampled_data = data[::subsample_rate]
subsampled_labels = labels[::subsample_rate]

print(len(data))  # 2742499
print(len(subsampled_data))  # 15672

2742499
15672


In [19]:
# Check for NaN values
subsampled_data.isnull().values.any()

False

In [28]:
# Try with a Datagenerator to reduce the RAM use
from keras.preprocessing.sequence import TimeseriesGenerator
random_state = 42
sampling_rate = 5 # keep one data point out of 5
duration = 1 # how many minutes in the future the target after the end of the sequence
sequence_length = Hz_EDA * 60 * duration # observations will go back duration minutues
delay = sampling_rate*(sequence_length + duration*60*Hz_EDA - 1) # the target for a
                                                             # sequence will be
                                                             # duration (min)
                                                             # after the end of
                                                             # the sequence
batch_size = 64
shuffle = True

# Train Test Split
from sklearn.model_selection import train_test_split

X_dat, X_val, y_dat, y_val = train_test_split(subsampled_data, subsampled_labels, 
                                                 test_size = 0.2,
                                                 random_state=random_state)

X_train, X_test, y_train, y_test = train_test_split(X_dat, y_dat,
                                                    test_size = 0.2,
                                                    random_state = random_state)


# Normalize the data
from sklearn.preprocessing import StandardScaler
# create the StandardScaler object
scaler = StandardScaler()
# fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train.values.reshape(-1,1))
# transform the validation data
X_val_scaled = scaler.transform(X_val.values.reshape(-1,1))
# transform the test data
X_test_scaled = scaler.transform(X_test.values.reshape(-1,1))

# Data Generator
train_data_gen = TimeseriesGenerator(X_train_scaled, y_train, length=sequence_length, batch_size=batch_size)
val_data_gen = TimeseriesGenerator(X_val_scaled, y_val, length=sequence_length, batch_size=batch_size)
test_data_gen = TimeseriesGenerator(X_test_scaled, y_test, length=sequence_length, batch_size=batch_size)

In [29]:
print(train_data_gen[0][0].shape) # prints the batch size of the first entry
'''
A tensor of shape (32, 1200, 1) means that it is a 3-dimensional tensor with 32 
rows, 240 columns and 1 channel. In this specific case, it could represent a 
batch of 32 time series samples, each with 1200 time steps and 1 feature/channel.

The first dimension (32) represents the batch size, which is the number of 
samples that are processed at once during training. The second dimension (1200) 
represents the time steps or the sequence length of each sample, and the third 
dimension (1) represents the number of features or channels in each sample.
'''

(64, 240, 1)


'\nA tensor of shape (32, 1200, 1) means that it is a 3-dimensional tensor with 32 \nrows, 240 columns and 1 channel. In this specific case, it could represent a \nbatch of 32 time series samples, each with 1200 time steps and 1 feature/channel.\n\nThe first dimension (32) represents the batch size, which is the number of \nsamples that are processed at once during training. The second dimension (1200) \nrepresents the time steps or the sequence length of each sample, and the third \ndimension (1) represents the number of features or channels in each sample.\n'

In [36]:
train_iterator = iter(train_data_gen)
x_batch, y_batch = next(train_iterator)
print(x_batch.shape)
print(y_batch.shape)

# Validation Data
val_iterator = iter(val_data_gen)
x_batch, y_batch = next(val_iterator)
print(x_batch.shape)
print(y_batch.shape)

(64, 240, 1)
(64,)
(64, 240, 1)
(64,)


In [32]:
# Naive Model
def evaluate_naive_method(dataset):
  '''
  This method uses a common-sense approach to predict that the subject will be
  in the same state 5 minutes from now as they are in now.
  '''

In [37]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Define the LSTM model
model = Sequential()
#model.add(Dense(batch_size, activation='relu', input_shape=(sequence_length, 1)))
model.add(LSTM(64, 
               activation='relu', 
               #stateful = True, # To save RAM use,
               batch_input_shape=(batch_size, sequence_length, 1))) # Broke after adding Dense
               #unroll=True, # unroll the dropout to speed runtime
               #recurrent_dropout=0.5)) # Add dropout
model.add(Dropout(0.5)) # Dropout
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Callback and Early Stopping
#callbacks = [EarlyStopping(monitor='val_loss', patience=3),
#             ModelCheckpoint('models/LSTMmodel_best.keras',
#                  save_best_only=True)
#]

# Compile
model.compile(loss='binary_crossentropy', 
              optimizer=Adam(learning_rate=0.00001), 
              metrics=['binary_accuracy'])

# Train the model using the TimeSeriesGenerator
# train the model
epochs = 10
#total_epochs = 0
#for i in range(epochs):
#    for j in range(len(train_data_gen)):
#      print('Epoch', total_epochs+1, '/', epochs)
#      model.fit(train_data_gen, epochs=1, validation_data=val_data_gen, 
#                verbose=1, shuffle=False, callbacks=callbacks)
#      model.reset_states() # Need to reset since using LSTM stateful
#      total_epochs += 1





In [38]:
model.fit(train_data_gen, validation_data=val_data_gen,
          shuffle=False)#, callbacks=callbacks)



InvalidArgumentError: ignored

In [None]:
# Evaluate the model on the validation and test dataset

score = model.evaluate(val_data_gen)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])


test_loss = model.evaluate(test_data_gen)
print('Test loss:', test_loss)
print('Test accuracy:', score[1])