# Installing Dependencies

In [1]:
pip -q install --upgrade ruptures scikit-learn tqdm matplotlib joblib

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at e:\swe\python3.11.5\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [2]:
import os
import zipfile
import shutil
from datetime import date, datetime, timedelta
import datetime as dt
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from scipy.stats import kurtosis, skew
from scipy.signal import find_peaks

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

import ruptures as rpt

from tqdm import tqdm
tqdm.pandas()

# Data Loading

### Setup Folders for Data Processing

In [3]:
model_training_data_path = 'Model_Training_Data/'
model_path = 'Models/'

In [4]:
os.makedirs(model_training_data_path, exist_ok=True)

### Load CSV Files

In [5]:
def all_data_csv(base_path):
    combined_df_original = pd.DataFrame()  # Initialize an empty DataFrame

    # Check if the folder exists
    if os.path.exists(base_path) and os.path.isdir(base_path):
        
        all_files = os.listdir(base_path)

        # Filter only CSV files
        csv_files = [file for file in all_files if file.endswith('.CSV')]

        for file in tqdm(csv_files, desc="Processing Files"):
            df_original = pd.read_csv(os.path.join(base_path, file))

            # Select desired columns
            selected_columns = ['Object Temperature(C)', 'Heart Rate Ear(BPM)', 'GSR', 'Stress_Score']
            df_original = df_original[selected_columns]

            # Concatenate into the combined DataFrame
            combined_df_original = pd.concat([combined_df_original, df_original], ignore_index=True)

    # Rename the columns
    combined_df_original = combined_df_original.rename(columns={
        'Object Temperature(C)': 'TEMP',
        'Heart Rate Ear(BPM)': 'HR',
        'GSR': 'EDA',
        'Stress_Score': 'STRESS_SCORE'
    })

    # Export the combined DataFrame to a CSV file
    combined_df_original.to_csv(f'{base_path}combined_collected_df_original.csv', index=False)
    print("CSV File saved successfully: ", combined_df_original.shape)

In [6]:
all_data_csv(model_training_data_path)

Processing Files: 100%|██████████| 38/38 [00:00<00:00, 189.57it/s]

CSV File saved successfully:  (12084, 4)





# Pre-processing

### Preprocess Raw Data

- remove rows with HR values of 0
- convert GSR values to EDA

In [7]:
# Convert GSR to EDA
def calculate_eda_value(gsr_value):
    Serial_Port_Reading = gsr_value
    resistance = ((1024 + 2 * Serial_Port_Reading) * 10000) / (512 - Serial_Port_Reading)
    eda_value = (1 / resistance) * 1e6
    return eda_value

In [8]:
def preprocess_csv(base_path):

    combined_df_original = pd.read_csv(base_path)

    # Filter rows where 'HR' column is not equal to 0
    preprocessed_df = combined_df_original[combined_df_original['HR'] != 0]
    
    # Convert GSR to EDA
    preprocessed_df['EDA'] = preprocessed_df['EDA'].apply(calculate_eda_value)
    
    return preprocessed_df

In [9]:
preprocessed_df = preprocess_csv(f'{model_training_data_path}combined_collected_df_original.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_df['EDA'] = preprocessed_df['EDA'].apply(calculate_eda_value)


### Feature Extraction

In [10]:
def statistical_features(arr):
    vmin = np.amin(arr)
    vmax = np.amax(arr)
    mean = np.mean(arr)
    std = np.std(arr)
    return vmin, vmax, mean, std

def shape_features(arr):
    skewness = skew(arr)
    kurt = kurtosis(arr)
    return skewness, kurt

def calculate_rms(signal):
    diff_squared = np.square(np.ediff1d(signal))
    rms_value = np.sqrt(np.mean(diff_squared))
    return rms_value

### Extended Features

- iterating with a step size of 20
- taking 40 rows at a time to generate a single row of df_features
- find_peaks() : identify peaks in the EDA signal (eda) using the function and then count the number of detected peaks using len() function

In [13]:
def extract_features(data):
    cols = [
        'EDA_Mean', 'EDA_Min', 'EDA_Max', 'EDA_Std', 'EDA_Kurtosis', 'EDA_Skew', 'EDA_Num_Peaks', 'EDA_Amplitude', 'EDA_Duration',
        'HR_Mean', 'HR_Min', 'HR_Max', 'HR_Std', 'HR_RMS', 'TEMP_Mean', 'TEMP_Min', 'TEMP_Max', 'TEMP_Std', 'STRESS_SCORE'
    ]

    df_features = pd.DataFrame(columns=cols)
    index = 0

    for i in tqdm(range(0, len(data['EDA']), 20), desc="Processing rows", leave=True):
        
        df_partial = data.iloc[i:i+40,]
        plen = len(df_partial['EDA'])

        if plen < 40:
            continue

        eda = df_partial['EDA'].values
        hr = df_partial['HR'].values
        temp = df_partial['TEMP'].values
        stress_score = df_partial['STRESS_SCORE'].values

        eda_min, eda_max, eda_mean, eda_std = statistical_features(eda)
        hr_min, hr_max, hr_mean, hr_std = statistical_features(hr)
        temp_min, temp_max, temp_mean, temp_std = statistical_features(temp)
        stress_score_min, stress_score_max, stress_score_mean, stress_score_std = statistical_features(stress_score)
        eda_skew, eda_kurtosis = shape_features(eda)

        hr_rms = calculate_rms(hr)
        temp_rms = calculate_rms(temp)

        peaks, properties = find_peaks(eda, width=5)
        num_Peaks = len(peaks)

        prominences = np.array(properties['prominences'])
        widths = np.array(properties['widths'])
        amplitude = np.sum(prominences)
        duration = np.sum(widths)

        df_features.loc[index] = [eda_mean, eda_min, eda_max, eda_std, eda_kurtosis, eda_skew, num_Peaks, amplitude,
                                  duration, hr_mean, hr_min, hr_max, hr_std, hr_rms, temp_mean, temp_min, temp_max, temp_std, stress_score_mean]

        index = index + 1

    return df_features

In [14]:
warnings.filterwarnings("ignore", category=RuntimeWarning)
df_features = extract_features(preprocessed_df)
warnings.resetwarnings()

Processing rows:  23%|██▎       | 134/595 [00:00<00:02, 224.55it/s]

Processing rows: 100%|██████████| 595/595 [00:02<00:00, 271.06it/s]


In [15]:
print(df_features.shape)

(593, 19)


### Lag Features

In [16]:
def generate_lag_features(input_df, columns, lags):
    cols = list(map(str, range(len(columns) * len(lags), 0, -1)))
    lag_df = pd.DataFrame(columns=cols)

    index = len(columns) * len(lags)

    for col in tqdm(columns, desc="Generating lag features", leave=True):
        for lag in tqdm(lags, desc=f"Lag features for {col}", leave=True):
            lagged_column = f'{index}'
            lag_df[lagged_column] = input_df[col].shift(lag)
            index -= 1
            
    return lag_df

In [17]:
cols = ['HR_Mean', 'TEMP_Mean', 'EDA_Mean']
lags = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

df_lag_features = generate_lag_features(df_features, cols, lags)

Lag features for HR_Mean: 100%|██████████| 10/10 [00:00<00:00, 999.64it/s]
Lag features for TEMP_Mean: 100%|██████████| 10/10 [00:00<00:00, 1110.99it/s]
Lag features for EDA_Mean: 100%|██████████| 10/10 [00:00<00:00, 1111.16it/s]
Generating lag features: 100%|██████████| 3/3 [00:00<00:00, 62.50it/s]


In [18]:
print(df_lag_features.shape)

(593, 30)


In [19]:
df_total = pd.concat([df_lag_features, df_features], axis=1)

In [20]:
print(df_total.shape)

(593, 49)


### Feature Scaling

In [21]:
def scale_features(df_total, feature_columns):
    scaled_df = df_total.copy()
    scaler = MinMaxScaler()
    scaled_df[feature_columns] = scaler.fit_transform(scaled_df[feature_columns])
    scaled_df = scaled_df.fillna(0)
    return scaled_df

In [22]:
def scale_label_regression(df_total, label_column):
    scaled_df = df_total.copy()
    scaler = MinMaxScaler()
    scaled_df[label_column] = scaler.fit_transform(scaled_df[label_column].values.reshape(-1, 1))
    scaled_df = scaled_df.fillna(0)
    return scaled_df

In [23]:
def scale_label_classifier(df_total, label_column):
    df_total[label_column] = df_total[label_column].apply(lambda x: 0 if x <= 3.25 else (1 if 3.25 < x <= 6.5 else 2))
    return df_total

In [24]:
feature_cols = df_total.columns[:48]
df_total_scaled_r = scale_features(df_total, feature_cols)
df_total_scaled_r = scale_label_regression(df_total_scaled_r, 'STRESS_SCORE')

In [25]:
feature_cols = df_total.columns[:48]
df_total_scaled_c = scale_features(df_total, feature_cols)
df_total_scaled_c = scale_label_classifier(df_total_scaled_c, 'STRESS_SCORE')

Final CSV File for Training

In [26]:
df_total_scaled_r.to_csv(f'{model_training_data_path}scaled_collected_df_total_regression.csv', index=False)
df_total_scaled_c.to_csv(f'{model_training_data_path}scaled_collected_df_total_classifier.csv', index=False)

In [27]:
print(df_total_scaled_r.shape, df_total_scaled_c.shape)

(593, 49) (593, 49)


# Stress detection model

In [28]:
def load_model_from_folder(folder_path, keyword):

    all_files = os.listdir(folder_path)
    keyword_files = [file for file in all_files if keyword in file and "Retrained" not in file and file.endswith(".pkl")]

    if len(keyword_files) > 0:
        
        # Sort files based on date_time in the filename
        sorted_files = sorted(
            keyword_files,
            key=lambda x: datetime.strptime("_".join(x.split("_")[-3:]).replace(".pkl", ""), "%Y_%m_%d"),
            reverse=True
        )

        selected_file = sorted_files[0]
        file_path = os.path.join(folder_path, selected_file)

        # Load the model from the selected pkl file
        with open(file_path, 'rb') as file:
            loaded_model = joblib.load(file)

        print(f"Loaded model from {selected_file}")
        return loaded_model
    else:
        print(f"No pkl file with '{keyword}' in its name found in the folder.")
        return None

### Regression Model

In [29]:
data = pd.read_csv(f'{model_training_data_path}scaled_collected_df_total_regression.csv')

X_r = data.iloc[:,0:48] # features
Y_r = data.iloc[:,48:49] # labels

X_train_r, X_val_r, Y_train_r, Y_val_r = train_test_split(X_r, Y_r, test_size=0.33, random_state=30)

In [30]:
regressor = load_model_from_folder(model_path, keyword="Regressor")

Loaded model from Regressor_Date_Time_2024_01_15.pkl


In [31]:
regressor.fit(X_train_r, Y_train_r.values.ravel())

### Classification Model

In [32]:
data = pd.read_csv(f'{model_training_data_path}scaled_collected_df_total_classifier.csv')

X_c = data.iloc[:,0:48] # features
Y_c = data.iloc[:,48:49] # labels

X_train_c, X_val_c, Y_train_c, Y_val_c = train_test_split(X_c, Y_c, test_size=0.33, random_state=30)

In [33]:
clf = load_model_from_folder(model_path, keyword="Classifier")

Loaded model from Classifier_Date_Time_2024_01_15.pkl


In [34]:
clf.fit(X_train_c, Y_train_c.values.ravel())

### Saving Trained Models

In [35]:
date_format = '%Y_%m_%d'  # Format for extracting only the date
current_date_time_dt = dt.datetime.now()  # Current Date and Time in a DateTime Object.
current_date_string = dt.datetime.strftime(current_date_time_dt, date_format) 

In [36]:
# naming system for model
model_file_name_r = f'Retrained_Regressor_Date_Time_{current_date_string}.pkl'
model_save_path_r = model_path + model_file_name_r

# saving the model
with open(model_save_path_r, 'wb') as file:
    pickle.dump(regressor, file)

In [37]:
# naming system for model
model_file_name_c = f'Retrained_Classifier_Date_Time_{current_date_string}.pkl'
model_save_path_c = model_path + model_file_name_c

# saving the model
with open(model_save_path_c, 'wb') as file:
    pickle.dump(clf, file)

# Model Analysis

### Regression Model

In [38]:
Y_pred_r = regressor.predict(X_val_r)

In [39]:
score = regressor.score(X_val_r, Y_val_r)
print(f'R-squared Score: {score}')

R-squared Score: 0.8213688475090511


### Classifier Model

In [40]:
Y_pred_c = clf.predict(X_val_c)

In [41]:
f1score   = f1_score        (Y_val_c, Y_pred_c, average = 'macro')
recall    = recall_score    (Y_val_c, Y_pred_c, average = 'macro')
precision = precision_score (Y_val_c, Y_pred_c, average = 'macro')
accuracy  = accuracy_score  (Y_val_c, Y_pred_c)

print('acc =', accuracy)
print('pre =', precision)
print('recall =', recall) 
print('f1 =', f1score)

acc = 0.9795918367346939
pre = 0.9908045977011494
recall = 0.9547619047619048
f1 = 0.9718656136566585
