In [45]:
# Imports 
import scipy.io
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import os

# ECG Features

In [46]:
# Location of the ECG feature file, relative to the notebook.
ecg_path = '../ASCERTAIN_Features/Dt_ECGFeatures.mat'
# loadmat returns a dict mapping each stored variable name to its array.
mat = scipy.io.loadmat(file_name=ecg_path)

In [47]:
# List the variables stored in the loaded .mat file.
key_names = [*mat.keys()]
print(key_names)
# Position 4 of the key listing is used as the ECG feature array.
key = key_names[4]
# Sizes follow the nesting mat[key][0][participant][recording][feature].
participants_num, recording_num, features_num = (
    len(mat[key][0]),
    len(mat[key][0][0]),
    len(mat[key][0][0][0]),
)

['__header__', '__version__', '__globals__', 'ECGFailures_58', 'ECGFeatures_58']


In [48]:
ecg_feature_names = [
    "low_freq_psd_1", "low_freq_psd_2", "low_freq_psd_3", "low_freq_psd_4", "low_freq_psd_5",
    "low_freq_psd_6", "low_freq_psd_7", "low_freq_psd_8", "low_freq_psd_9","low_freq_psd_10",
    "slow_response_pds_1", "slow_response_pds_2", "slow_response_pds_3", "slow_response_pds_4",
    "ibi_1", "ibi_2", "ibi_3", "ibi_4", "ibi_5", "ibi_6",
    "hr_1", "hr_2", "hr_3", "hr_4", "hr_5", "hr_6",
    "hrv_1", "hrv_2", "hrv_3", "hrv_4", "hrv_5", "hrv_6"
]
ecg_dict = {key: [] for key in ecg_feature_names} 
ecg_participants_data = [ecg_dict for _ in range(participants_num)]

In [49]:
for p in range(participants_num):
    for r in range(recording_num):
        for f in range(features_num):
            ecg_participants_data[p][ecg_feature_names[f]].append(mat[key][0][p][r][f])

# Self Report Data

In [50]:
self_reports_path = '../ASCERTAIN_Features/Dt_SelfReports.mat'
mat = scipy.io.loadmat(self_reports_path)

In [51]:
key_names = list(mat.keys())
print(key_names)
key = list(key_names)[5]
emotions_num = len(mat[key])
participants_num = len(mat[key][0])
recording_num = len(mat[key][0][0])

['__header__', '__version__', '__globals__', 'ClipNumbers', 'Length', 'Ratings']


In [52]:
emotions = ["Arousal", "Valence", "Engagement", "Liking", "Familiarity"]
emo_dict = {emo: [] for emo in emotions}
self_reports_participants_data = [emo_dict for _ in range(participants_num)]

In [53]:
features = []
for e in range(emotions_num):
    for p in range(participants_num):
        for r in range(recording_num):
            self_reports_participants_data[p][emotions[e]].append(mat[key][e][p][r])

# Personality Traits

In [54]:
# Location of the personality trait file, relative to the notebook.
personality_path = '../ASCERTAIN_Features/Dt_Personality.mat'
# loadmat returns a dict mapping each stored variable name to its array.
mat = scipy.io.loadmat(file_name=personality_path)

In [55]:
# List the variables stored in the loaded .mat file.
key_names = [*mat.keys()]
print(key_names)
# Position 3 of the key listing is the personality array (see the printed keys).
key = key_names[3]
# The array is indexed as [participant][trait].
participants_num, personalities_num = len(mat[key]), len(mat[key][0])

['__header__', '__version__', '__globals__', 'Personality']


In [56]:
# Trait names, in the order they are indexed along the second axis of the
# loaded personality array.
traits = [
    "Extraversion",
    "Agreeableness",
    "Conscientiousness",
    "Emotional Stability",
    "Openness",
]
# Template mapping every trait name to an empty list.
traits_dict = {trait: [] for trait in traits}
# Give each participant its OWN dict holding its OWN lists. Repeating the single
# `traits_dict` object would make every slot alias the same lists, so each
# participant would end up holding the scores of all participants.
personality_participants_data = [
    {trait: [] for trait in traits_dict} for _ in range(participants_num)
]

In [57]:
# Store each participant's trait scores under the matching trait name.
for par, trait_lists in enumerate(personality_participants_data):
    scores = mat[key][par]
    for per in range(personalities_num):
        trait_lists[traits[per]].append(scores[per])

# Format into dataframes

In [58]:
# ECG
# One row per participant; each cell holds that participant's list of values.
ecg_df = pd.DataFrame(ecg_participants_data)
# Expand every list cell into one row per value. Exploded rows keep the index
# label of the row they came from, i.e. the participant number.
ecg_df = ecg_df.apply(lambda col: col.explode())
# Expose the participant number as a regular column. This is done after the
# explode (as in the self-report and personality cells) so no short column has
# to be aligned against the exploded index.
ecg_df['participants'] = ecg_df.index
ecg_df = ecg_df[['participants'] + ecg_feature_names]

In [59]:
# self reports
# Build one row per participant, expand each list cell into one row per value,
# then expose the (repeated) participant index as the leading column.
self_reports_df = (
    pd.DataFrame(self_reports_participants_data)
    .apply(lambda col: col.explode())
    .assign(participants=lambda frame: frame.index)
    .loc[:, ['participants'] + emotions]
)

In [60]:
# personalities
# Build one row per participant, expand each list cell into one row per value,
# then expose the (repeated) participant index as the leading column.
personalities_df = (
    pd.DataFrame(personality_participants_data)
    .apply(lambda col: col.explode())
    .assign(participants=lambda frame: frame.index)
    .loc[:, ['participants'] + traits]
)

In [61]:
# Combine self reports and ecg features
# The ECG frame already carries the 'participants' column, so drop the duplicate
# from the self reports. errors='ignore' keeps this cell re-runnable: on a second
# run the column is already gone and a plain drop would raise KeyError.
self_reports_df = self_reports_df.drop(columns=['participants'], errors='ignore')
# Place the rating columns next to the ECG feature columns.
features_df = pd.concat([ecg_df, self_reports_df], axis=1)

In [62]:
# Create the output folder (no error if it already exists)
data_path = "../data"
os.makedirs(data_path, exist_ok=True)
# Write each frame to its own csv, without the index column
for frame, filename in (
    (features_df, "features.csv"),
    (personalities_df, "personalities.csv"),
):
    frame.to_csv(os.path.join(data_path, filename), index=False)

In [63]:
# Preprocess features
def _drop_uninformative_columns(frame):
    """Return `frame` limited to columns with at least one non-zero and at least one non-NaN value."""
    has_nonzero = (frame != 0).any(axis=0)
    has_value = frame.notna().any(axis=0)
    return frame.loc[:, has_nonzero & has_value]


# First pass: discard columns that are entirely zero or entirely NaN.
preprocessed_features_df = _drop_uninformative_columns(features_df)
# Threshold is half the number of remaining columns (integer division).
threshold = len(preprocessed_features_df.columns) // 2
# dropna(thresh=...) keeps only the rows with at least `threshold` non-NaN values.
preprocessed_features_df = preprocessed_features_df.dropna(thresh=threshold)
# Second pass: removing rows can leave further columns entirely zero or NaN.
preprocessed_features_df = _drop_uninformative_columns(preprocessed_features_df)
# Drop every row that still has a NaN - this is strict but could be removed later on.
preprocessed_features_df = preprocessed_features_df.dropna()
# NOTE(review): the 'participants' identifier column is part of features_df and
# is therefore scaled together with the feature columns - confirm this is intended.
scaler = MinMaxScaler()
# Rescale each column to the 0-1 range; the result is rebuilt as a frame with a fresh index.
scaled_values = scaler.fit_transform(preprocessed_features_df)
preprocessed_features_df = pd.DataFrame(scaled_values, columns=preprocessed_features_df.columns)


# Save to csv
preprocessed_features_df.to_csv(os.path.join(data_path, "preprocessed_features.csv"), index=False)

The preprocessing above removes every column that is entirely NaN or entirely zero, drops any row that still contains a NaN, and then scales all remaining values to the range 0 to 1.

In practice, this removes all of the PSD features and some of the IBI features.


This could be simplified by first removing all rows that contain NaNs and then removing the columns that are all zeros. The method above is kept because it allows us to pull out the different parts of the process.
