In [1]:
# Imports 
import scipy.io
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import os

# ECG Features

In [2]:
# Load the ECG feature file (path is relative to the working directory).
# `mat` is the mapping returned by loadmat; its keys are listed in the next cell.
ecg_path = '../ASCERTAIN_Features/Dt_ECGFeatures.mat'
mat = scipy.io.loadmat(ecg_path)

In [3]:
# Show which variables the .mat file holds, then select the fifth entry
# (the printed listing shows this is 'ECGFeatures_58').
key_names = list(mat)
print(key_names)
key = key_names[4]

# The array is indexed as [0][participant][recording][feature]; read the
# size of each level from its first element.
ecg_participants = mat[key][0]
participants_num = len(ecg_participants)
recording_num = len(ecg_participants[0])
features_num = len(ecg_participants[0][0])

['__header__', '__version__', '__globals__', 'ECGFailures_58', 'ECGFeatures_58']


In [4]:
# Column names for the ECG features, generated as "<group>_<1..count>" in the
# same order the values are later read out of the feature array.
# (The "pds" spelling is kept as-is so the resulting column names do not change.)
ecg_feature_groups = [
    ("low_freq_psd", 10),
    ("slow_response_pds", 4),
    ("ibi", 6),
    ("hr", 6),
    ("hrv", 6),
]
ecg_feature_names = [
    f"{group}_{idx}"
    for group, count in ecg_feature_groups
    for idx in range(1, count + 1)
]
# An empty list per feature name, and the row accumulator used when building the dataframe
ecg_dict = dict((name, []) for name in ecg_feature_names)
ecg_participants_data = []

In [5]:
# Flatten the nested ECG array into one row per (participant, recording):
# [participant index, recording index, feature values...].
# The row list is rebuilt here instead of being appended to, so re-running
# this cell cannot stack duplicate rows on top of an earlier run's result.
ecg_features = mat[key][0]
ecg_participants_data = [
    [p, r] + [ecg_features[p][r][f] for f in range(features_num)]
    for p in range(participants_num)
    for r in range(recording_num)
]

# Feature values are paired positionally with ecg_feature_names
column_names = ["participants", "recordings"] + ecg_feature_names
ecg_df = pd.DataFrame(ecg_participants_data, columns=column_names)

# Self Report Data

In [6]:
# Load the self-report file. Note that this rebinds `mat`, replacing the ECG
# data loaded earlier, so the cells below must run after the ECG section.
self_reports_path = '../ASCERTAIN_Features/Dt_SelfReports.mat'
mat = scipy.io.loadmat(self_reports_path)

In [7]:
# Show the variables in the self-report file and select the sixth entry
# (the printed listing shows this is 'Ratings').
key_names = list(mat)
print(key_names)
key = key_names[5]

# The ratings array is indexed as [emotion][participant][recording]
ratings = mat[key]
emotions_num = len(ratings)
participants_num = len(ratings[0])
recording_num = len(ratings[0][0])

['__header__', '__version__', '__globals__', 'ClipNumbers', 'Length', 'Ratings']


In [8]:
# Inspect the ratings array layout; the axes are indexed as
# [emotion][participant][recording] when the rows are built below.
mat[key].shape

(5, 58, 36)

In [9]:
# Rating names, paired positionally with the first axis of the ratings array
emotions = ["Arousal", "Valence", "Engagement", "Liking", "Familiarity"]
emo_dict = {emo: [] for emo in emotions}
# Build a separate dict (with its own lists) for every participant.
# Repeating `emo_dict` itself would put the very same object in every slot,
# so a write for one participant would show up in all of them.
self_reports_participants_data = [
    {emo: [] for emo in emotions} for _ in range(participants_num)
]

In [10]:
# One row per (participant, recording): the two indices followed by the
# rating for each emotion, read as ratings[emotion][participant][recording].
ratings = mat[key]
self_report_data = [
    [p, r] + [ratings[e][p][r] for e in range(emotions_num)]
    for p in range(participants_num)
    for r in range(recording_num)
]

column_names = ["participants", "recordings", *emotions]
self_reports_df = pd.DataFrame(self_report_data, columns=column_names)

# Personality Traits

In [11]:
# Load the personality file. This rebinds `mat` again, replacing the
# self-report data loaded earlier.
personality_path = '../ASCERTAIN_Features/Dt_Personality.mat'
mat = scipy.io.loadmat(personality_path)

In [12]:
# Show the variables in the personality file and select the fourth entry
# (the printed listing shows this is 'Personality').
key_names = list(mat)
print(key_names)
key = key_names[3]

# The personality array is indexed as [participant][trait]
personality_scores = mat[key]
participants_num = len(personality_scores)
personalities_num = len(personality_scores[0])

['__header__', '__version__', '__globals__', 'Personality']


In [13]:
# Trait names, paired positionally with the second axis of the personality array
traits = ["Extraversion", "Agreeableness", "Conscientiousness", "Emotional Stability","Openness"]
traits_dict = {trait: [] for trait in traits}
# Give every participant an independent dict with independent lists.
# Repeating `traits_dict` itself would make every slot the same object, so
# scores appended for one participant would be appended for all of them.
personality_participants_data = [
    {trait: [] for trait in traits} for _ in range(participants_num)
]

In [14]:
# Record each participant's trait scores as {trait: [score]}.
# The list is rebuilt with a fresh dict per participant, so the scores of one
# participant never end up in another participant's lists, and re-running the
# cell does not append the same scores a second time.
personality_participants_data = []
for par in range(participants_num):
    participant_traits = {trait: [] for trait in traits}
    for per in range(personalities_num):
        participant_traits[traits[per]].append(mat[key][par][per])
    personality_participants_data.append(participant_traits)

In [15]:
# One row per (participant, recording). The scores are stored per participant,
# so the same trait values are repeated on each of that participant's rows.
# `recording_num` is not set in the personality cells; the value assigned in
# the self-report section is reused here.
personality_scores = mat[key]
personality_data = [
    [p, r] + [personality_scores[p][per] for per in range(personalities_num)]
    for p in range(participants_num)
    for r in range(recording_num)
]

column_names = ["participants", "recordings", *traits]
personalities_df = pd.DataFrame(personality_data, columns=column_names)

### Preprocess Personalities

In [16]:
# Summarise the personality scores so sensible bin edges can be chosen.
# astype(float) converts every column, so the participant/recording indices
# are cast to float and appear in the summary as well.
personalities_df = personalities_df.astype(float)
personalities_df.describe()

Unnamed: 0,participants,recordings,Extraversion,Agreeableness,Conscientiousness,Emotional Stability,Openness
count,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0
mean,28.5,17.5,4.306897,5.091379,5.144828,4.137931,4.953448
std,16.744679,10.390783,1.069233,0.758875,0.768562,0.909903,0.642406
min,0.0,0.0,2.6,3.2,3.4,2.0,3.7
25%,14.0,8.75,3.2,4.5,4.6,3.5,4.6
50%,28.5,17.5,4.45,5.1,5.2,4.2,4.9
75%,43.0,26.25,5.2,5.6,5.7,4.9,5.3
max,57.0,35.0,6.2,6.7,6.9,5.7,6.6


In [17]:
# Bin personality scores into negative / neutral / positive.
# pd.cut uses right-closed intervals by default: (0, 3], (3, 5], (5, 7].
bins = [0, 3, 5, 7]
# Capitalised variant; the lowercase `labels` are what gets written to the *_bin columns
bin_labels = ['Negative', 'Neutral', 'Positive']
labels = ['negative', 'neutral', 'positive']

# Bin only the trait columns. The id columns are skipped rather than binned and
# dropped again, and existing *_bin columns are skipped so the cell can be
# re-run without trying to cut an already-binned (categorical) column.
id_columns = ["participants", "recordings"]
trait_columns = [
    col for col in personalities_df.columns
    if col not in id_columns and not col.endswith("_bin")
]
for col in trait_columns:
    personalities_df[f'{col}_bin'] = pd.cut(personalities_df[col], bins=bins, labels=labels)

In [18]:
# Combine ECG features, self reports and personality data side by side.
# concat(axis=1) aligns on the row index; the participant/recording id columns
# are kept only from ecg_df, so they are removed from the other two frames.
# errors="ignore" makes the drop a no-op when the columns are already gone,
# so the cell can be re-run without raising a KeyError.
self_reports_df = self_reports_df.drop(columns=["participants", "recordings"], errors="ignore")
personalities_df = personalities_df.drop(columns=["participants", "recordings"], errors="ignore")
all_data_df = pd.concat([ecg_df, self_reports_df, personalities_df], axis=1)

In [19]:
# Make sure the output folder exists (no error if it already does)
data_path = "../data"
os.makedirs(data_path, exist_ok=True)
# Write the combined, unscaled table without the row index
org_data_csv = os.path.join(data_path, "org_data.csv")
all_data_df.to_csv(org_data_csv, index=False)

In [20]:
# Check which personality columns remain now that the id columns have been dropped
personalities_df.columns

Index(['Extraversion', 'Agreeableness', 'Conscientiousness',
       'Emotional Stability', 'Openness', 'Extraversion_bin',
       'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin',
       'Openness_bin'],
      dtype='object')

### Preprocess Features

In [21]:
# Preprocess features
# 1) Keep only columns with at least one non-zero value and at least one non-NaN value.
#    (NaN != 0 evaluates to True, so it is the notna() test that removes all-NaN columns.)
preprocessed_features_df = all_data_df.loc[:, (all_data_df != 0).any(axis=0) & all_data_df.notna().any(axis=0)]
# 2) Threshold: half the number of remaining columns.
threshold = len(preprocessed_features_df.columns) // 2
#    dropna(thresh=k) KEEPS rows with at least k non-NaN values, i.e. rows that
#    are populated in fewer than half of the columns are removed.
preprocessed_features_df = preprocessed_features_df.dropna(thresh=threshold)
# 3) Re-apply the column filter, since removing rows can leave a column all-zero or all-NaN.
preprocessed_features_df = preprocessed_features_df.loc[:, (preprocessed_features_df != 0).any(axis=0) & preprocessed_features_df.notna().any(axis=0)]
# 4) Drop any rows that still contain a NaN - this is strict but could be relaxed later on.
preprocessed_features_df = preprocessed_features_df.dropna()
# 5) Scale the feature columns to [0, 1]; ids, personality scores and bins are left untouched.
scaler = MinMaxScaler()
non_scaled_features = ['participants', 'recordings', 'Extraversion', 'Agreeableness',
       'Conscientiousness', 'Emotional Stability', 'Openness',
       'Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin',
       'Emotional Stability_bin', 'Openness_bin']
scaled_features_df = preprocessed_features_df.drop(columns=non_scaled_features)
# fit_transform returns a bare array. Keep the original row labels on the new
# frame: the column assignments below align on the index, and after the
# dropna() calls those labels are no longer a contiguous 0..n-1 range. With a
# fresh default index the unscaled values would be attached to the wrong rows
# (or become NaN).
scaled_features_df = pd.DataFrame(
    scaler.fit_transform(scaled_features_df),
    columns=scaled_features_df.columns,
    index=scaled_features_df.index,
)
for f in non_scaled_features:
    scaled_features_df[f] = preprocessed_features_df[f]
# Save to csv
scaled_features_df.to_csv(os.path.join(data_path, "preprocessed_data.csv"), index=False)

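This removes all columns that are entirely NaN or entirely 0, drops any rows that still contain NaNs, and then scales the remaining feature values to the range 0 to 1 (the id, personality and personality-bin columns are left unscaled).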

This removes all psd features and some ibi features. 


This could be cleaned up by removing all rows with NaNs and then removing the columns that are all 0s. The step-by-step method above is kept because it allows us to pull out different parts of the process.
