In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# read the raw HR dataset; index_col=False tells pandas not to use any CSV column as the index
df = pd.read_csv(
    filepath_or_buffer="data/HR_data.csv",
    index_col=False,
)

In [36]:
# remove the redundant "Unnamed: 0" column.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a plain drop
# would raise KeyError on the second execution.
df = df.drop(columns="Unnamed: 0", errors="ignore")

# first few rows
df.head()

Unnamed: 0,HR_TD_Mean,HR_TD_Median,HR_TD_std,HR_TD_Min,HR_TD_Max,HR_TD_AUC,HR_TD_Kurtosis,HR_TD_Skew,HR_TD_Slope_min,HR_TD_Slope_max,...,upset,hostile,alert,ashamed,inspired,nervous,attentive,afraid,active,determined
0,78.663878,76.7,7.480043,67.25,92.48,23048.45,-1.091448,0.369955,-0.73,0.9775,...,1.0,1.0,2.0,1.0,2.0,2.0,3.0,1.0,2.0,2.0
1,76.540732,76.61,2.584756,69.82,82.33,23959.92,-0.245338,0.338732,-0.36,0.19,...,2.0,1.0,3.0,2.0,2.0,2.0,3.0,1.0,3.0,3.0
2,78.173563,77.92,2.681255,72.22,82.8,20324.605,-0.615922,-0.233047,-0.63,0.3575,...,1.0,1.0,2.0,1.0,3.0,2.0,3.0,2.0,3.0,3.0
3,83.073688,83.88,7.363598,69.42,96.12,24924.3,-0.86661,-0.046021,-0.465,0.65,...,1.0,1.0,2.0,1.0,3.0,2.0,3.0,2.0,3.0,3.0
4,72.28125,72.91,3.193762,64.95,79.98,23052.1,0.200401,-0.560948,-0.3725,0.3375,...,3.0,1.0,3.0,2.0,3.0,3.0,4.0,2.0,4.0,4.0


In [37]:
n_rows, n_cols = df.shape
print(f"number of rows: {n_rows}")
print(f"number of columns: {n_cols}")

# define continuous, categorical (nominal) and categorical (ordinal) feature columns.
# The categorical groups are selected by name instead of by position, so a changed
# column layout is reported immediately rather than silently mislabelling features.
nominal_cols = pd.Index(["Round", "Phase", "Cohort"])
ordinal_cols = pd.Index([
    "Individual", "Puzzler", "Frustrated",
    "upset", "hostile", "alert", "ashamed", "inspired",
    "nervous", "attentive", "afraid", "active", "determined",
])

# fail early if any expected categorical column is absent from the dataframe
categorical_cols = nominal_cols.union(ordinal_cols)
missing_cols = categorical_cols.difference(df.columns)
if len(missing_cols) > 0:
    raise KeyError(f"expected categorical columns not found: {list(missing_cols)}")

# every remaining column is treated as continuous; boolean masking preserves column order
continuous_cols = df.columns[~df.columns.isin(categorical_cols)]

print(f"continuous/numerical columns: {continuous_cols}")
print(f"nominal columns: {nominal_cols}")
print(f"ordinal columns: {ordinal_cols}")


number of rows: 312
number of columns: 67
continuous/numerical columns: Index(['HR_TD_Mean', 'HR_TD_Median', 'HR_TD_std', 'HR_TD_Min', 'HR_TD_Max',
       'HR_TD_AUC', 'HR_TD_Kurtosis', 'HR_TD_Skew', 'HR_TD_Slope_min',
       'HR_TD_Slope_max', 'HR_TD_Slope_mean', 'HR_TD_Slope', 'TEMP_TD_Mean',
       'TEMP_TD_Median', 'TEMP_TD_std', 'TEMP_TD_Min', 'TEMP_TD_Max',
       'TEMP_TD_AUC', 'TEMP_TD_Kurtosis', 'TEMP_TD_Skew', 'TEMP_TD_Slope_min',
       'TEMP_TD_Slope_max', 'TEMP_TD_Slope_mean', 'TEMP_TD_Slope',
       'EDA_TD_P_Mean', 'EDA_TD_P_Median', 'EDA_TD_P_std', 'EDA_TD_P_Min',
       'EDA_TD_P_Max', 'EDA_TD_P_AUC', 'EDA_TD_P_Kurtosis', 'EDA_TD_P_Skew',
       'EDA_TD_P_Slope_min', 'EDA_TD_P_Slope_max', 'EDA_TD_P_Slope_mean',
       'EDA_TD_P_Slope', 'EDA_TD_T_Mean', 'EDA_TD_T_Median', 'EDA_TD_T_std',
       'EDA_TD_T_Min', 'EDA_TD_T_Max', 'EDA_TD_T_AUC', 'EDA_TD_T_Kurtosis',
       'EDA_TD_T_Skew', 'EDA_TD_T_Slope_min', 'EDA_TD_T_Slope_max',
       'EDA_TD_T_Slope_mean', 'EDA_TD_T_S

### Standardize

In [38]:
# standardize continuous/numerical columns (per-column z-score)
continuous_values = df[continuous_cols]
mean = continuous_values.mean(axis=0)
# ddof=0 (population std) matches what np.std uses by default; stated explicitly here
# because pandas' own default is ddof=1
std = continuous_values.std(axis=0, ddof=0)

# a constant column has std == 0 and would become all-NaN (0/0) after division;
# dividing by 1 instead leaves it at 0 after centering. NaN stds are left untouched.
safe_std = std.where(std != 0, 1.0)

df[continuous_cols] = (continuous_values - mean) / safe_std


### Handle missing values

In [39]:
# count the missing entries in every column
column_nulls = df.isna().sum()

# keep only the columns that contain at least one missing entry
column_nulls.loc[column_nulls.ne(0)]

EDA_TD_P_RT     1
EDA_TD_P_ReT    1
inspired        2
attentive       1
afraid          1
active          1
determined      2
dtype: int64

In [40]:
# fill gaps in the continuous columns with each column's mean
continuous_part = df[continuous_cols]
df[continuous_cols] = continuous_part.fillna(value=continuous_part.mean())

# fill gaps in the ordinal categorical columns with each column's median
ordinal_part = df[ordinal_cols]
df[ordinal_cols] = ordinal_part.fillna(value=ordinal_part.median())

### Encode columns

In [41]:
# collect the names of all object-dtype (string valued) columns
object_typed = df.select_dtypes(include="object")
string_val_cols = object_typed.columns
print(string_val_cols)

Index(['Round', 'Phase', 'Cohort'], dtype='object')


In [42]:
# encode string valued columns as integers.
# Each column gets its own fitted encoder, stored in `label_encoders` keyed by column name,
# so the integer codes can later be mapped back to the original labels
# (a single encoder refit per column only remembers the classes of the last column).
label_encoder = LabelEncoder()  # keeps the name defined even when there is nothing to encode
label_encoders = {}

for col in string_val_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# show categorical columns after string encoding
df[df.columns[51:]].head()

Unnamed: 0,Round,Phase,Individual,Puzzler,Frustrated,Cohort,upset,hostile,alert,ashamed,inspired,nervous,attentive,afraid,active,determined
0,2,2,1,1,2.0,3,1.0,1.0,2.0,1.0,2.0,2.0,3.0,1.0,2.0,2.0
1,2,1,1,1,3.0,3,2.0,1.0,3.0,2.0,2.0,2.0,3.0,1.0,3.0,3.0
2,2,0,1,1,2.0,3,1.0,1.0,2.0,1.0,3.0,2.0,3.0,2.0,3.0,3.0
3,1,2,1,1,2.0,3,1.0,1.0,2.0,1.0,3.0,2.0,3.0,2.0,3.0,3.0
4,1,1,1,1,2.0,3,3.0,1.0,3.0,2.0,3.0,3.0,4.0,2.0,4.0,4.0


In [43]:
# expand every nominal categorical feature into one indicator column per category
nominal_col_list = df[nominal_cols].columns.tolist()

df = pd.get_dummies(data=df, columns=nominal_col_list)

# write the cleaned dataframe to disk without the row index
df.to_csv(path_or_buf="data/HR_data_cleaned.csv", index=False)