In [125]:
import pandas as pd
import os
import datetime
import pytz
import numpy as np
import pickle

In [126]:
# read the txt readme files

demo=pd.DataFrame()

numbers=[2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]

for i in numbers:
    filename = 'data\WESAD\S%i\S%i_readme.txt'%(i,i)

#filename = 'data\WESAD\S8\S8_readme.txt'

    with open(filename, 'r') as file:
        lines = file.readlines()

    subject_number = int(os.path.basename(filename).split('S')[1].split('_')[0])
    data = {'id': subject_number}
    current_header = None

    for line in lines:
        parts = line.strip().split(':')  # Split using ':' as delimiter
        if len(parts) == 1:  # If ':' is not found, split using '?' as delimiter
            parts = line.strip().split('?')
        if len(parts) > 1:
            header = parts[0].strip()  # Extract the column header
            value = ':'.join(parts[1:]).strip()  # Join remaining parts as value
            data[header] = value

    # Create a Pandas DataFrame
    user_df = pd.DataFrame([data])
    # before the next user, add this user_df to the total dataframe
    demo = pd.concat([demo, user_df])

demo = demo.drop(columns=['Stress condition / TSST interview part'])
demo

demo.to_pickle('data\demo.pkl')

In [127]:
# read the PANAS self questionnaires csv files

# PANAS scores can range from 10 to 50, with higher scores representing higher levels of positive or negative affect, respectively.

def calculate_panas_scores_from_dataframe(df):

    positive_headers = ["Interested", "Inspired", "Guilty", "Excited", "Irritable", "Alert", "Attentive", "Jittery", "Afraid", "Stressed"]
    negative_headers = ["Active", "Distressed", "Annoyed", "Strong", "Scared", "Hostile", "Proud", "Enthusiastic", "Ashamed", "Sad", "(Angry)", "(Irritated)"]

    df[positive_headers + negative_headers] = df[positive_headers + negative_headers].apply(pd.to_numeric, errors='coerce').fillna(0)

    pa_score = df[positive_headers].astype(int).sum(axis=1)
    na_score = df[negative_headers].astype(int).sum(axis=1)

    df['PA_Score'] = pa_score
    df['NA_Score'] = na_score

    return df

numbers=[2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]

panas=pd.DataFrame()

for user in numbers:
    filename = 'data\WESAD\S%i\S%i_quest.csv'%(i,i)
    #user_df=pd.DataFrame()

    lines_to_read = 5
    data = []

    header_mapping = {
    1: "Active",
    2: "Distressed",
    3: "Interested",
    4: "Inspired",
    5: "Annoyed",
    6: "Strong",
    7: "Guilty",
    8: "Scared",
    9: "Hostile",
    10: "Excited",
    11: "Proud",
    12: "Irritable",
    13: "Enthusiastic",
    14: "Ashamed",
    15: "Alert",
    16: "Nervous",
    17: "Determined",
    18: "Attentive",
    19: "Jittery",
    20: "Afraid",
    21: "Stressed",
    22: "Frustrated",
    23: "Happy",
    24: "Sad",
    25: "(Angry)",
    26: "(Irritated)"
}

    with open(filename, 'r') as csvfile:
        lines_read = 0
        for line in csvfile:
            if lines_read >= lines_to_read:
                break
            if line.strip().startswith('# PANAS'):
                values = line.strip().split(';')[1:]
                data.append(values)
                lines_read += 1

    user_df = pd.DataFrame(data, columns=[header_mapping[i] for i in range(1, len(data[0]) + 1)])
    user_df['id'] = user

    # before the next user, add this user_df to the total dataframe
    panas = pd.concat([panas, user_df])

panas = calculate_panas_scores_from_dataframe(panas)
panas = panas.drop(panas.iloc[:, 0:26],axis = 1)

panas
panas.to_pickle('data\panas.pkl')

In [128]:
# read the STAI self questionnaires csv files

# to calculate the total stress score simply sum per row

numbers=[2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]

stai=pd.DataFrame()

for user in numbers:
    filename = 'data\WESAD\S%i\S%i_quest.csv'%(user,user)

    lines_to_read = 5
    data = []

    with open(filename, 'r') as csvfile:
        lines_read = 0
        for line in csvfile:
            if lines_read >= lines_to_read:
                break
            if line.strip().startswith('# STAI'):
                values = line.strip().split(';')[1:]
                data.append(values)
                lines_read += 1

    user_df = pd.DataFrame(data)
    user_df = user_df.iloc[:, 0:6]
    user_df['id'] = user

    # before the next user, add this user_df to the total dataframe
    stai = pd.concat([stai, user_df])

for column in stai.columns:
    stai[column] = pd.to_numeric(stai[column], errors='coerce')


stai['stai_stress'] = stai.iloc[:, 0:6].sum(axis=1)
stai = stai.iloc[:, 6:]

mean_stai = stai['stai_stress'].mean()
std_stai = stai['stai_stress'].std()

def get_stai_category(score):
    if score < mean_stai-0.5*std_stai:
        return "Below average"
    if score > mean_stai+0.5*std_stai:
        return "Above average"
    return "Average"

stai['stai_stress_category'] = stai['stai_stress'].apply(lambda score: get_stai_category(score))

stai = stai.drop(columns=['stai_stress'])

stai

stai.to_pickle('data\stai.pkl')

In [129]:
def adds_datetime_col_acc(df):
    timestamp_utc = df.iloc[0, 0]
    datetime_utc = datetime.datetime.utcfromtimestamp(timestamp_utc)
    datetime_utc = datetime_utc.replace(tzinfo=pytz.utc)
    target_timezone = pytz.timezone('America/New_York')
    datetime_local = datetime_utc.astimezone(target_timezone)
    df['Date'] = datetime_local.strftime('%Y-%m-%d %H:%M:%S %Z')
    df = df.iloc[1:]
    starting_date = df.iloc[0,3]
    num_rows = df.shape[0]
    date_range = pd.date_range(start=starting_date, periods=num_rows, freq='S')
    df['date'] = pd.DataFrame({'date': date_range})
    df = df.drop(columns="Date")

    return df

def adds_datetime_col(df):
    timestamp_utc = df.iloc[0, 0]
    datetime_utc = datetime.datetime.utcfromtimestamp(timestamp_utc)
    datetime_utc = datetime_utc.replace(tzinfo=pytz.utc)
    target_timezone = pytz.timezone('America/New_York')
    datetime_local = datetime_utc.astimezone(target_timezone)
    df['Date'] = datetime_local.strftime('%Y-%m-%d %H:%M:%S %Z')
    df = df.iloc[1:]
    starting_date = df.iloc[0,1]
    num_rows = df.shape[0]
    date_range = pd.date_range(start=starting_date, periods=num_rows, freq='S')
    df['date'] = pd.DataFrame({'date': date_range})
    df = df.drop(columns="Date")

    return df

In [130]:
# read E4 data

numbers=[2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]

total_df = pd.DataFrame()

for user in numbers:
    e4_acc = pd.read_csv('data\WESAD\S%i\S%i_E4_Data\ACC.csv'%(user,user), header=None)
    e4_acc = adds_datetime_col_acc(e4_acc)

    e4_eda = pd.read_csv('data\WESAD\S%i\S%i_E4_Data\EDA.csv'%(user,user), header=None)
    e4_eda = e4_eda.rename(columns={e4_eda.columns[0]: 'EDA'})
    e4_eda = adds_datetime_col(e4_eda)
    e4_eda['id'] = user
    e4_eda =e4_eda.groupby(['id','date'])['EDA'].mean().reset_index()
    data = pd.merge(e4_acc, e4_eda, how='outer', on='date')

    e4_bvp = pd.read_csv('data\WESAD\S%i\S%i_E4_Data\BVP.csv'%(user,user), header=None)
    e4_bvp = e4_bvp.rename(columns={e4_bvp.columns[0]: 'BVP'})
    e4_bvp = adds_datetime_col(e4_bvp)
    data = pd.merge(data, e4_bvp, how='outer', on='date')

    e4_hr = pd.read_csv('data\WESAD\S%i\S%i_E4_Data\HR.csv'%(user,user), header=None)
    e4_hr = e4_hr.rename(columns={e4_hr.columns[0]: 'HR'})
    e4_hr = adds_datetime_col(e4_hr)
    data = pd.merge(data, e4_hr, how='outer', on='date')

    e4_temp = pd.read_csv('data\WESAD\S%i\S%i_E4_Data\TEMP.csv'%(user,user), header=None)
    e4_temp = e4_temp.rename(columns={e4_temp.columns[0]: 'TEMP'})
    e4_temp = adds_datetime_col(e4_temp)
    data = pd.merge(data, e4_temp, how='outer', on='date')

    data['id'] = user

    # before the next user, add this user_df to the total dataframe
    total_df = pd.concat([total_df, data])

total_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.DataFrame({'date': date_range})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.DataFrame({'date': date_range})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.DataFrame({'date': date_range})
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,0,1,2,date,id,EDA,BVP,HR,TEMP
0,32.0,32.0,32.0,2017-05-22 03:15:26,2,4.000000,64.00,,4.00
1,29.0,-6.0,55.0,2017-05-22 03:15:27,2,0.000000,-0.00,,382.18
2,28.0,-6.0,55.0,2017-05-22 03:15:28,2,0.328021,-0.00,,382.18
3,28.0,-6.0,55.0,2017-05-22 03:15:29,2,0.410026,-0.00,,382.18
4,28.0,-6.0,55.0,2017-05-22 03:15:30,2,0.433090,-0.00,,382.18
...,...,...,...,...,...,...,...,...,...
462821,,,,2017-08-16 11:53:03,17,,-53.77,,
462822,,,,2017-08-16 11:53:04,17,,-53.29,,
462823,,,,2017-08-16 11:53:05,17,,-52.71,,
462824,,,,2017-08-16 11:53:06,17,,-52.26,,


In [131]:
# Replace NaN values with column's median for continuous features
columns = [[0, 1, 2, 'EDA', 'BVP', 'HR', 'TEMP']]
numbers=[2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]

data = pd.DataFrame()

for user in numbers:
    specifc_user = total_df[total_df["id"] == user]
    for col in columns:
        specifc_user[col] = specifc_user[col].apply(pd.to_numeric, errors='coerce')
        specifc_user[col] = specifc_user[col].fillna(specifc_user[col].median())

        data = pd.concat([data, specifc_user])

total_df = data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specifc_user[col] = specifc_user[col].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specifc_user[col] = specifc_user[col].fillna(specifc_user[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specifc_user[col] = specifc_user[col].apply(pd.to_numeri

In [132]:
total_df = total_df[['id', 'date', 0, 1, 2, 'EDA', 'BVP', 'HR', 'TEMP']]
total_df = total_df.sort_values(by='date',ascending=True)
total_df['date'] = pd.to_datetime(total_df['date'])

total_df["date"] = pd.to_datetime(total_df["date"], infer_datetime_format=True)
total_df["hour"] = total_df["date"].dt.hour
total_df["date"] = pd.to_datetime(total_df["date"].dt.date, infer_datetime_format=True)

df = total_df.groupby(['id', 'date', 'hour']).mean()
df.reset_index(drop=False, inplace=True)

df['datetime'] = pd.to_datetime(df.date) + pd.to_timedelta(df.hour, unit='h')
df = df[['id', 'datetime', 'hour', 0, 1, 2, 'EDA', 'BVP', 'HR', 'TEMP']]
df

Unnamed: 0,id,datetime,hour,0,1,2,EDA,BVP,HR,TEMP
0,2,2017-05-22 03:00:00,3.0,33.194839,-20.779731,27.586761,0.979370,-2.051485,78.873504,35.861821
1,2,2017-05-22 04:00:00,4.0,41.321111,-30.715278,-0.680556,1.349422,2.302322,74.118131,35.411183
2,2,2017-05-22 05:00:00,5.0,42.177500,-44.416667,-1.042778,0.668645,-1.572761,72.132161,35.776811
3,2,2017-05-22 06:00:00,6.0,30.669167,-44.639167,13.186111,0.138875,0.819792,73.100000,35.577078
4,2,2017-05-22 07:00:00,7.0,30.708889,-29.134167,0.261667,0.583789,0.028806,73.100000,34.218989
...,...,...,...,...,...,...,...,...,...,...
1909,17,2017-08-16 07:00:00,7.0,-46.000000,-2.000000,11.000000,1.038622,0.110508,71.680000,32.830000
1910,17,2017-08-16 08:00:00,8.0,-46.000000,-2.000000,11.000000,1.038622,0.038244,71.680000,32.830000
1911,17,2017-08-16 09:00:00,9.0,-46.000000,-2.000000,11.000000,1.038622,0.009639,71.680000,32.830000
1912,17,2017-08-16 10:00:00,10.0,-46.000000,-2.000000,11.000000,1.038622,-0.300889,71.680000,32.830000


In [133]:
df.to_pickle('data\empatica.pkl')