In [1]:
import os
import pandas as pd
import numpy as np

# 1. Preparation and Data Loading

### Defining the relevant column names and directories

## Split the filename to get participant ID, task number and programming or break condition

In [76]:
def extract_file_info(filename):
    """Split a recording file name into its underscore-separated parts.

    The name is expected to look like ``<participant>_<task number>_<condition>.<ext>``,
    for example ``'P07_4_programming.csv'``.

    Returns:
        Tuple ``(p_id, task_no, task)`` of strings; ``task`` is the third part
        with everything from its first dot onwards removed.

    Raises:
        IndexError: if the name contains fewer than three underscore-separated parts.
    """
    pieces = filename.split('_')
    # Keep only what precedes the first dot of the third part (drops the extension).
    condition, _, _ = pieces[2].partition('.')
    return pieces[0], pieces[1], condition

## Load and clean data

- Remove unnecessary columns
- Drop empty rows
- Average every 5 consecutive rows into one row, then filter out averaged rows with a bad signal (mean hsi_precision of 3 or above on any sensor)

In [71]:
def load_and_clean_file(path_to_file, filename):
    """Load one recording CSV, average it in blocks of 5 rows and drop bad-signal blocks.

    Steps:
      1. Read ``filename`` from the folder ``path_to_file``.
      2. Keep the timestamp, band-power, blink and hsi_precision columns and drop
         rows that have fewer than two non-missing values among them.
      3. Average every 5 consecutive remaining rows into a single row.
      4. Keep only averaged rows whose mean hsi_precision is below 3 on all four sensors.

    Args:
        path_to_file: folder containing the file, with or without a trailing separator.
        filename: name of the CSV file inside that folder.

    Returns:
        A new DataFrame (safe to modify) with the columns ``timestamps``,
        ``theta/alpha/beta_absolute_1..4`` and ``blink``; the index holds the
        5-row block numbers that survived the signal filter.
    """
    # The column names are spelled out once here (instead of being read from a
    # notebook-level variable) so the function does not depend on cell execution order.
    band_cols = ['timestamps',
                 'theta_absolute_1', 'theta_absolute_2', 'theta_absolute_3', 'theta_absolute_4',
                 'alpha_absolute_1', 'alpha_absolute_2', 'alpha_absolute_3', 'alpha_absolute_4',
                 'beta_absolute_1', 'beta_absolute_2', 'beta_absolute_3', 'beta_absolute_4',
                 'blink']
    quality_cols = ['hsi_precision_1', 'hsi_precision_2', 'hsi_precision_3', 'hsi_precision_4']
    wanted_cols = band_cols + quality_cols

    # os.path.join copes with folders given with or without a trailing separator.
    df = pd.read_csv(os.path.join(path_to_file, filename))

    # Remove unnecessary columns and drop (almost) empty rows.
    df_clean = df[wanted_cols].dropna(thresh=2).reset_index(drop=True)

    # Group every 5 rows together to one row (rows 0-4 -> block 0, rows 5-9 -> block 1, ...).
    block_id = np.arange(len(df_clean)) // 5
    df_clean = df_clean.groupby(block_id).agg({column: 'mean' for column in wanted_cols})

    # Remove rows with bad signal: every sensor's averaged hsi_precision must be below 3.
    good_signal = (df_clean[quality_cols] < 3).all(axis=1)

    # Return an independent copy so later column assignments by callers do not
    # trigger pandas' SettingWithCopyWarning.
    return df_clean.loc[good_signal, band_cols].copy()

## Compute baseline

In [56]:
def filter_out_2nd_min(clean_df_break):
    """Return the tail of a recording that starts 60 timestamp units before its end.

    The cut-off is ``max(timestamps) - 60``. The result begins at the first row
    whose timestamp reaches that cut-off and contains every row after it,
    re-indexed from 0.
    """
    stamps = clean_df_break.timestamps
    cutoff = max(stamps) - 60
    # argmax on a boolean array gives the position of the first True.
    first_row = (stamps.values >= cutoff).argmax()
    return clean_df_break.iloc[first_row:].reset_index(drop=True)

In [58]:
def compute_blink_per_sec(clean_df):
    """Average the ``blink`` column over each whole second.

    Timestamps are truncated to integers and rows sharing the same second are
    averaged. The input frame is left untouched: the truncation happens on a
    derived frame, which also avoids pandas' SettingWithCopyWarning.

    Args:
        clean_df: frame with at least the columns ``timestamps`` and ``blink``.

    Returns:
        DataFrame indexed by the integer second (index name ``timestamps``)
        with a single column ``blink`` holding the per-second mean.
    """
    df_bps = (clean_df[['timestamps', 'blink']]
              .assign(timestamps=lambda frame: frame['timestamps'].astype(int))
              .groupby(by='timestamps')
              .aggregate('mean'))

    return df_bps

def compute_blink_baseline(df_bps):
    """Return the mean of the ``blink`` column of ``df_bps`` as a single number."""
    blink_rates = df_bps['blink']
    return np.mean(blink_rates)

In [59]:
def compute_eeg_baselines(clean_df, baseline_bps=None):
    """Build a one-row frame with the mean of every EEG band column plus a blink baseline.

    Args:
        clean_df: frame with the columns ``theta/alpha/beta_absolute_1..4``. When
            ``baseline_bps`` is not given it must also contain ``timestamps`` and
            ``blink``.
        baseline_bps: blink baseline to store in the ``blink`` column. When omitted,
            it is computed from ``clean_df`` as the mean of the per-second mean
            blink values (timestamps truncated to whole seconds).

    Returns:
        DataFrame with a single row (index 0) holding the twelve band means in the
        order theta 1-4, alpha 1-4, beta 1-4, followed by ``blink``.
    """
    eeg_cols = ['{}_absolute_{}'.format(band, sensor)
                for band in ('theta', 'alpha', 'beta')
                for sensor in range(1, 5)]

    baseline_df = pd.DataFrame([clean_df[eeg_cols].mean().tolist()], columns=eeg_cols)

    if baseline_bps is None:
        # The function used to read an undefined notebook-level name here; derive the
        # value from the data instead so it works on a fresh kernel.
        whole_seconds = clean_df['timestamps'].astype(int)
        baseline_bps = clean_df['blink'].groupby(whole_seconds).mean().mean()
    baseline_df['blink'] = baseline_bps

    return baseline_df


In [60]:
def load_music_map(filename):
    """Read the semicolon-separated music lookup file.

    Returns a DataFrame restricted to the columns ``ID`` and ``Music 1`` to
    ``Music 4``, in that order; any other column in the file is discarded.
    """
    wanted = ['ID'] + ['Music {}'.format(slot) for slot in range(1, 5)]
    return pd.read_csv(filename, sep=';')[wanted]

## Filenames, IDs, column names, etc.

In [61]:
# Participant identifiers 'P01' ... 'P15' (zero-padded to two digits).
p_ids = ['P{:02d}'.format(number) for number in range(1, 16)]
# Task numbers 1 to 4.
task_numbers = list(range(1, 5))
# Semicolon-separated file with the music condition lookup.
music_mapping_file = 'music_lookup.csv'

In [62]:
# Folder that holds the recording files, relative to the notebook's working directory.
directory = 'data/'

# NOTE(review): os.listdir returns entries in arbitrary order and includes every entry
# of the folder, so positional lookups such as data_files[2] may select a different
# file on another machine - consider sorting/filtering before indexing.
data_files = os.listdir(directory)
# Columns read from each recording: timestamp, the three band powers on sensors 1-4,
# the blink marker and the per-sensor hsi_precision values.
cols = (['timestamps']
        + ['{}_absolute_{}'.format(band, sensor)
           for band in ('theta', 'alpha', 'beta')
           for sensor in range(1, 5)]
        + ['blink']
        + ['hsi_precision_{}'.format(sensor) for sensor in range(1, 5)])

------
# Trial and error starts here
-----

In [144]:
# Read the music lookup table (columns 'ID' and 'Music 1' .. 'Music 4').
music_file = load_music_map(music_mapping_file)

In [80]:
# Parse the selected file name; `task` is the task number as a string.
p_id, task, condition = extract_file_info(data_files[2])
# Name of the matching break recording, e.g. 'P07_4_break.csv'.
base_file = '_'.join([p_id, task, 'break.csv'])

In [79]:
# Show which file the cells in this section operate on.
data_files[2]

'P07_4_programming.csv'

In [72]:
# Load the selected recording (cleaned and averaged in 5-row blocks) and preview it.
cleaned_data = load_and_clean_file(directory, data_files[2])
cleaned_data.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink
0,1574065000.0,1.35237,0.438221,0.632502,0.804571,1.06106,0.895723,0.710494,0.435289,0.779122,0.307681,-0.053585,0.663335,0.0
1,1574065000.0,1.35237,0.527081,0.753769,0.804571,1.06106,1.05203,0.747792,0.435289,0.779122,0.343166,-0.074242,0.663335,0.0
2,1574065000.0,1.35237,0.616186,0.790176,0.804571,1.06106,1.07557,0.747792,0.435289,0.779122,0.373486,-0.095826,0.663335,0.0
3,1574065000.0,1.35237,0.679825,0.828057,0.804571,1.06106,1.07424,0.747792,0.435289,0.779122,0.39545,-0.104782,0.663335,0.0
4,1574065000.0,1.35237,0.712005,0.879289,0.804571,1.06106,1.07424,0.747792,0.435289,0.779122,0.411453,-0.103389,0.663335,0.0


In [121]:
# get 1 sec averages
cleaned_data['timestamps'] = cleaned_data['timestamps'].astype(int)
grouped_df = cleaned_data.groupby(by='timestamps', as_index=False).aggregate('mean')
print(len(cleaned_data))
print(len(grouped_df))
grouped_df.head()

1687
189


Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink
0,1574064671,1.35237,0.615612,0.788532,0.804571,1.06106,1.041007,0.741576,0.435289,0.779122,0.375644,-0.089863,0.663335,0.0
1,1574064672,1.35237,0.747208,0.688172,0.804571,1.06106,1.114064,0.602192,0.435289,0.779122,0.438282,-0.143715,0.663335,0.0
2,1574064673,1.35237,0.87689,0.585336,0.804571,1.06106,1.128393,0.48727,0.435289,0.779122,0.401729,-0.030225,0.663335,0.1
3,1574064674,1.35237,0.84,0.614084,0.804571,1.06106,0.570269,0.534548,0.435289,0.779122,0.310518,0.195827,0.663335,0.1
4,1574064675,1.116601,0.616888,0.607448,0.812923,0.916793,0.714204,0.555805,0.592358,0.661254,0.385658,0.127086,0.699925,0.111111


In [102]:
# get baselines
baseline_df = load_and_clean_file(directory, base_file)
baseline_df = filter_out_2nd_min(baseline_df)
base_bps = compute_blink_per_sec(baseline_df)
base_bps_avg = compute_blink_baseline(base_bps)
eeg_base = compute_eeg_baselines(baseline_df)

  if (yield from self.run_code(code, result)):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [122]:
# normalize blink
grouped_df['blink'] = grouped_df['blink'] - base_bps_avg
# normalize eeg
for col in cols[1:13]:
    grouped_df[col] = grouped_df[col] - eeg_base[col].values[0]
grouped_df.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink
0,1574064671,1.18585,0.449735,0.239923,0.527409,0.701907,0.414628,0.029672,0.0762,0.337585,-0.266929,-0.105292,0.35835,-0.370271
1,1574064672,1.18585,0.581331,0.139563,0.527409,0.701907,0.487685,-0.109711,0.0762,0.337585,-0.204292,-0.159144,0.35835,-0.370271
2,1574064673,1.18585,0.711013,0.036726,0.527409,0.701907,0.502014,-0.224633,0.0762,0.337585,-0.240845,-0.045654,0.35835,-0.270271
3,1574064674,1.18585,0.674123,0.065475,0.527409,0.701907,-0.05611,-0.177355,0.0762,0.337585,-0.332056,0.180398,0.35835,-0.270271
4,1574064675,0.950081,0.451012,0.058839,0.53576,0.55764,0.087825,-0.156098,0.233269,0.219717,-0.256916,0.111658,0.39494,-0.25916


In [123]:
# get avg for 10 secs interval -> remove last digit and group by timestamp?
grouped_df['timestamps'] = grouped_df['timestamps'].astype(str).str[:-1].astype(np.int64)
df_10_secs = grouped_df.groupby(by='timestamps', as_index=False).aggregate('mean')
df_10_secs.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink
0,157406467,0.720681,0.431951,0.031632,0.292153,0.427466,0.194793,-0.193971,0.101822,0.138997,-0.249827,-0.063554,0.230811,-0.335703
1,157406468,0.26185,0.247292,-0.38027,0.136197,0.199157,-0.032955,-0.345765,0.19828,-0.058063,-0.134201,-0.035773,0.258922,-0.102175
2,157406469,0.367342,-0.68812,-0.274891,0.406059,0.437173,-0.597245,-0.341603,0.54472,0.153865,-0.895303,-0.106004,0.403867,-0.266104
3,157406470,0.053519,-0.976816,-0.452674,-0.094637,0.275062,-0.838066,-0.453224,0.069626,0.040091,-0.942971,0.073534,0.160761,-0.285548
4,157406471,0.132851,-0.560596,-0.199114,0.011863,0.107965,-0.40438,-0.384434,0.100682,0.125938,-0.634509,0.197419,0.244345,0.146793


In [128]:
# avg values of sensors and compute tdi and tei
theta = df_10_secs[['theta_absolute_1', 'theta_absolute_2', 'theta_absolute_3', 'theta_absolute_4']].mean(axis=1)
alpha = df_10_secs[['alpha_absolute_1', 'alpha_absolute_2', 'alpha_absolute_3', 'alpha_absolute_4']].mean(axis=1)
beta = df_10_secs[['beta_absolute_1', 'beta_absolute_2', 'beta_absolute_3', 'beta_absolute_4']].mean(axis=1)


In [132]:
# tei = beta / (alpha + theta), tdi = theta / (alpha + beta), computed per 10-second bin.
# NOTE(review): the band values are baseline-subtracted, so a denominator can be close
# to zero or negative, which produces very large or negative index values - confirm
# this is intended.
tei = beta.div(alpha.add(theta))
tdi = theta.div(alpha.add(beta))
tdi

0      2.517179
1      5.344000
2      0.473396
3      0.910488
4      0.950573
5      0.349503
6     20.849705
7      0.267186
8      0.496236
9      0.453147
10     0.369418
11     0.345805
12     0.339806
13     1.131392
14    -0.114389
15     0.265920
16     0.217990
17     0.349591
18     0.611575
19     0.664212
20     0.572514
21     0.015821
22     0.786971
23     0.009509
24     0.313500
dtype: float64

In [142]:
# Preview of one output row: participant, task number, then the TEI per interval.
[p_id, task, *tei.values.tolist()]

['P07',
 '4',
 0.028121452524259478,
 0.10882928141037088,
 3.026472272548312,
 0.2765935123576415,
 0.0558980210569833,
 0.1381358189192029,
 -0.2734551285656509,
 0.5798964095249148,
 0.47884548280379663,
 0.5221250336350135,
 0.5959069083695043,
 1.4128447626234617,
 2.303665866231585,
 0.4021822749735704,
 -5.2690362588939275,
 1.0681925401677879,
 1.563050591930084,
 5.934926875464973,
 0.9048769643437012,
 0.5766867930448845,
 0.7908011879777967,
 34.3930150658211,
 6.282431465954441,
 2.3235195204951276,
 0.9625434622504855]

In [143]:
# One row per recording: identifying columns followed by one TEI value per interval.
# The column list must have exactly as many names as the row has values, otherwise
# pandas refuses to build the frame.
# TODO: add a 'music_cond' column together with its value once the music condition is
# looked up for this participant/task; the row does not carry that value yet.
meta_cols = ['p_id', 'programming_task']
df_cols = meta_cols + ['intervall_' + str(i) for i in range(len(tei))]
tei_frame_prog = pd.DataFrame([[p_id, task] + tei.values.tolist()],
                              columns=df_cols)
tei_frame_prog

Unnamed: 0,p_id,programming_task,intervall_0,intervall_1,intervall_2,intervall_3,intervall_4,intervall_5,intervall_6,intervall_7,...,intervall_15,intervall_16,intervall_17,intervall_18,intervall_19,intervall_20,intervall_21,intervall_22,intervall_23,intervall_24
0,P07,4,0.028121,0.108829,3.026472,0.276594,0.055898,0.138136,-0.273455,0.579896,...,1.068193,1.563051,5.934927,0.904877,0.576687,0.790801,34.393015,6.282431,2.32352,0.962543


In [None]:
# TDI frame for this recording: participant, task number, then one TDI value per
# interval. The column names are built here so the cell does not depend on the
# column list of another cell.
tdi_cols = ['p_id', 'programming_task'] + ['intervall_' + str(i) for i in range(len(tdi))]
tdi_frame_prog = pd.DataFrame([[p_id, task] + tdi.values.tolist()],
                              columns=tdi_cols)
tdi_frame_prog

In [131]:
# Sanity check: the band series, the index series and the binned frame have equal length.
print(len(alpha), len(tdi), len(df_10_secs), sep='\n')

25
25
25


---
# Renaming of files after download from SwitchDrive
---

In [81]:
from pathlib import Path
from os import rename, rmdir
import re
from zipfile import ZipFile
import pandas as pd


# Works with Luka's naming scheme
def unzip(data_dir='data'):
    """Extract every ``*.zip`` below ``data_dir`` and store its content as a renamed CSV.

    The archive name is rewritten with the pattern
    ``<n>...dition <c>...ID<id>.zip`` -> ``break<n>_condition<c>_ID<id>.csv`` when the
    name contains ``break``, otherwise ``programming<n>_condition<c>_ID<id>.csv``.
    The extracted file is moved directly into ``data_dir`` under that name. If an
    archive holds several files, each one is moved to the same target, so the last
    one wins.

    Archives whose name does not fit the pattern are skipped with a warning;
    extracting them would otherwise write the content over a ``.zip`` name.

    Args:
        data_dir: folder that is searched recursively and receives the CSV files.
            Defaults to ``'data'``.
    """
    import shutil
    import tempfile
    import warnings

    data_dir = Path(data_dir)
    name_pattern = re.compile(r'([0-9]).*dition ([0-9]).*(ID[0-9][0-9]?)\.zip')

    # Collect the archives up front: files are added to the folder while we work.
    for archive in sorted(data_dir.rglob('*.zip')):
        kind = 'break' if 'break' in archive.name else 'programming'
        new_name, replaced = name_pattern.subn(
            r'{}\1_condition\2_\3.csv'.format(kind), archive.name)
        if not replaced:
            warnings.warn('Skipping {}: name does not match the expected scheme'.format(archive))
            continue

        # A fresh scratch folder per archive is removed automatically, even when it
        # still contains sub-folders or when no archive was found at all.
        with tempfile.TemporaryDirectory() as scratch:
            with ZipFile(archive, 'r') as zip_obj:
                zip_obj.extractall(scratch)
            for extracted in list(Path(scratch).rglob('*')):
                # rglob also yields folders; only regular files are recordings.
                if extracted.is_file():
                    shutil.move(str(extracted), str(data_dir / new_name))
    

# Replaces the 'XxX' placeholder with condition number e.g. focus music = 1, office = 4 ...
def addConditions(filename, participant_id):
    """Replace the ``XxX`` placeholder in ``filename`` with a music condition number.

    The task position is the last digit in ``filename`` that is directly followed by
    an underscore. The condition is read from the music order list: row
    ``participant_id - 1``, entry ``task position - 1`` (both lists are 0-based).

    Args:
        filename: name containing the ``XxX`` placeholder.
        participant_id: participant number (anything ``int()`` accepts).

    Returns:
        ``filename`` with every ``XxX`` replaced by the condition number.
    """
    task_match = re.search(r'.*([0-9])_.*', filename)
    task_position = int(task_match.group(1)) - 1
    participant_row = getMusicOrderList()[int(participant_id) - 1]
    return filename.replace('XxX', str(participant_row[task_position]))
    
    

# Normalizing the naming scheme of files from Kathrin
def renameKathrin(data_dir='data'):
    """Rename ``P<id>_<task>_<kind>.csv`` files to the common naming scheme.

    Every CSV below ``data_dir`` whose name matches ``P<two digits>_<digit>_<kind>.csv``
    is moved directly into ``data_dir`` as
    ``<kind><task>_condition<condition>_ID<id>.csv``, where ``<id>`` is the
    participant number without leading zero and ``<condition>`` is filled in by
    ``addConditions``. Example: ``P08_1_break.csv`` -> ``break1_condition<c>_ID8.csv``.

    Files that do not match the pattern are left untouched. A mere "contains a P"
    check would let them through and fail later when the participant ID cannot be
    found in the name.

    Args:
        data_dir: folder that is searched recursively and receives the renamed
            files. Defaults to ``'data'``.
    """
    data_dir = Path(data_dir)
    name_pattern = re.compile(r'^P([0-9]{2})_([0-9])_(.*)\.csv$')

    # Materialise the listing first: the folder content changes while renaming.
    for csv_path in list(data_dir.rglob('*.csv')):
        parsed = name_pattern.match(csv_path.name)
        if parsed is None:
            continue

        # Normalize the ID numbering ('08' -> '8').
        participant_id = str(int(parsed.group(1)))
        placeholder_name = '{kind}{task}_conditionXxX_ID{pid}.csv'.format(
            kind=parsed.group(3), task=parsed.group(2), pid=participant_id)

        # Adds the condition into the naming scheme, then rename in a single step.
        new_name = addConditions(placeholder_name, participant_id)
        rename(csv_path, data_dir / new_name)
            
    
            
# P08_1_break.csv
            
            
def getMusicOrderList():
    """Read ``music_lookup.csv`` and return the condition order of every participant.

    Rows, and then columns, with fewer than three non-missing values are removed
    (``thresh=3`` keeps entries that have at least three non-NA values).

    Returns:
        A list with one entry per remaining row; each entry is the list of the
        values in columns 1 to 4 (the four conditions) converted to ``int``.
    """
    lookup = pd.read_csv('music_lookup.csv', sep=';')
    lookup = lookup.dropna(axis=0, thresh=3)
    lookup = lookup.dropna(axis=1, thresh=3)

    # record[1:5] skips the first column and takes the four condition columns.
    return [[int(condition_num) for condition_num in record[1:5]]
            for record in lookup.itertuples(index=False, name=None)]
        

