In [1]:
import os
import pandas as pd
import numpy as np

# 1. Preparation and Data Loading

### Defining the relevant column names and directories

In [5]:
p_ids = ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'P07', 'P08', 'P09', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15']
task_numbers = [1, 2, 3, 4]

In [2]:
directory = 'data/'

data_files = os.listdir(directory)
cols = ['timestamps', 'theta_absolute_1', 'theta_absolute_2', 'theta_absolute_3', 'theta_absolute_4',
            'alpha_absolute_1', 'alpha_absolute_2', 'alpha_absolute_3', 'alpha_absolute_4',
            'beta_absolute_1', 'beta_absolute_2', 'beta_absolute_3', 'beta_absolute_4',
            'blink','hsi_precision_1', 'hsi_precision_2', 'hsi_precision_3', 'hsi_precision_4']

## Split the filename to get participant ID, task number and programming or break condition

In [3]:
def extract_file_info(filename):
    split_name = filename.split('_')
    p_id = split_name[0]
    task_no = split_name[1]
    task = split_name[2].split('.')[0]

    return p_id, task_no, task

## Load and clean data

- Remove unnecessary columns
- Drop empty rows
- Filter out rows with a bad signal (hsi_precision > 2

In [None]:
def load_and_clean_file(path_to_file, filename):
    df = pd.read_csv(path_to_file + filename)
    # Remove unnecessary columns and drop empty rows
    df_clean = df[[col for col in cols]].dropna(thresh=2).reset_index(drop=True)
    # Group every 5 rows together to one row
    df_clean = df_clean.groupby(np.arange(len(df_clean)) // 5).agg({'timestamps':'mean',
                                                                'theta_absolute_1':'mean',
                                                                'theta_absolute_2':'mean',
                                                                'theta_absolute_3':'mean',
                                                                'theta_absolute_4':'mean',
                                                                'alpha_absolute_1': 'mean',
                                                                'alpha_absolute_2': 'mean',
                                                                'alpha_absolute_3': 'mean',
                                                                'alpha_absolute_4': 'mean',
                                                                'beta_absolute_1': 'mean',
                                                                'beta_absolute_2': 'mean',
                                                                'beta_absolute_3': 'mean',
                                                                'beta_absolute_4': 'mean',
                                                                'blink': 'mean',
                                                                'hsi_precision_1': 'mean',
                                                                'hsi_precision_2': 'mean',
                                                                'hsi_precision_3': 'mean',
                                                                'hsi_precision_4': 'mean'
                                                               })
    # remove rows with bad signal
    df_good_signal = df_clean[(df_clean.hsi_precision_1 < 3) & (df_clean.hsi_precision_2 < 3) & 
                          (df_clean.hsi_precision_3 < 3) & (df_clean.hsi_precision_4 < 3)]
    
    return df_good_signal

## Remove rows with a bad signal (hsi_precision > 2)

In [10]:
df_good_signal = df_clean[(df_clean.hsi_precision_1 < 3) & (df_clean.hsi_precision_2 < 3) & 
                          (df_clean.hsi_precision_3 < 3) & (df_clean.hsi_precision_4 < 3)]
df_good_signal.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink,hsi_precision_1,hsi_precision_2,hsi_precision_3,hsi_precision_4
0,1574063000.0,0.392669,0.341194,0.575521,0.64898,0.662823,0.426493,0.702633,0.323175,0.314401,0.269253,0.094549,0.525343,0.0,2.0,1.0,1.0,2.0
1,1574063000.0,0.392669,0.365021,0.620934,0.64898,0.662823,0.410632,0.698712,0.323175,0.314401,0.255872,0.102544,0.525343,0.0,2.0,1.0,1.0,2.0
2,1574063000.0,0.392669,0.405879,0.67547,0.64898,0.662823,0.384095,0.682535,0.323175,0.314401,0.250953,0.102111,0.525343,1.0,2.0,1.0,1.0,2.0
3,1574063000.0,0.392669,0.405879,0.738399,0.64898,0.662823,0.384095,0.661528,0.323175,0.314401,0.250953,0.092843,0.525343,0.0,2.0,1.0,1.0,2.0
8,1574063000.0,0.392669,0.405879,1.24192,0.64898,0.662823,0.384095,0.827166,0.323175,0.314401,0.250953,0.032241,0.525343,0.0,2.0,1.0,1.0,2.0


In [11]:
print(len(df_clean))
print(len(df_good_signal))

1187
1151


## Compute baseline

In [22]:
# Find timestamps for last minute of recording
begin_last_min = max(df_good_signal.timestamps) - 60
# find index of first instance within that minute
begin_idx = (df_good_signal.timestamps.values >= begin_last_min).argmax()
# Split dataframe
df_min2 = df_good_signal.iloc[begin_idx:].reset_index(drop=True)
df_min2.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink,hsi_precision_1,hsi_precision_2,hsi_precision_3,hsi_precision_4
0,1574063000.0,0.501053,0.885907,0.634912,0.974752,0.593194,0.797893,0.76897,0.731141,0.281432,0.259241,-0.041114,-0.020648,0.0,2.0,1.0,1.0,2.0
1,1574063000.0,0.501053,0.885907,0.663374,0.974752,0.593194,0.797893,0.746901,0.731141,0.281432,0.259241,-0.061932,-0.020648,0.0,2.0,1.0,1.0,2.0
2,1574063000.0,0.501053,0.885907,0.664307,0.974752,0.593194,0.797893,0.698971,0.731141,0.281432,0.259241,-0.081847,-0.020648,0.0,2.0,1.0,1.0,2.0
3,1574063000.0,0.501053,0.885907,0.631839,0.974752,0.593194,0.797893,0.658324,0.731141,0.281432,0.259241,-0.098102,-0.020648,0.0,2.0,1.0,1.0,2.0
4,1574063000.0,0.501053,0.885907,0.583162,0.974752,0.593194,0.797893,0.591038,0.731141,0.281432,0.259241,-0.110754,-0.020648,0.0,2.0,1.0,1.0,2.0


In [42]:
# blinkrate per sec
df_blink = df_min2[['timestamps', 'blink']]
df_blink['timestamps'] = df_blink['timestamps'].astype(int)
df_bps = df_blink.groupby(by="timestamps").aggregate('mean')
baseline_bps = np.mean(df_bps['blink'])
baseline_bps

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0.05464480874316941

In [46]:
# Baseline: Mean of 2nd minute

baseline_freq_list = [df_good_signal.theta_absolute_1.mean(), df_good_signal.theta_absolute_2.mean(),
                      df_good_signal.theta_absolute_3.mean(), df_good_signal.theta_absolute_4.mean(),
                      df_good_signal.alpha_absolute_1.mean(), df_good_signal.alpha_absolute_2.mean(),
                      df_good_signal.alpha_absolute_3.mean(), df_good_signal.alpha_absolute_4.mean(),
                      df_good_signal.beta_absolute_1.mean(), df_good_signal.beta_absolute_2.mean(),
                      df_good_signal.beta_absolute_3.mean(), df_good_signal.beta_absolute_4.mean()] 

baseline_df = pd.DataFrame([baseline_freq_list], columns=["theta_absolute_1", "theta_absolute_2", 
                                                     "theta_absolute_3", "theta_absolute_4",
                                                    "alpha_absolute_1", "alpha_absolute_2",
                                                    "alpha_absolute_3", "alpha_absolute_4",
                                                    "beta_absolute_1", "beta_absolute_2",
                                                    "beta_absolute_3", "beta_absolute_4"])
baseline_df['blink'] = baseline_bps
baseline_df


Unnamed: 0,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,blink
0,0.469575,0.50324,0.317156,0.428627,0.586599,0.601921,0.436369,0.521814,0.415052,0.198906,-0.052596,0.392385,0.054645


------
# Need to fix the part below
-----

In [36]:
df_blink = df[[cols[0], cols[-1]]].dropna().reset_index(drop=True)
df_blink.head()

Unnamed: 0,timestamps,blink
0,1574063000.0,0.0
1,1574063000.0,0.0
2,1574063000.0,1.0
3,1574063000.0,0.0
4,1574063000.0,0.0


In [55]:
df_theta = df[[cols[0], cols[1], cols[2], cols[3], cols[4]]].dropna().reset_index(drop=True)
df_theta['theta_median'] = df_theta.iloc[:, 1:].median(axis=1)
df_theta.head()

Unnamed: 0,timestamps,theta_absolute_1,theta_absolute_2,theta_absolute_3,theta_absolute_4,theta_median
0,1574063000.0,0.392669,0.341194,0.575521,0.64898,0.484095
1,1574063000.0,0.392669,0.365021,0.620934,0.64898,0.506802
2,1574063000.0,0.392669,0.405879,0.67547,0.64898,0.527429
3,1574063000.0,0.392669,0.405879,0.738399,0.64898,0.527429
4,1574063000.0,0.392669,0.405879,0.811682,0.64898,0.527429


In [56]:
df_alpha = df[[cols[0], cols[5], cols[6], cols[7], cols[8]]].dropna().reset_index(drop=True)
df_alpha['alpha_median'] = df_alpha.iloc[:, 1:].median(axis=1)
df_alpha.head()

Unnamed: 0,timestamps,alpha_absolute_1,alpha_absolute_2,alpha_absolute_3,alpha_absolute_4,alpha_median
0,1574063000.0,0.662823,0.426493,0.702633,0.323175,0.544658
1,1574063000.0,0.662823,0.410632,0.698712,0.323175,0.536728
2,1574063000.0,0.662823,0.384095,0.682535,0.323175,0.523459
3,1574063000.0,0.662823,0.384095,0.661528,0.323175,0.522811
4,1574063000.0,0.662823,0.384095,0.651961,0.323175,0.518028


In [57]:
df_beta = df[[cols[0], cols[9], cols[10], cols[11], cols[12]]].dropna().reset_index(drop=True)
df_beta['beta_median'] = df_beta.iloc[:, 1:].median(axis=1)
df_beta.head()

Unnamed: 0,timestamps,beta_absolute_1,beta_absolute_2,beta_absolute_3,beta_absolute_4,beta_median
0,1574063000.0,0.314401,0.269253,0.094549,0.525343,0.291827
1,1574063000.0,0.314401,0.255872,0.102544,0.525343,0.285137
2,1574063000.0,0.314401,0.250953,0.102111,0.525343,0.282677
3,1574063000.0,0.314401,0.250953,0.092843,0.525343,0.282677
4,1574063000.0,0.314401,0.250953,0.075691,0.525343,0.282677


In [68]:
medians_df = pd.concat([df_beta['beta_median'], df_alpha['alpha_median'], df_theta['theta_median']], axis=1)
medians_df.head()

Unnamed: 0,beta_median,alpha_median,theta_median
0,0.291827,0.544658,0.484095
1,0.285137,0.536728,0.506802
2,0.282677,0.523459,0.527429
3,0.282677,0.522811,0.527429
4,0.282677,0.518028,0.527429


In [71]:
medians_df['task_engagement'] = medians_df['beta_median']/(medians_df['alpha_median'] + medians_df['theta_median'])
medians_df['task_difficulty'] = medians_df['theta_median']/(medians_df['alpha_median'] + medians_df['beta_median'])
medians_df.head()

Unnamed: 0,beta_median,alpha_median,theta_median,task_engagement,task_difficulty
0,0.291827,0.544658,0.484095,0.283671,0.578725
1,0.285137,0.536728,0.506802,0.273243,0.616649
2,0.282677,0.523459,0.527429,0.268989,0.654269
3,0.282677,0.522811,0.527429,0.269154,0.654795
4,0.282677,0.518028,0.527429,0.270386,0.658706


In [31]:
if task == 'programming':
    duration = 6
elif task == 'break':
    duration = 2

In [30]:
def get_values_per_min(df, num_minutes): 
    return int(len(df)/num_minutes)

In [59]:
start_idx = 0
end_idx = get_values_per_min(df_beta, duration)

In [81]:
from pathlib import Path
from os import rename, rmdir
import re
from zipfile import ZipFile
import pandas as pd


# Works with Luka's naming scheme
def unzip():
    for filename in Path('data').rglob('*.zip'):
        parts = filename.parts
        with ZipFile(filename, 'r') as zipObj:
            zipObj.extractall('tmp')
        name = parts[-1]
        if re.search(r'.*break.*',name):
            name = re.sub(r'([0-9]).*dition ([0-9]).*(ID[0-9][0-9]?)\.zip', r'break\1_condition\2_\3.csv',name)
        else:
            name = re.sub(r'([0-9]).*dition ([0-9]).*(ID[0-9][0-9]?)\.zip', r'programming\1_condition\2_\3.csv',name)

        for museFile in Path('tmp').rglob('*'):
            p = museFile.parts
            rename(museFile,'data/' + name)
    rmdir('tmp')
    

# Replaces the 'XxX' placeholder with condition number e.g. focus music = 1, office = 4 ...
def addConditions(filename, participant_id):
    conditionOrder = re.search(r'.*([0-9])_.*',filename)
    conditionOrder = int(conditionOrder.group(1))
    order = getMusicOrderList()[int(participant_id) - 1] # -1 ==> the ordering of lists starts with 0
    return re.sub(r'XxX',str(order[conditionOrder - 1]),filename)
    
    

# Normalizing the naming scheme of files from Kathrin
def renameKathrin():
    for filename in Path('data').rglob('*.csv'):
        name = filename.parts[-1]
        if re.search(r'P.*',name):
            name = re.sub(r'^P([0-9]{2})_([0-9])_(.*)\.csv$', r'\3\2_conditionXxX_ID\1.csv',name)
            rename(filename,'data/'+name)
            
            # Normalize the ID numbering
            participant_id = re.search(r'.*ID([0-9][0-9]?)',name)
            participant_id = participant_id.group(1)
            participant_id = int(participant_id)            
            participant_id = str(participant_id)
            newName = re.sub(r'ID[0-9]{2}',"ID"+participant_id,name)
            
            newName = addConditions(newName, participant_id) # adds condition into the naming scheme
            rename('data/'+name,'data/' + newName)
            
    
            
# P08_1_break.csv
            
            
def getMusicOrderList():
    musicOrderList = []
    musicOrder_df = pd.read_csv('music_lookup.csv',sep=';')
    musicOrder_df.dropna(axis=0, thresh=3, inplace=True) # Tresh = 3 ==> rows/cols have 2 NA
    musicOrder_df.dropna(axis=1, thresh=3, inplace=True)

    for i in range(musicOrder_df.shape[0]):
        row = musicOrder_df.iloc[i]
        row = list(row)
        musicOrderList.append([int(condition_num) for condition_num in row[1:5]]) # '1:5' because we have 4 conditions
    return musicOrderList
        

