In [1]:
import os
import pandas as pd
import numpy as np

# 1. Preparation and Data Loading

### Defining the relevant column names and directories

## Split the filename to get participant ID, task number and programming or break condition

In [2]:
def extract_file_info(filename):
    split_name = filename.split('_')
    p_id = split_name[0]
    task_no = split_name[1]
    task = split_name[2].split('.')[0]

    return p_id, task_no, task

## Load and clean data

- Remove unnecessary columns
- Drop empty rows
- Filter out rows with a bad signal (hsi_precision > 2)

In [3]:
def load_and_clean_file(path_to_file, filename):
    df = pd.read_csv(path_to_file + filename)
    # Remove unnecessary columns and drop empty rows
    df_clean = df[[col for col in cols]].dropna(thresh=2).reset_index(drop=True)
    # Group every 5 rows together to one row
    df_clean = df_clean.groupby(np.arange(len(df_clean)) // 5).agg({'timestamps':'mean',
                                                                'theta_absolute_1':'mean',
                                                                'theta_absolute_2':'mean',
                                                                'theta_absolute_3':'mean',
                                                                'theta_absolute_4':'mean',
                                                                'alpha_absolute_1': 'mean',
                                                                'alpha_absolute_2': 'mean',
                                                                'alpha_absolute_3': 'mean',
                                                                'alpha_absolute_4': 'mean',
                                                                'beta_absolute_1': 'mean',
                                                                'beta_absolute_2': 'mean',
                                                                'beta_absolute_3': 'mean',
                                                                'beta_absolute_4': 'mean',
                                                                'blink': 'mean',
                                                                'hsi_precision_1': 'mean',
                                                                'hsi_precision_2': 'mean',
                                                                'hsi_precision_3': 'mean',
                                                                'hsi_precision_4': 'mean'
                                                               })
    # remove rows with bad signal
    df_good_signal = df_clean[(df_clean.hsi_precision_1 < 3) & (df_clean.hsi_precision_2 < 3) & 
                          (df_clean.hsi_precision_3 < 3) & (df_clean.hsi_precision_4 < 3)]
    
    return df_good_signal[['timestamps', 'theta_absolute_1', 'theta_absolute_2', 'theta_absolute_3',
                           'theta_absolute_4', 'alpha_absolute_1', 'alpha_absolute_2', 'alpha_absolute_3', 
                           'alpha_absolute_4', 'beta_absolute_1', 'beta_absolute_2', 'beta_absolute_3',
                           'beta_absolute_4', 'blink']]

## Compute baseline

In [4]:
def filter_out_2nd_min(clean_df_break):
    
    # Find timestamps for last minute of recording
    begin_last_min = max(clean_df_break.timestamps) - 60
    # find index of first instance within that minute
    begin_idx = (clean_df_break.timestamps.values >= begin_last_min).argmax()
    # Split dataframe
    df_min2 = clean_df_break.iloc[begin_idx:].reset_index(drop=True)
    
    return df_min2

In [5]:
def compute_blink_per_sec(clean_df):
    # blinkrate per sec
    df_blink = clean_df[['timestamps', 'blink']]
    df_blink['timestamps'] = df_blink['timestamps'].astype(int)
    df_bps = df_blink.groupby(by="timestamps").aggregate('mean')
    
    return df_bps

def compute_blink_baseline(df_bps):
    return np.mean(df_bps['blink'])

In [22]:
def compute_eeg_baselines(clean_df):
    # Baseline: Mean of 2nd minute

    baseline_freq_list = [clean_df.theta_absolute_1.mean(), clean_df.theta_absolute_2.mean(),
                          clean_df.theta_absolute_3.mean(), clean_df.theta_absolute_4.mean(),
                          clean_df.alpha_absolute_1.mean(), clean_df.alpha_absolute_2.mean(),
                          clean_df.alpha_absolute_3.mean(), clean_df.alpha_absolute_4.mean(),
                          clean_df.beta_absolute_1.mean(), clean_df.beta_absolute_2.mean(),
                          clean_df.beta_absolute_3.mean(), clean_df.beta_absolute_4.mean()] 

    baseline_df = pd.DataFrame([baseline_freq_list], columns=["theta_absolute_1", "theta_absolute_2",
                                                              "theta_absolute_3", "theta_absolute_4",
                                                              "alpha_absolute_1", "alpha_absolute_2",
                                                              "alpha_absolute_3", "alpha_absolute_4",
                                                              "beta_absolute_1", "beta_absolute_2",
                                                              "beta_absolute_3", "beta_absolute_4"])
#     baseline_df['blink'] = blink_base
    
    return baseline_df


In [7]:
def compute_baseline(clean_df_break):
    min2_df = filter_out_2nd_min(clean_df_break)
    blinks = compute_blink_per_sec(min2_df)
    blinks = compute_blink_baseline(blinks)
    eegs = compute_eeg_baselines(min2_df)
    return blinks, eegs

In [8]:
def load_music_map(filename):
    music_conditions = pd.read_csv(filename, sep=';')
    music_conditions = music_conditions[['ID', 'Music 1', 'Music 2', 'Music 3', 'Music 4']]
    return music_conditions

In [16]:
def extract_music_condition(music_file, p_id, task):
    music_df = load_music_map(music_file)
    music_condition = music_df.loc[music_df["ID"] == p_id]
    return int(music_condition.iloc[:, int(task)])

In [10]:
def normalize_data(break_df, cleaned_df):
    # get 1 sec averages
    cleaned_df['timestamps'] = cleaned_df['timestamps'].astype(int)
    grouped_df = cleaned_df.groupby(by='timestamps', as_index=False).aggregate('mean')
    blink_base, eeg_base = compute_baseline(break_df)
    # normalize blink
    grouped_df['blink'] = grouped_df['blink'] - blink_base
    # normalize eeg
    for col in cols[1:13]:
        grouped_df[col] = grouped_df[col] - eeg_base[col].values[0]
    # get avg for 10 secs interval -> remove last digit and group by timestamp?
    grouped_df['timestamps'] = grouped_df['timestamps'].astype(str).str[:-1].astype(np.int64)
    df_10_secs = grouped_df.groupby(by='timestamps', as_index=False).aggregate('mean')
    
    return df_10_secs

In [11]:
def compute_tei_tdi(normalized_df):
    # avg values of sensors and compute tdi and tei
    theta = normalized_df[['theta_absolute_1', 'theta_absolute_2', 
                           'theta_absolute_3', 'theta_absolute_4']].mean(axis=1)
    alpha = normalized_df[['alpha_absolute_1', 'alpha_absolute_2', 
                           'alpha_absolute_3', 'alpha_absolute_4']].mean(axis=1)
    beta = normalized_df[['beta_absolute_1', 'beta_absolute_2', 
                          'beta_absolute_3', 'beta_absolute_4']].mean(axis=1)
    tei = beta / (alpha + theta)
    tdi = theta / (alpha + beta)
    
    return tei.values.tolist(), tdi.values.tolist()

In [52]:
def pad_results(max_len, results):
    padded_index = []
    for r in results:
        if len(r) < (max_len + 3):
            r = r + [np.nan] * (max_len - len(r) - 3)
        padded_index.append(r)
    return padded_index

## Filenames, IDs, Columnnames, etc

In [12]:
p_ids = ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'P07', 'P08', 'P09', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15']
task_numbers = [1, 2, 3, 4]
music_mapping_file = 'music_lookup.csv'

In [13]:
directory = 'data/'

data_files = os.listdir(directory)
cols = ['timestamps', 'theta_absolute_1', 'theta_absolute_2', 'theta_absolute_3', 'theta_absolute_4',
            'alpha_absolute_1', 'alpha_absolute_2', 'alpha_absolute_3', 'alpha_absolute_4',
            'beta_absolute_1', 'beta_absolute_2', 'beta_absolute_3', 'beta_absolute_4',
            'blink','hsi_precision_1', 'hsi_precision_2', 'hsi_precision_3', 'hsi_precision_4']

------
# Trial and error starts here
-----

In [29]:
tei_results = []
tdi_results = []
blink_results = []
max_tei_len = 0
min_tei_len = 999
for f in data_files:
    if f.startswith('P'):
        p_id, task, condition = extract_file_info(f)
        music_condition = extract_music_condition(music_mapping_file, p_id, task)
        if condition == 'programming':
            result_prefix = [p_id, task, music_condition]
            print('loading:')
            print('participant', p_id)
            print('programming task', task)
            print('music condition', music_condition)
            print(condition)
            print()
            cleaned_data = load_and_clean_file(directory, f)
            base_file = p_id + '_' + task + '_break.csv'
            print('loading baseline file', base_file)
            baseline_df = load_and_clean_file(directory, base_file)
            print('normalizing the data')
            print()
            print()
            normalized_df = normalize_data(baseline_df, cleaned_data)
            blink_rates = normalized_df['blink'].values.tolist()
            tei, tdi = compute_tei_tdi(normalized_df)
            if len(tei) > max_tei_len:
                max_tei_len = len(tei)
            if len(tei) < min_tei_len:
                min_tei_len = len(tei)
            blink_results.append(result_prefix + blink_rates)
            tei_results.append(result_prefix + tei)
            tdi_results.append(result_prefix + tdi)

P07_1_break.csv
P09_4_break.csv
P07_4_programming.csv
loading:
participant P07
programming task 4
music condition 4
programming

loading baseline file P07_4_break.csv
normalizing the data
.DS_Store
P07_3_programming.csv
loading:
participant P07
programming task 3
music condition 3
programming



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


loading baseline file P07_3_break.csv
normalizing the data
P09_2_break.csv
P01_2_programming.csv
loading:
participant P01
programming task 2
music condition 1
programming



  if (yield from self.run_code(code, result)):


loading baseline file P01_2_break.csv
normalizing the data
P07_2_programming.csv
loading:
participant P07
programming task 2
music condition 1
programming

loading baseline file P07_2_break.csv
normalizing the data
P01_1_break.csv
P09_1_programming.csv
loading:
participant P09
programming task 1
music condition 2
programming

loading baseline file P09_1_break.csv
normalizing the data
P01_3_programming.csv
loading:
participant P01
programming task 3
music condition 2
programming



  if (yield from self.run_code(code, result)):


loading baseline file P01_3_break.csv
normalizing the data
P08_1_break.csv
P08_1_programming.csv
loading:
participant P08
programming task 1
music condition 4
programming

loading baseline file P08_1_break.csv
normalizing the data
P09_3_break.csv
P01_4_programming.csv
loading:
participant P01
programming task 4
music condition 4
programming

loading baseline file P01_4_break.csv
normalizing the data
P09_3_programming.csv
loading:
participant P09
programming task 3
music condition 1
programming

loading baseline file P09_3_break.csv
normalizing the data
P01_2_break.csv
P07_3_break.csv
P08_2_break.csv
P01_1_programming.csv
loading:
participant P01
programming task 1
music condition 3
programming

loading baseline file P01_1_break.csv
normalizing the data
P08_4_programming.csv
loading:
participant P08
programming task 4
music condition 1
programming

loading baseline file P08_4_break.csv
normalizing the data
P08_3_programming.csv
loading:
participant P08
programming task 3
music condition

In [55]:
padded_tei = pad_results(max_tei_len, tei_results)
padded_tdi = pad_results(max_tei_len, tdi_results)
padded_blinks = pad_results(max_tei_len, blink_results)

In [56]:
df_cols = ['p_id', 'programming_task', 'music_cond'] + ['intervall_' + str(i) for i in range(max_tei_len)]
tei_df = pd.DataFrame(padded_tei, columns=df_cols)
tdi_df = pd.DataFrame(padded_tdi, columns=df_cols)
blink_df = pd.DataFrame(padded_blinks, columns=df_cols)

In [57]:
blink_df.to_csv('blink_results.csv', index=None)
tdi_df.to_csv('tdi_results.csv', index=None)
tei_df.bli.to_csv('blink_results.csv', index=None)

Unnamed: 0,p_id,programming_task,music_cond,intervall_0,intervall_1,intervall_2,intervall_3,intervall_4,intervall_5,intervall_6,...,intervall_28,intervall_29,intervall_30,intervall_31,intervall_32,intervall_33,intervall_34,intervall_35,intervall_36,intervall_37
0,P07,4,4,-0.335703,-0.102175,-0.266104,-0.285548,0.146793,-0.370271,-0.370271,...,,,,,,,,,,
1,P07,3,3,0.096995,0.089217,0.14255,0.220328,0.412735,0.257828,0.311016,...,-0.02745,0.000328,-0.058561,0.195883,0.215328,0.026995,0.215883,0.229217,0.024772,
2,P01,2,1,0.0,0.0,0.0,0.03,0.0,0.01,0.0,...,0.011111,0.01,0.0,0.052222,0.091111,0.011111,0.0,0.0,0.0,0.028571
3,P07,2,1,-0.244444,-0.205556,-0.135556,-0.185278,-0.217778,-0.235556,-0.247778,...,0.383056,0.368889,0.256667,0.207778,0.334444,0.296111,0.224444,0.207407,,
4,P09,1,2,0.013207,0.008762,0.009873,0.137651,0.05654,0.072096,0.042651,...,,,,,,,,,,
5,P01,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.02,0.01,0.021111,0.05,0.05,0.093333,0.0,0.0,0.0
6,P08,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.031111,0.022222,0.043333,0.0,0.0,0.0,0.0,0.0,
7,P01,4,4,0.016667,0.01,0.05,0.0975,0.053333,0.031111,0.01,...,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.032222,0.0,
8,P09,3,1,,,,,,,,...,,,,,,,,,,
9,P01,1,3,0.05,0.072222,0.0,0.0,0.1,0.0,0.05,...,0.072222,0.06,0.03,0.0,0.051111,0.101111,0.05,0.01,0.044444,0.074074


In [154]:
df_cols = ['p_id', 'programming_task', 'music_cond'] + ['intervall_' + str(i) for i in range(len(tei))]
tei_frame_prog = pd.DataFrame([[p_id, task, music_type] + tei.values.tolist()], 
                              columns=df_cols)
tei_frame_prog

Unnamed: 0,p_id,programming_task,music_cond,intervall_0,intervall_1,intervall_2,intervall_3,intervall_4,intervall_5,intervall_6,...,intervall_15,intervall_16,intervall_17,intervall_18,intervall_19,intervall_20,intervall_21,intervall_22,intervall_23,intervall_24
0,P07,4,4,0.028121,0.108829,3.026472,0.276594,0.055898,0.138136,-0.273455,...,1.068193,1.563051,5.934927,0.904877,0.576687,0.790801,34.393015,6.282431,2.32352,0.962543


In [155]:
tdi_frame_prog = pd.DataFrame([[p_id, task, music_type] + tei.values.tolist()], 
                              columns=df_cols)
tei_frame_prog

Unnamed: 0,p_id,programming_task,music_cond,intervall_0,intervall_1,intervall_2,intervall_3,intervall_4,intervall_5,intervall_6,...,intervall_15,intervall_16,intervall_17,intervall_18,intervall_19,intervall_20,intervall_21,intervall_22,intervall_23,intervall_24
0,P07,4,4,0.028121,0.108829,3.026472,0.276594,0.055898,0.138136,-0.273455,...,1.068193,1.563051,5.934927,0.904877,0.576687,0.790801,34.393015,6.282431,2.32352,0.962543


In [131]:
print(len(alpha))
print(len(tdi))
print(len(df_10_secs))

25
25
25


---
# Renaming of files after download from SwitchDrive
---

In [81]:
from pathlib import Path
from os import rename, rmdir
import re
from zipfile import ZipFile
import pandas as pd


# Works with Luka's naming scheme
def unzip():
    for filename in Path('data').rglob('*.zip'):
        parts = filename.parts
        with ZipFile(filename, 'r') as zipObj:
            zipObj.extractall('tmp')
        name = parts[-1]
        if re.search(r'.*break.*',name):
            name = re.sub(r'([0-9]).*dition ([0-9]).*(ID[0-9][0-9]?)\.zip', r'break\1_condition\2_\3.csv',name)
        else:
            name = re.sub(r'([0-9]).*dition ([0-9]).*(ID[0-9][0-9]?)\.zip', r'programming\1_condition\2_\3.csv',name)

        for museFile in Path('tmp').rglob('*'):
            p = museFile.parts
            rename(museFile,'data/' + name)
    rmdir('tmp')
    

# Replaces the 'XxX' placeholder with condition number e.g. focus music = 1, office = 4 ...
def addConditions(filename, participant_id):
    conditionOrder = re.search(r'.*([0-9])_.*',filename)
    conditionOrder = int(conditionOrder.group(1))
    order = getMusicOrderList()[int(participant_id) - 1] # -1 ==> the ordering of lists starts with 0
    return re.sub(r'XxX',str(order[conditionOrder - 1]),filename)
    
    

# Normalizing the naming scheme of files from Kathrin
def renameKathrin():
    for filename in Path('data').rglob('*.csv'):
        name = filename.parts[-1]
        if re.search(r'P.*',name):
            name = re.sub(r'^P([0-9]{2})_([0-9])_(.*)\.csv$', r'\3\2_conditionXxX_ID\1.csv',name)
            rename(filename,'data/'+name)
            
            # Normalize the ID numbering
            participant_id = re.search(r'.*ID([0-9][0-9]?)',name)
            participant_id = participant_id.group(1)
            participant_id = int(participant_id)            
            participant_id = str(participant_id)
            newName = re.sub(r'ID[0-9]{2}',"ID"+participant_id,name)
            
            newName = addConditions(newName, participant_id) # adds condition into the naming scheme
            rename('data/'+name,'data/' + newName)
            
    
            
# P08_1_break.csv
            
            
def getMusicOrderList():
    musicOrderList = []
    musicOrder_df = pd.read_csv('music_lookup.csv',sep=';')
    musicOrder_df.dropna(axis=0, thresh=3, inplace=True) # Tresh = 3 ==> rows/cols have 2 NA
    musicOrder_df.dropna(axis=1, thresh=3, inplace=True)

    for i in range(musicOrder_df.shape[0]):
        row = musicOrder_df.iloc[i]
        row = list(row)
        musicOrderList.append([int(condition_num) for condition_num in row[1:5]]) # '1:5' because we have 4 conditions
    return musicOrderList
        

