In [3]:
import json
import os
import pandas as pd
import numpy as np
from scipy.signal import iirnotch, butter, filtfilt
from mne.preprocessing import ICA, create_eog_epochs, create_ecg_epochs
from mne.time_frequency import tfr_array_morlet
import mne
from library import *


# Default matching window handed to merge_dfs (expressed in the same units as
# the 'timestamp' column it is compared against).
TOLERANCE = 2.0

def load_one():
    """Load the single JSON recording named by ``ONE_FILE``.

    Returns:
        dict: maps each stream key (``'_eeg'``, ``'_ppg'``, ``'_telem'``,
        ``'_events'``, ``'_trialData'``) to a one-element list holding that
        stream's payload from the file, plus ``'_pids'`` holding the text of
        ``ONE_FILE`` before its first underscore.

    Raises:
        KeyError: if the file contains a top-level key that is not one of the
            expected stream keys.
    """
    data = {'_eeg': [], '_ppg': [], '_telem': [], '_events': [], '_trialData': [], '_pids': []}
    stream_keys = [key for key in data if key != '_pids']

    # Context manager so the handle is closed even if parsing fails.
    with open(ONE_FILE, encoding='utf-8') as fh:
        record = json.load(fh)

    for key, value in record.items():
        data[key].append(value)

    # A stream absent from the file gets an empty payload so every list still
    # has exactly one entry and downstream indexing by file position works.
    for key in stream_keys:
        if key not in record:
            data[key].append([])

    data['_pids'].append(ONE_FILE.split('_')[0])
    return data

def load_pfiles():
    """Load every regular file directly inside ``DATADIR`` as a JSON recording.

    Files are processed in sorted path order so results are reproducible
    (``os.listdir`` makes no ordering guarantee).

    Returns:
        dict: maps each stream key (``'_eeg'``, ``'_ppg'``, ``'_telem'``,
        ``'_events'``, ``'_trialData'``) to a list with one payload per file,
        plus ``'_pids'`` with one participant id per file. All lists are the
        same length and aligned by position.

    Raises:
        KeyError: if a file contains a top-level key that is not one of the
            expected stream keys.
    """
    data = {'_eeg': [], '_ppg': [], '_telem': [], '_events': [], '_trialData': [], '_pids': []}
    stream_keys = [key for key in data if key != '_pids']

    candidates = (os.path.join(DATADIR, name) for name in os.listdir(DATADIR))
    paths = sorted(path for path in candidates if os.path.isfile(path))

    for path in paths:
        # Context manager so each handle is closed promptly.
        with open(path, encoding='utf-8') as fh:
            record = json.load(fh)

        for key, value in record.items():
            data[key].append(value)

        # Pad streams the file does not provide; otherwise the per-stream lists
        # would fall out of step with '_pids' for every later file.
        for key in stream_keys:
            if key not in record:
                data[key].append([])

        # The participant id is the file-name prefix before the first '_'.
        pid = os.path.basename(path).split('_')[0]
        # A leading '0' is replaced by '1' (kept from the original logic;
        # NOTE(review): presumably to survive numeric parsing -- confirm).
        if pid.startswith('0'):
            pid = '1' + pid[1:]

        data['_pids'].append(pid)

    return data


def create_ppg_df(eeg_data):
    """Collect a list of long-format PPG readings into a single DataFrame.

    Args:
        eeg_data: iterable of dicts, one per reading, whose keys are drawn from
            ``'ppgChannel'``, ``'index'``, ``'samples'`` and ``'timestamp'``.
            (The parameter name is historical; the payload is PPG data.)

    Returns:
        pandas.DataFrame with one row per reading and those four columns.

    Raises:
        KeyError: if a reading carries a key outside the four expected ones.
        Exception: if no reading supplied an ``'index'`` value.
    """
    PPGRAW = {'ppgChannel': [], 'index': [], 'samples': [], 'timestamp': []}
    for reading in eeg_data:
        for key, value in reading.items():
            PPGRAW[key].append(value)

    if not PPGRAW['index']:
        # Message previously said "EEG", which misreported the failing stream.
        raise Exception('No PPG data found in this file')

    return pd.DataFrame.from_dict(PPGRAW)

def create_eeg_df(eeg_data):
    """Gather long-format EEG readings into one DataFrame.

    Args:
        eeg_data: iterable of dicts, one per reading, whose keys are drawn from
            ``'electrode'``, ``'index'``, ``'samples'`` and ``'timestamp'``.

    Returns:
        pandas.DataFrame with one row per reading and those four columns.

    Raises:
        KeyError: if a reading carries a key outside the four expected ones.
        Exception: if no reading supplied an ``'index'`` value.
    """
    columns = {name: [] for name in ('electrode', 'index', 'samples', 'timestamp')}

    # Append each field of each reading to its column, in arrival order.
    for reading in eeg_data:
        for field in reading:
            columns[field].append(reading[field])

    if not columns['index']:
        raise Exception('No EEG data found in this file')

    return pd.DataFrame.from_dict(columns)

def create_event_df(event_data):
    """Build a two-column (value, timestamp) DataFrame of marker events.

    Entries lacking either a ``'value'`` or a ``'timestamp'`` key are skipped,
    as are entries whose value contains ``'mouseover'`` or ``'mouseout'``.

    Args:
        event_data: iterable of dict-like event records.

    Returns:
        pandas.DataFrame with columns ``'value'`` and ``'timestamp'``; the
        timestamp column is cast to float.

    Raises:
        Exception: if no event survives the filtering.
    """
    kept = [
        (ev['value'], ev['timestamp'])
        for ev in event_data
        if 'value' in ev
        and 'timestamp' in ev
        and 'mouseover' not in ev['value']
        and 'mouseout' not in ev['value']
    ]

    frame = pd.DataFrame.from_dict({
        'value': [label for label, _ in kept],
        'timestamp': [stamp for _, stamp in kept],
    })
    frame['timestamp'] = frame['timestamp'].astype(float)

    if len(frame) < 1:
        raise Exception('No events found in this file')

    return frame



def widen_ppg_data(PPG_LONG, EEG_LONG, sampling_rate=64):
    """Reshape long-format PPG readings into one row per sample.

    Each reading's ``'samples'`` list is exploded so every sample gets its own
    row, with one ``ppgChan-<channel>`` column per PPG channel, and a
    synthetic, evenly spaced ``'timestamp'`` column is attached.

    Args:
        PPG_LONG: DataFrame with ``'index'``, ``'ppgChannel'`` and ``'samples'``
            columns (as produced by ``create_ppg_df``).
        EEG_LONG: DataFrame whose row labelled 0 supplies the starting
            ``'timestamp'``; the PPG stream is assumed to start at the same
            moment as the EEG stream.
        sampling_rate: PPG sampling rate in Hz used to space the timestamps.
            Defaults to 64. Previously the spacing was taken from the global
            ``SAMPLING_RATE`` even though a 64 Hz constant was declared here,
            which stamped PPG samples at the wrong rate.

    Returns:
        pandas.DataFrame with the ``ppgChan-*`` columns plus ``'timestamp'``.
    """
    start_time = EEG_LONG.loc[0, 'timestamp']  # assume EEG and PPG start together
    # Spacing between consecutive samples: 1000 / Hz
    # (milliseconds, assuming timestamps are in ms -- TODO confirm).
    reading_offset = 1000 / sampling_rate

    pivoted_df = PPG_LONG.pivot(index='index', columns='ppgChannel', values='samples')
    pivoted_df.columns = [f'ppgChan-{col}' for col in pivoted_df.columns]
    # Explode every channel's per-reading sample list into individual rows.
    wide_df = pivoted_df.apply(lambda x: x.explode()).reset_index().drop(columns=['index'])

    # Add the timestamp; data is coming in at `sampling_rate` Hz (64 by default).
    wide_df['timestamp'] = [start_time + reading_offset * i for i in range(len(wide_df))]
    return wide_df

def widen_eeg_data(EEG_LONG):
    """Reshape long-format EEG readings into one row per sample.

    Each reading's ``'samples'`` list is exploded so every sample gets its own
    row, with one ``probe-<electrode>`` column per electrode. The widened frame
    is printed, then a synthetic, evenly spaced ``'timestamp'`` column is
    attached, starting at the timestamp of the row labelled 0 and stepping by
    ``1000 / SAMPLING_RATE``.

    Args:
        EEG_LONG: DataFrame with ``'index'``, ``'electrode'``, ``'samples'`` and
            ``'timestamp'`` columns (as produced by ``create_eeg_df``).

    Returns:
        pandas.DataFrame with the ``probe-*`` columns plus ``'timestamp'``.
    """
    first_timestamp = EEG_LONG.loc[0, 'timestamp']
    step = 1000 / SAMPLING_RATE

    by_electrode = EEG_LONG.pivot(index='index', columns='electrode', values='samples')
    by_electrode.columns = [f'probe-{col}' for col in by_electrode.columns]

    # One row per individual sample; the packet index is no longer needed.
    exploded = by_electrode.apply(pd.Series.explode)
    wide_df = exploded.reset_index(drop=True)
    print(wide_df)

    # Stamp each sample at a fixed spacing from the first reading's timestamp.
    row_count = len(wide_df)
    wide_df['timestamp'] = [first_timestamp + step * k for k in range(row_count)]
    return wide_df

def merge_dfs(EEG_WIDE, EVENT_DF, tolerance=TOLERANCE):
    """Attach event markers to the sample rows nearest to them in time.

    Args:
        EEG_WIDE: per-sample DataFrame with a ``'timestamp'`` column, sorted by
            that column.
        EVENT_DF: DataFrame with ``'value'`` and ``'timestamp'`` columns.
        tolerance: maximum timestamp distance allowed for a sample/event match
            (same units as the ``'timestamp'`` columns).

    Returns:
        pandas.DataFrame: ``EEG_WIDE`` plus a string ``'value'`` column. Only
        the first row matched to each distinct event value keeps its label;
        every other row holds ``'nan'``.

    Raises:
        Exception: if the number of distinct ``puzzle_loaded`` /
            ``puzzle_finished`` events in ``EVENT_DF`` differs from the number
            that ended up labelled in the merged frame (e.g. the tolerance was
            too tight for some event to match any sample).
    """
    # merge_asof requires the right-hand frame to be sorted on the key; without
    # this an out-of-order event log makes the merge raise. A stable sort
    # leaves an already-ordered log untouched.
    events_sorted = EVENT_DF.sort_values('timestamp', kind='stable')

    # merge the events with the data
    df = pd.merge_asof(EEG_WIDE, events_sorted, on="timestamp", direction="nearest", tolerance=tolerance)

    # Only keep the first encountered marker of each type: drop_duplicates
    # returns the first occurrences, and index alignment on assignment blanks
    # the remaining rows.
    df['value'] = df['value'].drop_duplicates()
    df['value'] = df['value'].astype(str)

    def _is_boundary(label):
        # True for puzzle start/stop markers.
        return 'puzzle_finished' in label or 'puzzle_loaded' in label

    # Ground truth count of start/stop events vs. how many survived the merge --
    # a mismatch signals that the merge tolerance was too low.
    events_gt = sum(1 for e in EVENT_DF['value'].unique() if _is_boundary(e))
    events_eeg = sum(1 for e in df['value'] if _is_boundary(e))
    print(f'Found {events_gt} start/stop events, {events_eeg} start/stop eeg events [THESE SHOULD BE THE SAME]')
    if events_gt != events_eeg:
        print(EVENT_DF['value'].unique())
        raise Exception(f'Mismatch in number of start/stop events: {events_gt} vs {events_eeg}')

    return df


# ---------------------------------------------------------------------------
# Driver: load the recording(s), build one merged sample+event frame per file,
# concatenate them and write the result to DBFNAME as CSV.
# ONE_FILE and DBFNAME are not defined in this cell (presumably provided by
# `from library import *` -- verify).
# ---------------------------------------------------------------------------
DATA = load_one() if ONE_FILE else load_pfiles()

num_files = len(DATA['_eeg'])

n_passed = 0
n_failed = 0
MERGED_DFS = []
for i in range(num_files):
    print(i)
    try:
        EVENT_DF = create_event_df(DATA['_events'][i])

        EEG_LONG = create_eeg_df(DATA['_eeg'][i])
        EEG_WIDE = widen_eeg_data(EEG_LONG)
        MERGED_DF = merge_dfs(EEG_WIDE, EVENT_DF)

        # NOTE: to process the PPG stream instead, swap in:
        #   PPG_LONG = create_ppg_df(DATA['_ppg'][i])
        #   PPG_WIDE = widen_ppg_data(PPG_LONG, EEG_LONG)
        #   MERGED_DF = merge_dfs(PPG_WIDE, EVENT_DF, tolerance=TOLERANCE*100)

    except Exception as e:
        # Best-effort batch: a file that cannot be processed is reported and
        # skipped so the remaining files are still merged.
        print(f"File {i} (pid {DATA['_pids'][i]}) failed: {e}")
        n_failed += 1
        continue

    MERGED_DF['pid'] = DATA['_pids'][i]
    print(MERGED_DF)

    MERGED_DFS.append(MERGED_DF)
    n_passed += 1

# Report the tallies that were previously collected but never shown.
print(f'{n_passed} file(s) merged, {n_failed} file(s) failed')

if not MERGED_DFS:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(f'No file could be merged ({n_failed} of {num_files} failed); nothing to write')

MERGED_DF = pd.concat(MERGED_DFS)
MERGED_DF.to_csv(DBFNAME, index=False)


0
          probe-0     probe-1     probe-2    probe-3
0       -0.488281           0   -0.488281          0
1               0   -2.441406    1.464844  -0.488281
2        1.464844    7.324219    19.53125          0
3        -3.90625  -27.832031  -94.238281   0.976562
4       20.996094   49.804688  443.359375   0.976562
...           ...         ...         ...        ...
116923 -26.367188 -518.066406      -312.5 -42.480469
116924 -30.273438   94.726562  -38.574219 -17.578125
116925 -28.320312  290.527344   57.617188   -3.90625
116926 -27.832031 -112.304688 -168.945312 -25.390625
116927 -38.085938  -11.230469  -80.566406 -54.199219

[116928 rows x 4 columns]
Found 60 start/stop events, 60 start/stop eeg events [THESE SHOULD BE THE SAME]
          probe-0     probe-1     probe-2    probe-3     timestamp value  \
0       -0.488281           0   -0.488281          0  1.683055e+12   nan   
1               0   -2.441406    1.464844  -0.488281  1.683055e+12   nan   
2        1.464844    7.3242