In [1]:
import sys, os, glob, re
import threading
import pypillometry as pp
import pandas as pd
import numpy as np
import pylab as plt
import scipy.io
import scipy.signal
from scipy.interpolate import interp1d
import math
import bisect
import numexpr as ne

In [2]:
## define functions to load in data
def get_txt_files(directory):
    """Switch the working directory to ``directory`` and list its ``*.txt`` files.

    Parameters
    ----------
    directory : str
        Folder to search (not recursive).

    Returns
    -------
    list of str
        File names relative to ``directory``, in the order ``glob`` reports them.

    Notes
    -----
    The process working directory is changed to ``directory`` as a side effect.
    """
    os.chdir(directory)
    return list(glob.glob("*.txt"))

def load_pupil_data_from_disk(path_to_file):
    """Parse a tab-separated eye-tracker export into a DataFrame with block/run labels.

    Lines starting with '5' are treated as the header, lines starting with '10'
    as data rows; all other lines are ignored. The first field of each kept
    line (the '5' / '10' tag) is dropped.

    Parameters
    ----------
    path_to_file : str
        Path of the text file. When the file contains run markers ('R'), the
        path must contain ``run<N>``; ``N - 1`` is added to the run counter so
        that recordings split over several files keep consecutive run numbers.

    Returns
    -------
    pandas.DataFrame
        One row per data line with the header's columns plus
        ``Marker_Count`` (running count per marker value), ``Block`` and
        ``Run`` (NaN outside any block / run).

    Raises
    ------
    ValueError
        If the file has no header line, or if it contains run markers but the
        path does not contain ``run<N>``.
    """
    columnNames = None
    data = []
    with open(path_to_file, 'r') as file:
        for current_line in file:
            if current_line[:1] == '5':     # a leading '5' marks the header line
                columnNames = current_line.strip().split('\t')
                if len(columnNames) == 18:
                    columnNames.append('String')
            elif current_line[:2] == '10':  # a leading '10' marks a data line
                # split into fields and drop the empty ones
                data_list = [i for i in current_line.split('\t') if i]

                # pad a short row with a missing value (NaN, not a list object)
                if len(data_list[1:]) < 18:
                    data_list.append(np.nan)
                data.append(data_list[1:])

    if columnNames is None:
        raise ValueError("no header line (starting with '5') found in " + str(path_to_file))

    df = pd.DataFrame(data, columns=columnNames[1:])
    # replace new line marker with nans
    df['Marker'] = df['Marker'].replace('\n', np.nan)

    # replace 'None' with nans
    df = df.replace('None', np.nan)

    # count occurrences of each marker value
    df['Marker_Count'] = df.groupby('Marker').cumcount() + 1

    ## add block and run information to df
    block_start_list = df.loc[df['Marker'] == 'B'].index.tolist()
    block_end_list = df.loc[df['Marker'] == 'E'].index.tolist()
    run_start_list = df.loc[df['Marker'] == 'R'].index.tolist()
    # create empty columns
    df['Block'] = np.nan
    df['Run'] = np.nan

    # go through blocks: the idx-th 'B' is paired with the idx-th 'E';
    # a block without a matching 'E' extends to the last row
    for idx, block_start in enumerate(block_start_list):
        block_end = block_end_list[idx] if idx < len(block_end_list) else df.index[-1]
        df.loc[block_start:block_end, 'Block'] = df.loc[block_start, 'Marker_Count']

    # go through runs
    run_offset = 0
    if run_start_list:
        match = re.search(r'run(\d+)', path_to_file)
        if match is None:
            raise ValueError("cannot derive the run number: no 'run<N>' in " + str(path_to_file))
        run_offset = int(match.group(1)) - 1

    for idx, run_start in enumerate(run_start_list):
        if idx < len(run_start_list) - 1:
            # last block end located at or before the start of the next run
            pos_end = bisect.bisect_right(block_end_list, run_start_list[idx + 1])
            run_end = block_end_list[pos_end - 1] if pos_end else None
        else:
            # final run: up to the last block end (None slices to the last row)
            run_end = block_end_list[-1] if block_end_list else None

        df.loc[run_start:run_end, 'Run'] = df.loc[run_start, 'Marker_Count'] + run_offset

    return df




def create_df_prepross(df, sub_name, out_dir='/Users/scanlab/Documents/internship_luca/Pupil_data_Maastricht/pupilFieldtrip/blocks/'):
    """Write one csv per run containing time (ms) and pupil area.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'TotalTime', 'PupilHeight', 'PupilWidth', 'Block',
        'Run', 'Marker' and 'Marker_Count'. It is modified in place:
        'PupilHeight' / 'PupilWidth' are cast to float and 'PupilArea' is added.
    sub_name : str
        Subject label; used in the printed summary, in the file name
        (``<sub_name>-<run>.csv``) and in the hard-coded skip rules below.
    out_dir : str, optional
        Folder the csv files are written to. Defaults to the original
        analysis location.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no row of ``df`` carries a run label.
    """
    df['PupilHeight'] = df['PupilHeight'].astype(float)
    df['PupilWidth'] = df['PupilWidth'].astype(float)
    # area of an ellipse whose axes are the pupil width and height
    df['PupilArea'] = math.pi * (df['PupilWidth']/2) * (df['PupilHeight']/2)

    new_df = df[['TotalTime', 'PupilArea', 'Block', 'Run', 'Marker', 'Marker_Count']].copy()
    new_df = new_df.rename(columns={'TotalTime': 'time', 'PupilArea': 'pupil'})

    if not new_df['Run'].notna().any():
        raise ValueError('no run labels found for ' + str(sub_name))

    min_run = int(new_df['Run'].min())
    max_run = int(new_df['Run'].max())

    print(sub_name,' ,runs ',min_run,' to ', max_run)

    # (subject, run, first run in this file) combinations that are skipped in
    # order to avoid issues with experiments recorded in multiple files
    skipped_runs = {('S9', 11, 1), ('S22', 4, 1), ('S28', 11, 2)}

    for cur_run in range(min_run, max_run+1):
        if (sub_name, cur_run, min_run) in skipped_runs:
            continue

        # select only the rows of the current run (to avoid drifting pupil data)
        pp_data = new_df.loc[new_df['Run'] == cur_run].copy()
        if pp_data.empty:
            # a run number without samples would only yield a header-only csv
            continue

        pp_data['time'] = pp_data['time'].astype(float) * 1000 # convert sec to ms

        pp_data.to_csv(os.path.join(out_dir, sub_name + '-' + str(cur_run) + '.csv'))
    return


## define functions to list the per-run csv files and resample them
def get_csv_files(directory):
    """Switch the working directory to ``directory`` and list its ``*.csv`` files.

    Parameters
    ----------
    directory : str
        Folder to search (not recursive).

    Returns
    -------
    list of str
        File names relative to ``directory``, in the order ``glob`` reports them.

    Notes
    -----
    The process working directory is changed to ``directory`` as a side effect.
    """
    os.chdir(directory)
    return list(glob.glob("*.csv"))

def time_accurate_resampling(df, target_samp_freq, time_col_name='time'):
    """Resample a pupil recording onto a regular time grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a time column in milliseconds (``time_col_name``), a numeric
        'pupil' column and a 'Marker' column. The frame is not modified.
    target_samp_freq : float
        Target sampling rate in Hz; the bin width is ``1000 / target_samp_freq`` ms.
    time_col_name : str, optional
        Name of the time column.

    Returns
    -------
    pandas.DataFrame
        Indexed by bin timestamp. Contains the per-bin mean of every numeric
        input column (gaps in 'pupil' linearly interpolated), plus
        'pupil_resampled' (cubic interpolation of the raw samples evaluated
        at the bin timestamps), 'Marker' ('B' / 'E' placed on the bin nearest
        to the original marker time, NaN elsewhere) and 'time_new' (bin
        timestamp in ms).
    """
    # index by timestamp without adding a column to the caller's frame
    timestamps = pd.DatetimeIndex(pd.to_datetime(df[time_col_name], unit='ms'), name='timestamp')
    df = df.set_index(timestamps)

    # bin the data; the mean over numeric columns drops the marker column
    bin_width_ms = 1000/target_samp_freq
    resampled_df = df.resample(str(bin_width_ms)+'ms').mean(numeric_only=True)

    ## interpolate the raw pupil trace at the bin timestamps
    # Both time axes are measured from the same origin (the first sample):
    # the bin grid need not start exactly at the first sample, so separate
    # origins would shift the interpolated trace in time.
    origin = df.index[0]
    sample_sec = np.asarray((df.index - origin).total_seconds(), dtype=float)
    pupil = df['pupil'].to_numpy(dtype=float)
    # a spline fitted on data containing NaN returns NaN everywhere,
    # so only valid samples are used for the fit
    valid = ~np.isnan(pupil)
    f = interp1d(sample_sec[valid], pupil[valid], kind='cubic', fill_value='extrapolate')
    grid_sec = np.asarray((resampled_df.index - origin).total_seconds(), dtype=float)
    resampled_df['pupil_resampled'] = f(grid_sec)

    # put the block markers back onto the nearest bin (first bin wins on ties)
    resampled_df['Marker'] = pd.Series(np.nan, index=resampled_df.index, dtype=object)
    for marker in ('B', 'E'):
        for t in df.index[(df['Marker'] == marker).to_numpy()]:
            nearest = np.abs((resampled_df.index - t).to_numpy()).argmin()
            resampled_df.loc[resampled_df.index[nearest], 'Marker'] = marker

    # bin timestamp in ms since the epoch, independent of the index resolution
    resampled_df['time_new'] = np.asarray(
        (resampled_df.index - pd.Timestamp(0)) / pd.Timedelta(milliseconds=1), dtype=float
    )

    # interpolate missing pupil data (bins that received no valid sample)
    resampled_df['pupil'] = resampled_df['pupil'].interpolate()

    return resampled_df


def remove(string, i):
    """Return ``string`` without the character at position ``i``.

    The string is returned unchanged when ``i`` is at or beyond its end.
    """
    if i >= len(string):
        return string
    a = list(string)
    a.pop(i)
    return "".join(a)


# create csv from pupil-data txt

In [38]:
# collect the raw recordings, ordered by the first two characters of the file name
rel_dir = "/Users/scanlab/Documents/internship_luca/Pupil_data_Maastricht/testing/eyetracking_data/"
file_names = sorted(get_txt_files(rel_dir), key=lambda name: name[:2])

# convert every recording into per-run csv files
for file in file_names:
    # the subject number is the first character when it is followed by '_',
    # otherwise the first two characters
    sub_name = 'S' + (file[0] if file[1] == '_' else file[:2])

    df = load_pupil_data_from_disk(rel_dir + file)
    create_df_prepross(df, sub_name)
    

S10  ,runs  1  to  16
S11  ,runs  1  to  16
S12  ,runs  1  to  16
S13  ,runs  1  to  11
S13  ,runs  11  to  16
S14  ,runs  1  to  16
S15  ,runs  1  to  16
S16  ,runs  1  to  16
S17  ,runs  1  to  16
S18  ,runs  1  to  3
S18  ,runs  4  to  16
S19  ,runs  1  to  16
S1  ,runs  4  to  16
S1  ,runs  1  to  3
S20  ,runs  1  to  16
S21  ,runs  1  to  16
S22  ,runs  4  to  16
S22  ,runs  1  to  4
S23  ,runs  1  to  16
S24  ,runs  1  to  16
S25  ,runs  1  to  16
S26  ,runs  1  to  16
S27  ,runs  1  to  16
S28  ,runs  11  to  16
S28  ,runs  1  to  1
S28  ,runs  2  to  10
S2  ,runs  1  to  16
S3  ,runs  1  to  16
S4  ,runs  1  to  16
S5  ,runs  1  to  16
S6  ,runs  1  to  16
S7  ,runs  1  to  16
S8  ,runs  1  to  16
S9  ,runs  11  to  16
S9  ,runs  1  to  11


# time-accurate downsampling

In [None]:
glob_dir = "/Users/scanlab/Documents/internship_luca/Pupil_data_Maastricht/pupilFieldtrip/blocks/"

file_names = get_csv_files(glob_dir)
file_names.sort()

target_hz = 125

# make sure the output folder exists before the first file is written
os.makedirs(glob_dir+'predown/', exist_ok=True)

# perform downsampling
for file in file_names:
    df = pd.read_csv(glob_dir+file)
    # drop the index column that was written along with the csv
    df = df.drop(columns=['Unnamed: 0'])
    resampled_df = time_accurate_resampling(df, target_hz, 'time')

    resampled_df.to_csv(glob_dir+'predown/'+file[:-4]+'.csv')
