In [1]:
import pandas as pd 
import glob 
import numpy as np
import os

In [2]:
# Collect the paths of all CSV files in the data directory, in sorted order.
data_dir = '../data/size_30sec_150ts_stride_03ts/'
csv_pattern = data_dir + '*.csv'
data_files = sorted(glob.glob(csv_pattern))

In [3]:
# Number of CSV files matched by the glob pattern.
len(data_files)

68

In [4]:
# Load every CSV, keep only the rows we need, tag each row with the subject it
# came from, and stack all subjects into a single frame.
full_df_list = []
for data_file in data_files:
    # The file name up to its first '.' is used as the subject identifier.
    user_id = os.path.basename(data_file).split('.')[0]
    user_df = pd.read_csv(data_file)
    # we only use 0-back, 2-back for our dataset, thus use only label 0 and 2.
    # .copy() detaches the filtered rows from the frame returned by read_csv,
    # so adding a column below does not trigger SettingWithCopyWarning and
    # does not depend on pandas' view-versus-copy behaviour.
    user_df = user_df[user_df['label'].isin([0, 2])].copy()
    user_df['subject_id'] = user_id
    full_df_list.append(user_df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the likely cause instead.
if not full_df_list:
    raise ValueError('no CSV files were loaded; check data_dir')
# The per-file row index is kept as-is, so index values may repeat across subjects.
full_df = pd.concat(full_df_list)

In [6]:
# Cache the combined frame as a pickle file (relative path, so it lands in the
# current working directory).
full_df.to_pickle('size_30sec_150ts_stride_03ts.pkl')

In [14]:
# Reload the combined frame from the pickle file of the same name that this
# notebook writes earlier.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this notebook.
full_df = pd.read_pickle('size_30sec_150ts_stride_03ts.pkl')

In [15]:
# Give every (chunk, subject_id) pair its own integer GLOBAL_ID so that a chunk
# can be identified unambiguously across subjects.
chunk_groups = full_df.groupby(['chunk', 'subject_id'])
full_df['GLOBAL_ID'] = chunk_groups.ngroup()

In [16]:

N_BUCKETS = 17  # number of subject buckets

# np.sort returns a sorted NumPy array whether Series.unique() hands back an
# ndarray or a pandas extension array (the latter has no in-place .sort()).
# The ids are strings such as 'sub_1', so the order is lexicographic
# ('sub_10' comes before 'sub_2').
subject_ids = np.sort(full_df['subject_id'].unique())

# Split the sorted subjects into N_BUCKETS contiguous, nearly equal groups.
buckets = [bucket.tolist() for bucket in np.array_split(subject_ids, N_BUCKETS)]

# Map every subject_id to the index of the bucket that contains it.
bucket_dict = {
    subject_id: bucket_id
    for bucket_id, bucket in enumerate(buckets)
    for subject_id in bucket
}

# Store each row's bucket in a new 'bucket_id' column, so all rows of one
# subject share the same bucket.
full_df['bucket_id'] = full_df['subject_id'].map(bucket_dict)

In [17]:
# Show the frame, which now carries the GLOBAL_ID and bucket_id columns.
full_df

Unnamed: 0,AB_I_O,AB_PHI_O,AB_I_DO,AB_PHI_DO,CD_I_O,CD_PHI_O,CD_I_DO,CD_PHI_DO,chunk,label,subject_id,GLOBAL_ID,bucket_id
0,-0.198291,0.335677,-0.560790,-0.056042,0.460231,0.271119,-0.645892,0.198804,0,0,sub_1,0,0
1,-0.195651,0.297210,-0.580712,-0.032170,0.524553,0.208284,-0.681370,0.202136,0,0,sub_1,0,0
2,-0.192790,0.275041,-0.600446,-0.012370,0.585149,0.162781,-0.716632,0.194758,0,0,sub_1,0,0
3,-0.189943,0.271573,-0.619751,0.002192,0.641092,0.133816,-0.750998,0.176938,0,0,sub_1,0,0
4,-0.187401,0.288043,-0.638379,0.010818,0.691434,0.119983,-0.783748,0.149429,0,0,sub_1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223195,0.540045,0.875518,-0.325848,-0.440688,1.713074,-0.249398,0.060632,-0.672216,1487,2,sub_97,50591,16
223196,0.544018,0.896896,-0.330902,-0.466260,1.713547,-0.275807,0.057288,-0.678784,1487,2,sub_97,50591,16
223197,0.549293,0.903605,-0.336123,-0.489408,1.713248,-0.282847,0.054480,-0.693624,1487,2,sub_97,50591,16
223198,0.556096,0.895520,-0.341600,-0.509439,1.712542,-0.268480,0.052047,-0.716411,1487,2,sub_97,50591,16


In [28]:
# Derive a 'split' column from bucket_id; it is used to divide the dataset:
#   bucket_id 0-13 -> 'train', 14-15 -> 'val', 16 and above -> 'test'
def _bucket_to_split(bucket_id):
    """Return the split name ('train', 'val' or 'test') for one bucket id."""
    if bucket_id < 14:
        return 'train'
    if bucket_id < 16:
        return 'val'
    return 'test'


full_df['split'] = full_df['bucket_id'].apply(_bucket_to_split)

In [30]:
# Show only the rows assigned to the validation split.
full_df.loc[full_df['split'] == 'val']

Unnamed: 0,AB_I_O,AB_PHI_O,AB_I_DO,AB_PHI_DO,CD_I_O,CD_PHI_O,CD_I_DO,CD_PHI_DO,chunk,label,subject_id,GLOBAL_ID,bucket_id,split
0,-0.254009,0.323834,0.163370,0.173541,-1.035942,-0.503407,-0.046113,0.077430,0,0,sub_81,56,14,val
1,-0.254776,0.368412,0.163123,0.161432,-1.058797,-0.518247,-0.040133,0.092480,0,0,sub_81,56,14,val
2,-0.256039,0.388482,0.162104,0.151333,-1.070506,-0.527267,-0.033105,0.102594,0,0,sub_81,56,14,val
3,-0.257520,0.384040,0.160270,0.143349,-1.070891,-0.528275,-0.025085,0.107280,0,0,sub_81,56,14,val
4,-0.258872,0.356346,0.157577,0.137418,-1.060096,-0.519615,-0.016195,0.106354,0,0,sub_81,56,14,val
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223195,1.248246,-0.016309,-0.481662,-0.333168,0.792752,0.307972,-0.235426,-0.782164,1487,2,sub_92,50587,15,val
223196,1.247319,-0.050478,-0.487126,-0.338037,0.788060,0.280145,-0.231095,-0.766535,1487,2,sub_92,50587,15,val
223197,1.246066,-0.083841,-0.491427,-0.342444,0.783848,0.252175,-0.226277,-0.749189,1487,2,sub_92,50587,15,val
223198,1.244989,-0.115686,-0.494529,-0.346336,0.780280,0.226762,-0.221046,-0.731499,1487,2,sub_92,50587,15,val


In [31]:
# Rows marked 'train' or 'val' go into train_df; rows marked 'test' go into test_df.
split_labels = full_df['split']
train_df = full_df[split_labels.isin(['train', 'val'])]
test_df = full_df[split_labels == 'test']

In [33]:
# separate train_df and test_df to feature part and label part
train_df_x = train_df.drop(['label', 'chunk', 'subject_id', 'bucket_id', 'split'], axis=1)
train_df_y = train_df[['label','GLOBAL_ID','chunk', 'subject_id', 'bucket_id', 'split']]
test_df_x = test_df.drop(['label', 'chunk', 'subject_id', 'bucket_id', 'split'], axis=1)
test_df_y = test_df[['label','GLOBAL_ID','chunk', 'subject_id', 'bucket_id', 'split']]

In [38]:
def _first_row_per_chunk(labels_df):
    """Return the first row of every GLOBAL_ID group in labels_df."""
    return labels_df.groupby(['GLOBAL_ID']).head(1)


# Reduce the label frames to one row per GLOBAL_ID.
# Assumes every row of a chunk carries the same label -- TODO confirm.
train_df_y = _first_row_per_chunk(train_df_y)
test_df_y = _first_row_per_chunk(test_df_y)

In [41]:
# Write the feature and label frames of both splits to disk as pickle files.
frames_by_path = {
    '../data/fnirs_size_30sec_150ts_stride_03ts/train_x.pkl': train_df_x,
    '../data/fnirs_size_30sec_150ts_stride_03ts/train_y.pkl': train_df_y,
    '../data/fnirs_size_30sec_150ts_stride_03ts/test_x.pkl': test_df_x,
    '../data/fnirs_size_30sec_150ts_stride_03ts/test_y.pkl': test_df_y,
}
for pickle_path, frame in frames_by_path.items():
    # The output directory is not the one the CSVs were read from and nothing
    # else in the notebook creates it, so make sure it exists before writing.
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    frame.to_pickle(pickle_path)