# Data Preparation

Clean the raw EEG data

In [29]:
#Libraries
import os
import pickle

import numpy as np
import pandas as pd
from scipy.io import arff

from aux_functions import timeseries_to_pandas
from aux_functions import pandas_to_numpy

### Alcoholism_S1

In [5]:
#Read the csv files of the train and test folders and concatenate them into
#a single dataframe
raw_dir = os.path.join('Datasets_raw', 'Alcoholism_S1')
frames = []

for folder in ['SMNI_CMI_TRAIN', 'SMNI_CMI_TEST']:
    folder_path = os.path.join(raw_dir, folder)

    for file in os.listdir(folder_path):
        #Anything that is not a csv file is ignored
        if file.endswith('.csv'):
            frames.append(pd.read_csv(os.path.join(folder_path, file)))

#Fail early with a clear message instead of a KeyError in sort_values below
if not frames:
    raise FileNotFoundError(f'No .csv files found under {raw_dir}')

#Concatenate once: calling pd.concat inside the loop copies the whole
#accumulated dataframe again for every file
df = pd.concat(frames)

#Order the rows so that trials, channels and samples are contiguous, and keep
#only the columns used afterwards
df = df.sort_values(['name', 'trial number', 'channel', 'sample num'])
df = df[['name', 'subject identifier', 'matching condition', 'trial number',
         'channel', 'sample num', 'sensor value']]

#Limit the trials only to those where the matching condition is S1
df = df[df['matching condition'] == 'S1 obj']

#Build a dictionary keyed by subject name. Each entry holds:
#  'X': numpy array with all the trials of the subject, built as
#       trials -> channels -> sensor values, i.e. intended to have the
#       format (n_trials, n_channels, n_timesteps)
#  'y': numpy array with the subject identifier ('c' for control, 'a' for
#       alcoholic), one value per trial of the subject

subjects = {}

for subject_name in df['name'].unique():
    rows = df[df['name'] == subject_name]

    #One entry per trial; each entry is a list with the sensor values of
    #every channel, in order of first appearance in the (sorted) dataframe
    recordings = []
    for trial_id in rows['trial number'].unique():
        trial_rows = rows[rows['trial number'] == trial_id]
        recordings.append([
            list(trial_rows.loc[trial_rows['channel'] == channel_id, 'sensor value'])
            for channel_id in trial_rows['channel'].unique()
        ])

    #One row per (identifier, trial) pair gives one label per trial
    trial_labels = rows[['subject identifier', 'trial number']].drop_duplicates()

    subjects[subject_name] = {
        'X': np.array(recordings),
        'y': np.array(trial_labels['subject identifier']),
    }

#Create folder structure and save the clean data
clean_dir = os.path.join('Datasets_clean', 'Alcoholism_S1')

#exist_ok=True creates the folder only when it is missing, without a separate
#existence check that could race with another process
os.makedirs(clean_dir, exist_ok=True)

#The whole subjects dictionary is stored in a single pickle file
with open(os.path.join(clean_dir, 'clean_data.pkl'), 'wb') as handle:
    pickle.dump(subjects, handle)

### FingerMovements

In [31]:
#Input and output folders of this dataset
raw_dir = os.path.join('Datasets_raw', 'FingerMovements')
clean_dir = os.path.join('Datasets_clean', 'FingerMovements')

#Read data: one ARFF file per dimension (1 to 28) and per split.
#arff.loadarff (scipy.io) returns a (data, meta) tuple; only the data is kept
data_train = {}
data_test = {}

for i in range(1, 29):
    data_train[i] = arff.loadarff(
        os.path.join(raw_dir, f'FingerMovementsDimension{i}_TRAIN.arff'))[0]
    data_test[i] = arff.loadarff(
        os.path.join(raw_dir, f'FingerMovementsDimension{i}_TEST.arff'))[0]

#Convert to pandas dataframes (helper from aux_functions)
df_train = timeseries_to_pandas(data_train)
df_test = timeseries_to_pandas(data_test)

#Convert to numpy arrays (helper from aux_functions, returns features and labels)
X_train, y_train = pandas_to_numpy(df_train)
X_test, y_test = pandas_to_numpy(df_test)

#Create folder structure and save the arrays
os.makedirs(clean_dir, exist_ok=True)

#np.save appends the '.npy' extension to these file names
np.save(os.path.join(clean_dir, 'X_train'), X_train)
np.save(os.path.join(clean_dir, 'y_train'), y_train)
np.save(os.path.join(clean_dir, 'X_test'), X_test)
np.save(os.path.join(clean_dir, 'y_test'), y_test)

### SelfRegulationSCP1

In [32]:
#Input and output folders of this dataset
raw_dir = os.path.join('Datasets_raw', 'SelfRegulationSCP1')
clean_dir = os.path.join('Datasets_clean', 'SelfRegulationSCP1')

#Read data: one ARFF file per dimension (1 to 6) and per split.
#arff.loadarff (scipy.io) returns a (data, meta) tuple; only the data is kept
data_train = {}
data_test = {}

for i in range(1, 7):
    data_train[i] = arff.loadarff(
        os.path.join(raw_dir, f'SelfRegulationSCP1Dimension{i}_TRAIN.arff'))[0]
    data_test[i] = arff.loadarff(
        os.path.join(raw_dir, f'SelfRegulationSCP1Dimension{i}_TEST.arff'))[0]

#Convert to pandas dataframes (helper from aux_functions)
df_train = timeseries_to_pandas(data_train)
df_test = timeseries_to_pandas(data_test)

#Convert to numpy arrays (helper from aux_functions, returns features and labels)
X_train, y_train = pandas_to_numpy(df_train)
X_test, y_test = pandas_to_numpy(df_test)

#Create folder structure and save the arrays
os.makedirs(clean_dir, exist_ok=True)

#np.save appends the '.npy' extension to these file names
np.save(os.path.join(clean_dir, 'X_train'), X_train)
np.save(os.path.join(clean_dir, 'y_train'), y_train)
np.save(os.path.join(clean_dir, 'X_test'), X_test)
np.save(os.path.join(clean_dir, 'y_test'), y_test)

### DEAP

In [33]:
#Folder with the preprocessed DEAP recordings (pickled .dat files)
deap_dir = os.path.join('Datasets_raw', 'DEAP', 'data_preprocessed_python')

#Storing variables: per-file pieces that are stacked once after the loop.
#Each list starts with an empty array so that the stacked result keeps the
#same shape and dtype as before, even when no .dat file is found
eeg_parts = [np.empty((0,33,8064))]
valence_parts = [np.empty(0)]
arousal_parts = [np.empty(0)]
liking_parts = [np.empty(0)]

#Thresholds for np.digitize: values in [0, 5) fall in bin 1, values >= 5 in bin 2
bins = np.array([0,5])

#Read data. The listing is sorted because os.listdir returns entries in
#arbitrary order and the train/test split done afterwards depends on the
#order in which the files are stacked
for file in sorted(os.listdir(deap_dir)):
    if not file.endswith(".dat"):
        continue

    #NOTE: pickle can execute arbitrary code while loading; only use it on
    #trusted copies of the dataset. The with block closes the file handle
    with open(os.path.join(deap_dir, file), 'rb') as handle:
        data = pickle.load(handle, encoding='latin1')

    #EEG timeseries: keep the first 33 channels. The slice is copied so the
    #remaining channels of the file can be released
    #NOTE(review): the DEAP documentation lists 32 EEG channels; confirm that
    #keeping a 33rd channel is intended
    eeg_parts.append(data['data'][:,0:33,:].copy())

    #Response variables: label columns 0, 1 and 3 are used as valence,
    #arousal and liking respectively
    labels = data['labels']
    valence_parts.append(np.digitize(labels[:,0], bins))
    arousal_parts.append(np.digitize(labels[:,1], bins))
    #Fixed: the liking labels were previously filled with the valence values
    liking_parts.append(np.digitize(labels[:,3], bins))

#Stack everything once instead of re-copying the accumulated array per file
full_array = np.vstack(eeg_parts)
full_valence = np.hstack(valence_parts)
full_arousal = np.hstack(arousal_parts)
full_liking = np.hstack(liking_parts)

#0-1 response variables: bin 1 (< 5) becomes 0 and bin 2 (>= 5) becomes 1.
#The order of the two assignments matters
for target in (full_valence, full_arousal, full_liking):
    target[target == 1] = 0
    target[target == 2] = 1

#Split the stacked trials: the first 960 go to train, the rest to test
train_size = 960
X_train = full_array[0:train_size].copy()
X_test = full_array[train_size:].copy()

#One output dataset per response variable; all of them share the same X
deap_targets = {
    'DEAP_Valence': full_valence,
    'DEAP_Arousal': full_arousal,
    'DEAP_Liking': full_liking,
}

#Create folder structure and save the arrays
for dataset_name, target in deap_targets.items():
    out_dir = os.path.join('Datasets_clean', dataset_name)

    #exist_ok=True creates the folder only when it is missing
    os.makedirs(out_dir, exist_ok=True)

    np.save(os.path.join(out_dir, 'X_train.npy'), X_train)
    np.save(os.path.join(out_dir, 'X_test.npy'), X_test)

    #The labels are split at the same position as the features
    np.save(os.path.join(out_dir, 'y_train.npy'), target[0:train_size])
    np.save(os.path.join(out_dir, 'y_test.npy'), target[train_size:])