# Data Preparation

Clean the raw EEG data

In [2]:
#Libraries
import pandas as pd
import numpy as np
from scipy.io import arff

from aux_functions import timeseries_to_pandas
from aux_functions import pandas_to_numpy

import pickle
import os

### Alcoholism_S1

In [2]:
#Read the csv files and concatenate them into a single dataframe.
#The frames are collected in a list and concatenated once: calling pd.concat
#inside the loop re-copies every previously read row on each iteration and
#starts from an empty DataFrame, which newer pandas versions warn about.
raw_dir = os.path.join('Datasets_raw', 'Alcoholism_S1')
frames = []

for folder in ['SMNI_CMI_TRAIN', 'SMNI_CMI_TEST']:
    files = os.listdir(os.path.join(raw_dir, folder))
    files = [file for file in files if file.endswith('.csv')]

    for file in files:
        frames.append(pd.read_csv(os.path.join(raw_dir, folder, file)))

#Fail with an explicit message instead of an obscure error further down
if not frames:
    raise FileNotFoundError(f'No .csv files found under {raw_dir}')

df = pd.concat(frames)

#Sort so that the rows of each (subject, trial, channel) are contiguous and
#ordered by sample number, then keep only the columns used below
df = df.sort_values(['name', 'trial number', 'channel', 'sample num'])
df = df[['name', 'subject identifier', 'matching condition', 'trial number',
         'channel', 'sample num', 'sensor value']]

#Limit the trials only to those where the matching condition is 'S1 obj'
df = df[df['matching condition'] == 'S1 obj']

#Build a dictionary keyed by subject name. Each entry holds:
#  'X': a numpy array built from nested lists indexed as
#       [trial][channel][sample] of the 'sensor value' column
#  'y': a numpy array with the 'subject identifier' of the subject, one entry
#       per distinct (subject identifier, trial number) pair
#       ('c' for control, 'a' for alcoholic)

subjects = {}

for subject_name in df.name.unique():
    subject_rows = df[df.name == subject_name]

    #One sub-frame per trial, in order of first appearance
    trial_frames = (
        subject_rows[subject_rows['trial number'] == trial_id]
        for trial_id in subject_rows['trial number'].unique()
    )

    #For every trial, one list of sensor values per channel
    subject_trials = [
        [
            list(trial_rows.loc[trial_rows['channel'] == channel_id, 'sensor value'])
            for channel_id in trial_rows['channel'].unique()
        ]
        for trial_rows in trial_frames
    ]

    trial_labels = subject_rows[['subject identifier', 'trial number']].drop_duplicates()

    subjects[subject_name] = {
        'X': np.array(subject_trials),
        'y': np.array(trial_labels['subject identifier']),
    }

#Create folder structure and save the clean data.
#exist_ok=True makes the directory creation a no-op when the folder already
#exists, without a separate (race-prone) existence check
clean_dir = os.path.join('Datasets_clean', 'Alcoholism_S1')
os.makedirs(clean_dir, exist_ok=True)

with open(os.path.join(clean_dir, 'clean_data.pkl'), 'wb') as handle:
    pickle.dump(subjects, handle)

### FingerMovements

In [5]:
#Read data: one .arff file per dimension (1 to 28) for each of the train and
#test splits, stored in dictionaries keyed by the dimension number
raw_dir = os.path.join('Datasets_raw', 'FingerMovements')
data_train = {}
data_test = {}

for i in range(1,29):
    #loadarff returns a (data, meta) tuple; only the data part is kept
    data_train[i] = arff.loadarff(os.path.join(raw_dir,
                                               f'FingerMovementsDimension{i}_TRAIN.arff'))[0]
    data_test[i] = arff.loadarff(os.path.join(raw_dir,
                                              f'FingerMovementsDimension{i}_TEST.arff'))[0]

#Convert to pandas dataframes
df_train = timeseries_to_pandas(data_train)
df_test = timeseries_to_pandas(data_test)

#Convert to numpy arrays
X_train, y_train = pandas_to_numpy(df_train)
X_test, y_test = pandas_to_numpy(df_test)

#Create folder structure and save the arrays.
#exist_ok=True avoids a separate existence check, and every output path is
#built with os.path.join, consistently with the rest of the notebook
clean_dir = os.path.join('Datasets_clean', 'FingerMovements')
os.makedirs(clean_dir, exist_ok=True)

#np.save appends the .npy extension to each file name
for array_name, array in [('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test)]:
    np.save(os.path.join(clean_dir, array_name), array)

### SelfRegulationSCP1

In [6]:
#Read data: one .arff file per dimension (1 to 6) for each of the train and
#test splits, stored in dictionaries keyed by the dimension number
raw_dir = os.path.join('Datasets_raw', 'SelfRegulationSCP1')
data_train = {}
data_test = {}

for i in range(1,7):
    #loadarff returns a (data, meta) tuple; only the data part is kept
    data_train[i] = arff.loadarff(os.path.join(raw_dir,
                                               f'SelfRegulationSCP1Dimension{i}_TRAIN.arff'))[0]
    data_test[i] = arff.loadarff(os.path.join(raw_dir,
                                              f'SelfRegulationSCP1Dimension{i}_TEST.arff'))[0]

#Convert to pandas dataframes
df_train = timeseries_to_pandas(data_train)
df_test = timeseries_to_pandas(data_test)

#Convert to numpy arrays
X_train, y_train = pandas_to_numpy(df_train)
X_test, y_test = pandas_to_numpy(df_test)

#Create folder structure and save the arrays.
#exist_ok=True avoids a separate existence check, and every output path is
#built with os.path.join, consistently with the rest of the notebook
clean_dir = os.path.join('Datasets_clean', 'SelfRegulationSCP1')
os.makedirs(clean_dir, exist_ok=True)

#np.save appends the .npy extension to each file name
for array_name, array in [('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test)]:
    np.save(os.path.join(clean_dir, array_name), array)

### DEAP

In [32]:
#Create a dictionary to store, for each subject file, the EEG timeseries under
#'X' and one binary label array per rating under 'y_valence', 'y_arousal',
#'y_dominance' and 'y_liking' (1 when the rating is >= 5, 0 otherwise)
subjects = {}
i = 1

deap_dir = os.path.join('Datasets_raw', 'DEAP', 'data_preprocessed_python')

#os.listdir returns entries in arbitrary order; sorting makes the subject_{i}
#numbering reproducible across runs and machines
for file in sorted(os.listdir(deap_dir)):
    if file.endswith(".dat"):
        #NOTE(review): pickle.load can execute arbitrary code while loading;
        #only run this on files obtained from a trusted source
        with open(os.path.join(deap_dir, file), 'rb') as handle:
            data = pickle.load(handle, encoding='latin1')

        #EEG timeseries: keep indices 0..31 of the second axis. Copying the
        #slice (rather than slicing a full copy) avoids keeping the unused
        #part of the array alive in memory for every subject
        eeg_timeseries = data['data'][:, 0:32, :].copy()

        #Save the subject data
        subject_data = {'X': eeg_timeseries}

        #Response variables: columns 0..3 of data['labels'], in this order.
        #np.digitize with the single bin edge 5 returns 0 for values below 5
        #and 1 for values greater than or equal to 5
        for column, rating in enumerate(['valence', 'arousal', 'dominance', 'liking']):
            subject_data[f'y_{rating}'] = np.digitize(data['labels'][:, column],
                                                      np.array([5]))

        subjects[f'subject_{i}'] = subject_data

        i = i+1
        
#Create folder structure and save the clean data.
#exist_ok=True makes the directory creation a no-op when the folder already
#exists, without a separate (race-prone) existence check
clean_dir = os.path.join('Datasets_clean', 'DEAP')
os.makedirs(clean_dir, exist_ok=True)

with open(os.path.join(clean_dir, 'clean_data.pkl'), 'wb') as handle:
    pickle.dump(subjects, handle)