<a href="https://colab.research.google.com/github/karyam/rgnn_eeg_emotion_classifier/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os, sys
import tensorflow as tf
import tensorboard
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.io as io
import pickle

In [None]:
# globals
# All locations are relative paths below "drive/My Drive".
FEATURES_DATA_PATH = "drive/My Drive/BCI/code/data/SEED/ExtractedFeatures"
PREPROCESSED_DATA_PATH = "drive/My Drive/BCI/code/data/SEED/Preprocessed_EEG"
CSV_PATH = "drive/My Drive/BCI_clone/CSV"
NPY_PATH = "drive/My Drive/BCI_clone/npy"
# CSV of the first subject and CSV holding every subject, both inside CSV_PATH
SUBJECT_CSV_PATH = f"{CSV_PATH}/csv_0.csv"
ALL_SUBJECTS_CSV_PATH = f"{CSV_PATH}/csv_all_subjects.csv"

# sampling frequency
sf = 200

# hyper-parameters
num_trials, num_subjects = 15, 15
num_bands, num_classes = 5, 3
batch_size = 32

#### Examine data format

In [None]:
# Keep only the subject files: their names split into exactly two parts on "_".
# Other entries of the folder (such as label.mat, loaded below) are skipped.
session_files = [x for x in os.listdir(FEATURES_DATA_PATH) if len(x.split("_")) == 2]
# os.listdir returns names in arbitrary order. Sort by subject number first and
# break ties on the rest of the name, so the order of a subject's three files
# is the same on every run.
data = sorted(session_files, key=lambda x: (int(x.split("_")[0]), x.split("_")[1]))
# 3 files per subject, each file contains recordings for 15 trials
assert (len(data) == 45)

# load one sample
sample = io.loadmat(os.path.join(FEATURES_DATA_PATH, data[0]))

keys = list(sample.keys())
assert (len(keys) == (2*6*15+3)) # 3 meta keys
print("One sample shape: (num_channels, num_windows, num_bands)")
print(sample["de_LDS1"].shape)

# get all features averaged with LDS
# Select by name rather than by position in the key list, so the result does
# not depend on how many meta keys come first or on how the keys interleave.
features_LDS = [key for key in keys if "_LDS" in key]
print("Feature names LDS averaged")
print(features_LDS)
assert (len(features_LDS) == (15*6))

# One label per trial, shared by all files.
label_mat = io.loadmat(os.path.join(FEATURES_DATA_PATH, "label.mat"))
# Shift every label up by one (presumably from {-1, 0, 1} to {0, 1, 2} - TODO confirm).
labels = np.squeeze(label_mat["label"]) + 1
assert (labels.shape == (15,))
print(type(labels[0]))
print(labels)

One sample shape: (num_channels, num_windows, num_bands)
(62, 235, 5)
Feature names LDS averaged
['de_LDS1', 'psd_LDS1', 'dasm_LDS1', 'rasm_LDS1', 'asm_LDS1', 'dcau_LDS1', 'de_LDS2', 'psd_LDS2', 'dasm_LDS2', 'rasm_LDS2', 'asm_LDS2', 'dcau_LDS2', 'de_LDS3', 'psd_LDS3', 'dasm_LDS3', 'rasm_LDS3', 'asm_LDS3', 'dcau_LDS3', 'de_LDS4', 'psd_LDS4', 'dasm_LDS4', 'rasm_LDS4', 'asm_LDS4', 'dcau_LDS4', 'de_LDS5', 'psd_LDS5', 'dasm_LDS5', 'rasm_LDS5', 'asm_LDS5', 'dcau_LDS5', 'de_LDS6', 'psd_LDS6', 'dasm_LDS6', 'rasm_LDS6', 'asm_LDS6', 'dcau_LDS6', 'de_LDS7', 'psd_LDS7', 'dasm_LDS7', 'rasm_LDS7', 'asm_LDS7', 'dcau_LDS7', 'de_LDS8', 'psd_LDS8', 'dasm_LDS8', 'rasm_LDS8', 'asm_LDS8', 'dcau_LDS8', 'de_LDS9', 'psd_LDS9', 'dasm_LDS9', 'rasm_LDS9', 'asm_LDS9', 'dcau_LDS9', 'de_LDS10', 'psd_LDS10', 'dasm_LDS10', 'rasm_LDS10', 'asm_LDS10', 'dcau_LDS10', 'de_LDS11', 'psd_LDS11', 'dasm_LDS11', 'rasm_LDS11', 'asm_LDS11', 'dcau_LDS11', 'de_LDS12', 'psd_LDS12', 'dasm_LDS12', 'rasm_LDS12', 'asm_LDS12', 'dcau_LDS1

In [None]:
# Value range over every LDS feature of every file, plus a separate range that
# only looks at the de_LDS features.
max_value, min_value = -1e18, 1e18
de_max_value, de_min_value = -1e18, 1e18

# The de_LDS key names are read from the first file; only the de_LDS features
# are used for the final model.
sample = io.loadmat(os.path.join(FEATURES_DATA_PATH, data[0]))
de_keys = list(filter(lambda name: "de_LDS" in name, sample.keys()))
assert (len(de_keys) == 15)

for file_name in data:
  sample = io.loadmat(os.path.join(FEATURES_DATA_PATH, file_name))
  for key in features_LDS:
    values = sample[key]
    largest, smallest = np.amax(values), np.amin(values)
    if key in de_keys:
      de_max_value = max(de_max_value, largest)
      de_min_value = min(de_min_value, smallest)
    max_value = max(max_value, largest)
    min_value = min(min_value, smallest)
print((min_value, max_value))
print(f'de range: {(de_min_value, de_max_value)}')

(-23.825413747036478, 1072646499245.6045)
de range: (10.567626836074302, 42.11366999020901)


#### Reformat to CSV files for convenience and for use with TFDV (TensorFlow Data Validation)

In [None]:
# dataframe format: each row represents one data sample (one window from trial)
# columns: all the provided features with LDS averaging for each channel and for each band
features = ['de_LDS', 'psd_LDS', 'dasm_LDS', 'rasm_LDS', 'asm_LDS', 'dcau_LDS']
# number of channels of each feature, in the same order as `features`
num_ch_per_feature = [62, 62, 27, 27, 54, 23]
bands = ["delta", "theta", "alpha", "beta", "gamma"]

# Column names are "<feature>_<channel>_<band>": channels are numbered from 1
# and the band varies fastest.
columns = [
  f"{feature}_{ch}_{band}"
  for feature, num_ch in zip(features, num_ch_per_feature)
  for ch in range(1, num_ch + 1)
  for band in bands
]

# add column for labels
columns.append("label")
assert (len(columns) == sum(num_ch_per_feature) * len(bands) + 1)
# every column name must be unique, otherwise columns would collide in a DataFrame
assert (len(set(columns)) == len(columns))

# 62*5 + 62*5 + 27*5 + 27*5 + 54*5 + 23*5 = 1275 feature columns.
# total_num_features also counts the label column, so it equals 1276.
total_num_features = len(columns)

In [None]:
def get_data_from_file(file:str):
  """Flatten one feature file into a 2-D array with one row per window.

  For each of the 15 trials the six LDS features are reshaped to
  (num_windows, channels*bands) and placed side by side, followed by one
  column holding the trial label. The rows of all trials are then stacked.

  Args:
    file: name of a .mat file inside FEATURES_DATA_PATH.

  Returns:
    Array of shape (total number of windows in the file, total_num_features).
  """
  mat = io.loadmat(os.path.join(FEATURES_DATA_PATH, file))
  per_trial_rows = []
  total_num_wind = 0

  # features_LDS holds 6 consecutive entries per trial: 90 = 15*6
  for trial, start in enumerate(range(0, 90, 6)): # trial is 0-indexed
    de_name = features_LDS[start]
    assert (de_name == f"de_LDS{trial+1}")
    num_wind_trial = mat[de_name].shape[1]
    total_num_wind += num_wind_trial

    # process the entire set of 6 features
    feature_blocks = []
    for j in range(6):
      # move the window axis to the front, then merge channels and bands
      block = np.swapaxes(mat[features_LDS[start+j]], 0, 1)
      block = np.reshape(block, (-1, num_ch_per_feature[j]*num_bands))
      assert (block.shape == (num_wind_trial, 5*num_ch_per_feature[j]))

      if j == 0: # de feature
        assert (np.amax(block) <= de_max_value)
        assert (np.amin(block) >= de_min_value)
      feature_blocks.append(block)

    # every window of the trial gets the trial label
    trial_labels = np.full((num_wind_trial, 1), labels[trial])
    assert (np.unique(trial_labels).shape == (1,))
    assert (trial_labels[0] == labels[trial])

    # horizontal layout: feature_ch_band order + an additional column for labels
    trial_rows = np.concatenate(feature_blocks + [trial_labels], 1)
    assert (trial_rows.shape == (num_wind_trial, total_num_features))
    per_trial_rows.append(trial_rows)

  entire_file_data = np.concatenate(per_trial_rows[:num_trials], axis=0)
  assert entire_file_data.shape == (total_num_wind, total_num_features)
  assert (np.amax(entire_file_data) <= max_value)
  assert (np.amin(entire_file_data) >= min_value)

  return entire_file_data

In [None]:
def get_all_data_for_subject(file1, file2, file3):
  """Stack the flattened rows of a subject's three files, in the order given.

  The shape assertion requires the three files to contribute the same number
  of rows as the first one.

  Returns:
    Array of shape (3 * rows of file1, total_num_features).
  """
  per_file = [get_data_from_file(name) for name in (file1, file2, file3)]
  series = np.concatenate(per_file, axis=0)
  expected_rows = 3*per_file[0].shape[0]
  assert (series.shape == (expected_rows, total_num_features))
  assert (np.amax(series) <= max_value)
  assert (np.amin(series) >= min_value)
  return series

In [None]:
# Create the output folder up front, so a missing folder does not abort the
# export after the first subject has already been processed.
os.makedirs(CSV_PATH, exist_ok=True)

all_dfs = []
# `data` holds 3 consecutive files per subject
for i in range(0, len(data), 3):
  subject_idx = i // 3 # 0-indexed subject number used in the file name
  # get all data for each subject
  subject_data = get_all_data_for_subject(data[i], data[i+1], data[i+2])
  # convert numpy data to DataFrame; column j of the array is named columns[j]
  df = pd.DataFrame(subject_data, columns=columns)
  all_dfs.append(df)
  # save dataframe to csv
  df.to_csv(CSV_PATH + "/" + "csv_" + str(subject_idx) + ".csv")

# NOTE: pd.concat keeps each subject's own row index, so index values repeat
# in the combined file.
all_subjects_df = pd.concat(all_dfs)
all_subjects_df.to_csv(ALL_SUBJECTS_CSV_PATH)

#### Concatenate multiple windows into one training sample
*The cells above treat a single 1 s window as one data sample. To classify the input effectively, however, several consecutive time steps (1 s windows) need to be considered together.*

In [None]:
def _split_into_windows(trial_features, w_len, drop_incomplete):
  """Group consecutive time steps of one trial into samples of w_len steps.

  Args:
    trial_features: array of shape (channels, time steps, bands).
    w_len: number of consecutive time steps per sample (>= 1).
    drop_incomplete: if truthy, trailing steps that do not fill a whole group
      are discarded; otherwise the last group is completed by repeating the
      final time step.

  Returns:
    Array of shape (number of groups, channels, w_len*bands). Within a sample
    the time step varies slowest and the band fastest.
  """
  num_ch, num_steps, n_bands = trial_features.shape
  leftover = num_steps % w_len
  if leftover:
    if drop_incomplete:
      trial_features = trial_features[:, :num_steps - leftover, :]
    else:
      # edge padding only repeats existing values, so the value range is unchanged
      pad_width = ((0, 0), (0, w_len - leftover), (0, 0))
      trial_features = np.pad(trial_features, pad_width, mode="edge")
  num_groups = trial_features.shape[1] // w_len
  grouped = np.reshape(trial_features, (num_ch, num_groups, w_len * n_bands))
  return np.ascontiguousarray(np.swapaxes(grouped, 0, 1))


def get_wind_from_file(file:str, w_len, drop_incomplete=True):
  """Build multi-window samples from the de_LDS feature of one file.

  Args:
    file: name of a .mat file inside FEATURES_DATA_PATH.
    w_len: number of consecutive windows concatenated into one sample.
    drop_incomplete: if truthy (default), trailing windows of a trial that do
      not fill a whole sample are dropped. If falsy, the last sample of the
      trial is completed by repeating the trial's final window.

  Returns:
    (all_data, all_labels): all_data has shape
    (number of samples, 62, num_bands*w_len); all_labels has shape
    (number of samples,) and holds the label of the trial each sample is from.

  Raises:
    ValueError: if w_len < 1, or if a trial yields no sample at all.
  """
  if w_len < 1:
    raise ValueError(f"w_len must be a positive integer, got {w_len}")

  x = io.loadmat(os.path.join(FEATURES_DATA_PATH, file))
  all_data, all_labels = [], []
  total_num_wind = 0

  # features_LDS holds 6 consecutive entries per trial; the first is de_LDS
  for i in range(0, 90, 6):
    trial = i // 6 # 0-indexed

    # take only de_lds feature
    assert (features_LDS[i] == f"de_LDS{trial+1}")
    f = x[features_LDS[i]]

    trial_data = _split_into_windows(f, w_len, drop_incomplete)
    num_wind_trial = trial_data.shape[0]
    if num_wind_trial == 0:
      raise ValueError(
          f"trial {trial+1} of {file} yields no sample of w_len={w_len} windows")
    total_num_wind += num_wind_trial

    assert (trial_data.shape == (num_wind_trial, 62, w_len*num_bands))
    assert (np.amax(trial_data) <= de_max_value)
    assert (np.amin(trial_data) >= de_min_value)

    # assign to each sample the corresponding trial label
    trial_labels = np.full(num_wind_trial, labels[trial])
    assert (np.unique(trial_labels).shape == (1,))
    assert (trial_labels[0] == labels[trial])

    all_data.append(trial_data)
    all_labels.append(trial_labels)

  all_data = np.concatenate(all_data, axis=0)
  all_labels = np.concatenate(all_labels, axis=0)
  assert (all_data.shape == (total_num_wind, 62, num_bands*w_len))
  assert (all_labels.shape == (total_num_wind,))
  assert (np.amax(all_data) <= de_max_value)
  assert (np.amin(all_data) >= de_min_value)
  return all_data, all_labels


In [None]:
def get_wind_for_subject(file1, file2, file3, w_len):
  """Concatenate the multi-window samples and labels of a subject's three files.

  The shape assertions require the three files to contribute the same number
  of samples as the first one.

  Returns:
    (series, l): series has shape (3 * samples of file1, 62, w_len*num_bands),
    l has shape (3 * samples of file1,).
  """
  per_file = [get_wind_from_file(name, w_len) for name in (file1, file2, file3)]
  series = np.concatenate([samples for samples, _ in per_file], axis=0)
  l = np.concatenate([file_labels for _, file_labels in per_file], axis=0)

  expected = 3*per_file[0][0].shape[0]
  assert (series.shape == (expected, 62, w_len*num_bands))
  assert (l.shape == (expected,))
  assert (np.amax(series) <= de_max_value)
  assert (np.amin(series) >= de_min_value)
  return series, l

In [None]:
def get_more_windows_data(w_len):
  """Build multi-window samples for every subject and save them as .npy files.

  Written into NPY_PATH:
    npy_<k>.npy / npy_<k>_label.npy  - samples and labels of subject k (0-indexed)
    npy_all_subjects.npy / npy_all_subjects_label.npy - all subjects concatenated

  Args:
    w_len: number of consecutive windows concatenated into one sample.
  """
  # Create the output folder up front, so a missing folder does not abort the
  # run after the first subject has already been processed.
  os.makedirs(NPY_PATH, exist_ok=True)

  all_data, all_labels = [], []
  # `data` holds 3 consecutive files per subject
  for i in range(0, len(data), 3):
    subject_idx = i // 3
    subject_data, subject_labels = get_wind_for_subject(data[i], data[i+1], data[i+2], w_len)
    with open(NPY_PATH + "/" + "npy_" + str(subject_idx) + ".npy", 'wb') as f:
      np.save(f, subject_data)
    with open(NPY_PATH + "/" + "npy_" + str(subject_idx) + "_label.npy", 'wb') as f:
      np.save(f, subject_labels)
    all_data.append(subject_data)
    all_labels.append(subject_labels)

  all_data = np.concatenate(all_data, axis=0)
  all_labels = np.concatenate(all_labels, axis=0)
  assert (all_data.shape[0] == all_labels.shape[0])
  with open(NPY_PATH + "/" + "npy_all_subjects.npy", 'wb') as f:
    np.save(f, all_data)
  with open(NPY_PATH + "/" + "npy_all_subjects_label.npy", 'wb') as f:
    np.save(f, all_labels)


In [None]:
# build and save the samples, concatenating 5 consecutive windows per sample
get_more_windows_data(w_len=5)