# Load Library

In [1]:
#Library Data

import pandas as pd
import numpy as np

# Library for filtering
from scipy.signal import butter, filtfilt, lfilter


StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 3, Finished, Available, Finished)

# Load Data

In [2]:
# Load local_data and eeg_data from lakehouse

local_data = pd.read_csv('/lakehouse/default/Files/buangan/local_data.csv')
eeg_data = pd.read_csv('/lakehouse/default/Files/buangan/eeg_data.csv')

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 4, Finished, Available, Finished)

# Data Preprocessing

## Clean From NaN and -Inf Value

EEG data is brain wave data consisting of a series of numbers forming waves, therefore we must ensure that the EEG data we have is free from Null values ​​or -inf infinite values.

In [3]:
# Change -inf value to 0

eeg_data.replace([float('inf'), -float('inf')], 0, inplace=True)

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 5, Finished, Available, Finished)

In [4]:
# Brain wave contains Delta, Theta, Alpha, Beta, and Gamma
eeg_channels = ['Delta_TP9', 'Theta_TP9', 'Alpha_TP9', 'Beta_TP9', 'Gamma_TP9',
                'Delta_AF7', 'Theta_AF7', 'Alpha_AF7', 'Beta_AF7', 'Gamma_AF7',
                'Delta_AF8', 'Theta_AF8', 'Alpha_AF8', 'Beta_AF8', 'Gamma_AF8',
                'Delta_TP10', 'Theta_TP10', 'Alpha_TP10', 'Beta_TP10', 'Gamma_TP10']

# Change NaN value with median data
eeg_data[eeg_channels] = eeg_data[eeg_channels].apply(lambda col: col.fillna(col.median()), axis=0)

# Check fo NaN value
print(eeg_data[eeg_channels].isna().sum())

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 6, Finished, Available, Finished)

Delta_TP9     0
Theta_TP9     0
Alpha_TP9     0
Beta_TP9      0
Gamma_TP9     0
Delta_AF7     0
Theta_AF7     0
Alpha_AF7     0
Beta_AF7      0
Gamma_AF7     0
Delta_AF8     0
Theta_AF8     0
Alpha_AF8     0
Beta_AF8      0
Gamma_AF8     0
Delta_TP10    0
Theta_TP10    0
Alpha_TP10    0
Beta_TP10     0
Gamma_TP10    0
dtype: int64


From the results of this check, we can see that there are no more nan values ​​in the eeg dataset.

## Segmentation Data

Align the EEG data with the decision-making events. Segment the EEG data around each keypress event, creating epochs that start 2 seconds before and end 2 seconds after the keypress.

Before we start segmentation, we need to check Timestamp sequentially

In [5]:
# Convertion Timestamp format to datetim
timestamps = pd.to_datetime(eeg_data['TimeStamp'])

# Check Squentiall time
is_sorted = timestamps.is_monotonic_increasing

if not is_sorted:
    print("Timestamp tidak berurutan. Mengurutkan ulang data berdasarkan timestamp.")
    eeg_data = eeg_data.sort_values('TimeStamp')
else:
    print("Timestamp sudah berurutan.")

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 7, Finished, Available, Finished)

Timestamp tidak berurutan. Mengurutkan ulang data berdasarkan timestamp.


Because keprest event found in local dataset, we need to combine Timestamp from eeg data and keypreess event from local dataset

In [6]:
# Make sure the 'Keypress_Time' column is a numeric type
# If 'Keypress_Time' is a time, we need to convert it to numeric format (seconds)
local_data['Keypress_Time'] = pd.to_numeric(local_data['Keypress_Time'], errors='coerce')

# Take Keypress event and timestamp from dataset
event_times = local_data['Keypress_Time'].values
timestamps = eeg_data['TimeStamp'].values

# Function for YANG segmentation based on event time (Keypress)
def segment_eeg(data, timestamps, event_times, pre_event=2, post_event=2, fs=250):
    epochs = []
    for event_time in event_times:
        # Check Numeric type
        if pd.isna(event_time):
            continue
        
        # Find the timestamp index that is close to event_time
        start_time = event_time - pre_event
        end_time = event_time + post_event
        
        # Find the index that matches the start and end times.
        start_idx = np.argmin(np.abs(timestamps - start_time))
        end_idx = np.argmin(np.abs(timestamps - end_time))
        
        # Take segmentation data for each EEG channel
        epoch = data[start_idx:end_idx]
        epochs.append(epoch)
    
    return np.array(epochs)

# list EEG channel forsegmentation
eeg_channels = ['Delta_TP9', 'Theta_TP9', 'Alpha_TP9', 'Beta_TP9', 'Gamma_TP9', 'RAW_TP9', 'HSI_TP9',
                'Delta_AF7', 'Theta_AF7', 'Alpha_AF7', 'Beta_AF7', 'Gamma_AF7', 'RAW_AF7', 'HSI_AF7',
                'Delta_AF8', 'Theta_AF8', 'Alpha_AF8', 'Beta_AF8', 'Gamma_AF8', 'RAW_AF8', 'HSI_AF8',
                'Delta_TP10', 'Theta_TP10', 'Alpha_TP10', 'Beta_TP10', 'Gamma_TP10', 'RAW_TP10', 'HSI_TP10']

# Segmentation with segmentation function
epochs_dict = {}
for channel in eeg_channels:
    epochs_dict[channel] = segment_eeg(eeg_data[channel].values, timestamps, event_times)

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 8, Finished, Available, Finished)

In [7]:
# Display eeg data

eeg_data

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 9, Finished, Available, Finished)

Unnamed: 0,Delta_TP9,Delta_AF7,Delta_AF8,Delta_TP10,Theta_TP9,Theta_AF7,Theta_AF8,Theta_TP10,Alpha_TP9,Alpha_AF7,...,Gyro_X,Gyro_Y,Gyro_Z,HeadBandOn,HSI_TP9,HSI_AF7,HSI_AF8,HSI_TP10,Battery,TimeStamp
10489644,0.0,0.293835,0.712108,-3.677497,0.0,0.141808,0.594395,-4.046468,0.0,0.534167,...,-13.585358,27.133331,2.310333,1.0,4.0,1.0,1.0,4.0,100.0,2023-04-23 12:21:34.094
10489645,0.0,0.293835,0.712108,-3.677497,0.0,0.141808,0.594395,-4.046468,0.0,0.534167,...,-13.585358,27.133331,2.310333,1.0,4.0,1.0,1.0,4.0,100.0,2023-04-23 12:21:34.096
10489646,0.0,0.293835,0.712108,-3.677497,0.0,0.141808,0.594395,-4.046468,0.0,0.534167,...,-13.585358,27.133331,2.310333,1.0,4.0,1.0,1.0,4.0,100.0,2023-04-23 12:21:34.097
10489647,0.0,0.293835,0.712108,-3.677497,0.0,0.141808,0.594395,-4.046468,0.0,0.534167,...,-13.585358,27.133331,2.310333,1.0,4.0,1.0,1.0,4.0,100.0,2023-04-23 12:21:34.098
10489648,0.0,0.293835,0.712108,-3.677497,0.0,0.141808,0.594395,-4.046468,0.0,0.534167,...,-13.585358,27.133331,2.310333,1.0,4.0,1.0,1.0,4.0,100.0,2023-04-23 12:21:34.099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7247907,0.0,0.756608,0.828615,0.000000,0.0,0.217754,0.622286,0.000000,0.0,0.399838,...,21.458435,4.867401,32.920380,1.0,2.0,1.0,1.0,2.0,100.0,2023-05-22 20:20:51.379
7247909,0.0,0.756608,0.828615,0.000000,0.0,0.217754,0.622286,0.000000,0.0,0.399838,...,21.458435,4.867401,32.920380,1.0,2.0,1.0,1.0,2.0,100.0,2023-05-22 20:20:51.379
7247912,0.0,0.766665,0.840721,0.000000,0.0,0.178980,0.618449,0.000000,0.0,0.346216,...,21.458435,4.867401,32.920380,1.0,2.0,1.0,1.0,2.0,100.0,2023-05-22 20:20:51.380
7247910,0.0,0.756608,0.828615,0.000000,0.0,0.217754,0.622286,0.000000,0.0,0.399838,...,21.458435,4.867401,32.920380,1.0,2.0,1.0,1.0,2.0,100.0,2023-05-22 20:20:51.380


## Filtering data

Apply a bandpass filter to the EEG data to retain frequencies between 1 Hz and
40 Hz. This will remove low-frequency drift and high-frequency noise,
preserving the most relevant EEG signals for decision-making.

In [8]:
# Check All Data before filtering
def check_data(data, column):
    print(f"Checking column: {column}")
    print(f"First few values:\n{data[column].head()}")
    print(f"Data types: {data[column].dtype}")
    print(f"Any NaN values: {data[column].isnull().sum()}")
    print(f"Non-numeric values: {data[column].apply(lambda x: isinstance(x, (int, float))).sum()}")

# Bandpass Filter (1Hz to 40Hz) for EEG channel
def bandpass_filter(data, lowcut=1.0, highcut=49.0, fs=250.0, order=2):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='bandpass')
    y = filtfilt(b, a, data)

    return y

# Chekch data
for column in ['Delta_TP9', 'Theta_TP9', 'Alpha_TP9', 'Beta_TP9', 'Gamma_TP9',
                'Delta_AF7', 'Theta_AF7', 'Alpha_AF7', 'Beta_AF7', 'Gamma_AF7',
                'Delta_AF8', 'Theta_AF8', 'Alpha_AF8', 'Beta_AF8', 'Gamma_AF8',
                'Delta_TP10', 'Theta_TP10', 'Alpha_TP10', 'Beta_TP10', 'Gamma_TP10']:
    check_data(eeg_data, column)

    # Apply filtering
    eeg_data[column] = bandpass_filter(eeg_data[column].values)

    # Cek data after filtering
    print(f"Processed column: {column}, Any NaN values after filtering: {pd.isna(eeg_data[column]).sum()}")


StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 10, Finished, Available, Finished)

Checking column: Delta_TP9
First few values:
10489644    0.0
10489645    0.0
10489646    0.0
10489647    0.0
10489648    0.0
Name: Delta_TP9, dtype: float64
Data types: float64
Any NaN values: 0
Non-numeric values: 11700294
Processed column: Delta_TP9, Any NaN values after filtering: 0
Checking column: Theta_TP9
First few values:
10489644    0.0
10489645    0.0
10489646    0.0
10489647    0.0
10489648    0.0
Name: Theta_TP9, dtype: float64
Data types: float64
Any NaN values: 0
Non-numeric values: 11700294
Processed column: Theta_TP9, Any NaN values after filtering: 0
Checking column: Alpha_TP9
First few values:
10489644    0.0
10489645    0.0
10489646    0.0
10489647    0.0
10489648    0.0
Name: Alpha_TP9, dtype: float64
Data types: float64
Any NaN values: 0
Non-numeric values: 11700294
Processed column: Alpha_TP9, Any NaN values after filtering: 0
Checking column: Beta_TP9
First few values:
10489644    0.0
10489645    0.0
10489646    0.0
10489647    0.0
10489648    0.0
Name: Beta_TP9,

From the information we got from the filtering results, there was no data that was affected by errors or changed to NaN values ​​after filtering was carried out.

In [9]:
# Display Data

display(eeg_data)

StatementMeta(, fa718ec3-e1c3-406d-831d-e93f6c921253, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 32b5aded-7507-40d2-9482-a72dc535dad6)

In [13]:
# Save data

eeg_data.to_csv('/lakehouse/default/Files/buangan/eeg_before_final.csv', index=False)

StatementMeta(, a5eb2a8a-ff89-46d5-af3c-b8b734da1c4f, 15, Finished, Available, Finished)