In [None]:
import scipy.io
import os
import numpy as np
import pandas as pd 

# Report the versions of the core numeric libraries used by this notebook.
for label, module in (
    ("scipy version: ", scipy),
    ("numpy version: ", np),
    ("pandas version: ", pd),
):
    print(label, module.__version__)

In [396]:
# Root of the local dataset copy. Every folder string below keeps its trailing
# separator because later cells build file paths by plain string concatenation.
# NOTE(review): this is an absolute, machine-specific Windows path.
wkdirPath = 'C:\\Users\\kevin\\Downloads\\ds002723\\'

# Sub-folders of the working directory, one per processing stage.
dataRawFolder, dataPreprocessedFolder = 'ds002723\\', 'preprocessed\\'
dataSegmentedFolder, datasetFolder = 'segmented\\', 'dataset\\'

# Inside the segmented folder, inputs and targets live in separate folders.
inputFolder, outputFolder = 'X\\', 'y\\'

# Full paths used by the loading and saving cells.
_segmentedRoot = wkdirPath + dataSegmentedFolder
inputSegmentedPath = _segmentedRoot + inputFolder
outputSegmentedPath = _segmentedRoot + outputFolder
datasetPath = wkdirPath + datasetFolder

In [None]:
# Names of the .mat files (extension matched case-insensitively) in the segmented input folder.
inputMatFileList = [
    name
    for name in os.listdir(inputSegmentedPath)
    if name.lower().endswith(".mat")
]

In [None]:
# Container for the assembled arrays; later cells store them under 'x' and 'y'.
data = {}

In [None]:
# Load every segmented EEG .mat file and stack the contents into data['x'].
# os.listdir() returns names in arbitrary order, so the list is sorted to make
# the stacking order reproducible from run to run.
# NOTE(review): the labels are loaded from a separate directory listing; this
# presumably relies on both folders yielding their files in the same order —
# confirm that the X and y file names sort identically.
matFileList = sorted(file for file in os.listdir(inputSegmentedPath) if file.lower().endswith(".mat"))

epochBlocks = []
for matFile in matFileList:
    mat = scipy.io.loadmat(inputSegmentedPath + matFile)

    # Move the last axis of 'EEGData' to the front so files can be stacked along axis 0.
    dataTemp = mat['EEGData'].transpose(2,0,1)
    epochBlocks.append(dataTemp)

assert epochBlocks, 'Data is empty'
# Concatenate once instead of re-copying the growing array with np.append per file.
tempStorage = np.concatenate(epochBlocks, axis=0)
data['x'] = tempStorage

In [None]:
# Quick look at the stacked input array: shape first, then element type.
x_array = data['x']
print(x_array.shape)
print(x_array.dtype)

In [None]:
# Names of the .mat files (extension matched case-insensitively) in the segmented output folder.
outputMatFileList = [
    name
    for name in os.listdir(outputSegmentedPath)
    if name.lower().endswith(".mat")
]

In [None]:
# Load every segmented label .mat file and stack the contents into data['y'].
# os.listdir() returns names in arbitrary order, so the list is sorted to make
# the stacking order reproducible from run to run.
# NOTE(review): the inputs are loaded from a separate directory listing; this
# presumably relies on both folders yielding their files in the same order —
# confirm that the X and y file names sort identically.
matFileList = sorted(file for file in os.listdir(outputSegmentedPath) if file.lower().endswith(".mat"))

labelBlocks = []
for matFile in matFileList:
    mat = scipy.io.loadmat(outputSegmentedPath + matFile)

    # Swap the two axes of 'EEGEventType' so files can be stacked along axis 0.
    dataTemp = mat['EEGEventType'].transpose(1,0)
    labelBlocks.append(dataTemp)

assert labelBlocks, 'Data is empty'
# Concatenate once instead of re-copying the growing array with np.append per file.
tempStorage = np.concatenate(labelBlocks, axis=0)
data['y'] = tempStorage

In [None]:
# Quick look at the stacked label array: shape first, then element type.
y_array = data['y']
print(y_array.shape)
print(y_array.dtype)

In [None]:
# Persist the stacked arrays. The dataset folder is created first because
# np.save does not create missing parent directories.
os.makedirs(datasetPath, exist_ok=True)
np.save(datasetPath + 'X.npy', data['x'])
np.save(datasetPath + 'y.npy', data['y'])

# **Data Feature Extraction**

In [None]:
import mne_features as mnef 
import mne
import sklearn
from sklearn.model_selection import train_test_split

# Report the versions of the feature-extraction and modelling libraries.
for label, module in (
    ("mne_features version: ", mnef),
    ("mne version: ", mne),
    ("scikit-learn version: ", sklearn),
):
    print(label, module.__version__)


In [None]:
# The channel names are the column headers of channelData.csv; keep them as a tuple.
channels = tuple(pd.read_csv(wkdirPath + dataSegmentedFolder + 'channelData.csv').columns)

In [None]:
# Reload the arrays written by the dataset-building cells.
X, y = (np.load(datasetPath + fname) for fname in ('X.npy', 'y.npy'))

In [None]:
# Hold out 15% of the samples as a test set; the fixed random_state makes the shuffle reproducible.
# NOTE(review): the split is not stratified on y — confirm whether class balance across the two splits matters here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, shuffle=True)

In [None]:
# Show the shapes of the four arrays produced by the train/test split.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# NOTE(review): X_train.shape is unpacked into three values in a later cell, so X_test is presumably 3-D;
# pd.DataFrame likely rejects non-2-D input — confirm this line runs on a fresh kernel.
X_test_pd = pd.DataFrame(X_test, columns=channels)
# NOTE(review): this labels the columns of y_train with the EEG channel names; the label frame built later
# in this notebook uses columns=['label'] instead — verify which is intended.
y_train_pd = pd.DataFrame(y_train, columns=channels)

In [189]:
# Ask mne_features which univariate and bivariate feature functions it offers, then show both lists.
_fx = mnef.feature_extraction
function_uni_opt, function_bi_opt = _fx.get_univariate_func_names(), _fx.get_bivariate_func_names()
for _func_names in (function_uni_opt, function_bi_opt):
    print(_func_names)

['app_entropy', 'decorr_time', 'energy_freq_bands', 'higuchi_fd', 'hjorth_complexity', 'hjorth_complexity_spect', 'hjorth_mobility', 'hjorth_mobility_spect', 'hurst_exp', 'katz_fd', 'kurtosis', 'line_length', 'mean', 'pow_freq_bands', 'ptp_amp', 'quantile', 'rms', 'samp_entropy', 'skewness', 'spect_edge_freq', 'spect_entropy', 'spect_slope', 'std', 'svd_entropy', 'svd_fisher_info', 'teager_kaiser_energy', 'variance', 'wavelet_coef_energy', 'zero_crossings']
['max_cross_corr', 'nonlin_interdep', 'phase_lock_val', 'spect_corr', 'time_corr']


In [None]:
# Build a FeatureExtractor with sfreq=1000, every feature function selected, and a single job.
# NOTE(review): confirm that the keyword 'selected_funcs_params' and the value selected_funcs='all' are
# accepted by the installed mne_features FeatureExtractor; the only later reference to `fe` visible in
# this notebook is commented out.
fe = mnef.feature_extraction.FeatureExtractor(sfreq=1000, selected_funcs='all', selected_funcs_params=None, n_jobs=1)

In [None]:
# Display the shape of the training array.
X_train.shape

In [389]:
import numpy as np
# Seeded random state, plus the three dimensions of the training array.
rng = np.random.RandomState(seed=42)
(n_epochs, n_channels, n_times) = X_train.shape

# Names of the mne_features functions to compute.
selected_funcs = [
    'quantile',
    'rms',
    'std',
    'zero_crossings',
    'ptp_amp',
    'variance',
]

def extractFeatures(X_data:np.ndarray, selected_funcs:list, ch_names:tuple, sfreq=1000, n_jobs=1):
    """Compute the selected mne_features features for ``X_data``.

    Parameters
    ----------
    X_data : np.ndarray
        Array forwarded unchanged to ``mne_features`` ``extract_features``.
        Callers in this notebook pass arrays whose shape unpacks as
        (n_epochs, n_channels, n_times).
    selected_funcs : list of str
        Names of the feature functions to compute (callers pass a list).
    ch_names : tuple of str
        Channel names forwarded to ``extract_features``.
    sfreq : number, default 1000
        Sampling frequency forwarded to ``extract_features``. Previously
        hard-coded; the default keeps existing calls unchanged.
    n_jobs : int, default 1
        Number of jobs forwarded to ``extract_features``. Previously
        hard-coded; the default keeps existing calls unchanged.

    Returns
    -------
    Whatever ``extract_features`` returns with ``return_as_df=True``.
    """
    # Echo the configuration so the cell output records what was computed.
    print(selected_funcs)
    print(ch_names)
    return mnef.feature_extraction.extract_features(X_data, sfreq=sfreq, selected_funcs=selected_funcs, n_jobs=n_jobs, ch_names=ch_names, return_as_df=True)

# Feature table for the full training set, all channels.
X_train_fe_df = extractFeatures(
    X_data=X_train,
    selected_funcs=selected_funcs,
    ch_names=channels,
)

['quantile', 'rms', 'std', 'zero_crossings', 'ptp_amp', 'variance']
('FP1', 'FPz', 'FP2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FT9', 'FC5', 'FC1', 'FC2', 'FC6', 'FT10', 'T7', 'C3', 'Cz', 'C4', 'T8', 'TP9', 'CP5', 'CP1', 'CP2', 'CP6', 'TP10', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'O2')


KeyboardInterrupt: 

In [308]:
# Two-way lookup between channel names and their position in `channels`.
id_to_channels = dict(enumerate(channels))
channels_to_id = {name: position for position, name in id_to_channels.items()}

In [318]:
import numpy as np
# Seeded random state and the three dimensions of the training array.
rng = np.random.RandomState(42)
n_epochs, n_channels, n_times = X_train.shape

# Feature functions to compute and the settings shared with later cells.
selected_funcs=['quantile', 'rms', 'std', 'zero_crossings', 'ptp_amp', 'variance']
funcs_params=None
n_jobs=1
ch_names= channels

# fe = mnef.feature_extraction.FeatureExtractor(sfreq=1000, selected_funcs=selected_funcs, n_jobs=n_jobs)
# X_train_fe = fe.fit_transform(X_train)

# Features for the first 14 channels only. Later cells read this table as
# `X_train_fe_df_14`, a name the notebook never assigned, so a fresh
# "Restart & Run All" stopped with a NameError. Bind that name here and keep
# the original `X_train_fe_frontal_df` as an alias of the same object.
X_train_fe_df_14 = extractFeatures(X_train[:,:14,:], selected_funcs=selected_funcs, ch_names=ch_names[:14])
X_train_fe_frontal_df = X_train_fe_df_14

In [379]:
# Size of the full feature table, then the number of feature columns.
print(X_train_fe_df.shape)
fe_index_label = tuple(X_train_fe_df.columns)
print(len(fe_index_label))

(746, 864)
864


In [378]:
# Size of the 14-channel feature table, then the number of feature columns.
print(X_train_fe_df_14.shape)
fe_frontal_index_label = tuple(X_train_fe_df_14.columns)
print(len(fe_frontal_index_label))

(746, 84)
84


### (extra) Features -> Poly

In [360]:
# Join each two-part column label of the 14-channel table into one 'first-second' string.
keys = tuple(first + '-' + second for (first, second) in X_train_fe_df_14.columns)

In [362]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the 14-channel feature table.
ploy = PolynomialFeatures(degree=2, interaction_only=False)

# Fit and expand, then wrap the result in a DataFrame whose columns carry
# the feature names generated from `keys`.
poly_values = ploy.fit_transform(X_train_fe_df_14)
X_train_poly = pd.DataFrame(poly_values, columns=ploy.get_feature_names_out(keys))

In [363]:
# Two-way lookup between feature column labels and their position in `fe_index_label`.
id_to_label = dict(enumerate(fe_index_label))
label_to_id = {label: position for position, label in id_to_label.items()}
print(len(label_to_id))

84


In [377]:
# Size of the polynomial feature table, then the number of feature columns.
print(X_train_poly.shape)
fe_poly_index_label = tuple(X_train_poly.columns)
print(len(fe_poly_index_label))

(746, 3655)
3655


## **Feature Selection**

In [196]:
from sklearn.pipeline import Pipeline   
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression, PassiveAggressiveClassifier, Perceptron, BayesianRidge, Lasso

In [287]:
# Display the current feature-function selection.
# NOTE(review): the saved output below differs from the list assigned in the earlier cells,
# which suggests this cell was last run out of order — re-run on a fresh kernel.
selected_funcs

['spect_slope']

## Experiment Notes

'quantile', 'rms', 'std', 'zero_crossings', 'ptp_amp', 'teager_kaiser_energy', 'wavelet_coef_energy', 'variance', 'hjorth_mobility_spect'

Memory Alloc Error: 'svd_fisher_info', 'svd_entropy'
Index Error: 'spect_edge_freq'

- wavelet_coef_energy (160 of n_channels * 6)
12	FP2_0	0.000772
55	FC5_1	0.000379
0	FP1_0	0.000314
92	C3_2	0.000124
56	FC5_2	0.000105
28	F3_4	0.000102
174	P8_0	0.000096
163	Pz_1	0.000092
121	CP5_1	0.000089
19	F7_1	0.000084

- variance
0	FP1	0.000186
1	FPz	0.000140

- ptp_amp
1	FPz	0.000879
0	FP1	0.000366
8	FT9	0.000062
- quantile
0	FP1	0.027407
- std
0	FP1	0.008499
1	FPz	0.006849
- rms
1	FPz	0.009563
0	FP1	0.00628

- teager_kaiser_energy (93 of n_channels * 14)
42	F7_0_mean	0.001042
9	FP1_4_std	0.000675
367	P3_1_std	0.000628
177	FC6_4_std	0.000591
353	P7_1_std	0.000580
70	Fz_0_mean	0.000564
170	FC6_1_mean	0.000533
227	Cz_1_std	0.000458
336	TP10_0_mean	0.000454
199	T7_1_std	0.000399
422	O1_1_mean	0.000363
189	FT10_3_std	0.000314
339	TP10_1_std	0.000288
327	CP6_2_std	0.000275
269	TP9_1_std	0.000275
259	T8_3_std	0.000260
351	P7_0_std	0.000258
197	T7_0_std	0.000226
73	Fz_1_std	0.000224
77	Fz_3_std	0.00020


- zero_crossings
9	FC5	0.000791
5	Fz	0.000673
28	P4	0.000409
2	FP2	0.000351
6	F4	0.000328
1	FPz	0.000254
29	P8	0.000240
12	FC6	0.000227
16	Cz	0.000201
0	FP1	0.000111
7	F8	0.000103
18	T8	0.000077
13	FT10	0.000038
11	FC2	0.000027
14	T7	0.000025
20	CP5	0.000021


- hjorth_complexity_spect
15	C3	3.968751e-08
4	F3	2.224511e-08
27	Pz	1.177913e-08
30	O1	9.042004e-09
20	CP5	8.961986e-09
26	P3	7.410903e-09
21	CP1	5.200852e-09
25	P7	4.460091e-09
14	T7	2.510036e-09
5	Fz	1.619238e-09
31	O2	1.173183e-09
10	FC1	1.139675e-09
29	P8	8.968749e-10
16	Cz	8.757678e-10
0	FP1	8.630175e-10
28	P4	7.337888e-10
22	CP2	7.156483e-10
19	TP9	5.454429e-10
12	FC6	3.172492e-10
2	FP2	2.413746e-10
7	F8	2.129390e-10
1	FPz	1.723450e-10
18	T8	1.599379e-10
24	TP10	1.052916e-10
6	F4	9.623960e-11
23	CP6	8.376070e-11
11	FC2	7.995312e-11
9	FC5	5.912040e-11
17	C4	4.242408e-11
13	FT10	2.149638e-11
8	FT9	1.418549e-11
3	F7	5.620082e-12

- hjorth_mobility_spect
15	C3	2.228330e-04
4	F3	1.145059e-04
20	CP5	7.796345e-05
30	O1	5.695755e-05
27	Pz	5.550135e-05
9	FC5	2.906413e-05
14	T7	2.347822e-05
5	Fz	1.552919e-05
10	FC1	1.094639e-05
31	O2	7.006432e-06
0	FP1	5.661286e-06
29	P8	5.292250e-06
22	CP2	5.092633e-06
16	Cz	4.707477e-06
28	P4	4.146197e-06
12	FC6	3.509663e-06
1	FPz	3.065779e-06
19	TP9	2.943010e-06
24	TP10	2.640253e-06
2	FP2	1.993789e-06
7	F8	1.704866e-06
3	F7	1.626791e-06
18	T8	1.484777e-06
6	F4	1.427815e-06
8	FT9	4.988272e-07
23	CP6	4.035867e-07
13	FT10	1.591099e-07
11	FC2	1.218014e-07
17	C4	2.629278e-08

- kurtosis Nil
- hjorth_complexity Nil
- katz_fd Nil
- skewness Nil
- spect_entropy Nil
- spect_slope Nil
- pow_freq_bands Nil
- line_length Nil

In [365]:
# Grid-search the Lasso penalty (alpha) on the polynomial features with 5-fold CV,
# scored by negative mean squared error.
alpha_grid = {'model__alpha': np.arange(0.1, 10, 0.1)}
pipeline = Pipeline([('model', Lasso())])
search = GridSearchCV(
    pipeline,
    alpha_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=1,
)
search.fit(X_train_poly, y_train)

# The absolute coefficient of the best model is used as the importance of each feature.
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)

# One row per feature, plus a copy ordered from most to least important.
feature_score_basic = pd.DataFrame({'feature': list(X_train_poly.columns), 'importance': importance})
feature_score_basic_sorted = feature_score_basic.sort_values(by='importance', ascending=False)

Fitting 5 folds for each of 99 candidates, totalling 495 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [374]:
def splitFreqBandFeatureScore(feature_score_basic, n_band=6):
    """Split a feature-score table into ``n_band`` interleaved groups.

    Group ``k`` holds rows ``k, k + n_band, k + 2 * n_band, ...`` of
    ``feature_score_basic``, ordered by the 'importance' column from largest
    to smallest. Returns a list with one DataFrame per group.
    """
    return [
        feature_score_basic[offset::n_band].sort_values(by='importance', ascending=False)
        for offset in range(n_band)
    ]

In [385]:
def getFeaturePerformance(X_data:pd.DataFrame, y, alphas=None, cv=5):
    """Score each column of ``X_data`` by the absolute Lasso coefficient it receives.

    A Lasso model is tuned with a cross-validated grid search over its
    ``alpha`` penalty (scored by negative mean squared error); the absolute
    coefficients of the best model are reported as feature importances.

    Parameters
    ----------
    X_data : pd.DataFrame
        Feature table; its column labels become the 'feature' column.
    y : array-like
        Target values passed to ``GridSearchCV.fit``.
    alphas : array-like, optional
        Candidate ``alpha`` values. Defaults to ``np.arange(0.1, 10, 0.1)``,
        the grid that was previously hard-coded.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The score table in column order, and the same table sorted by
        'importance' from largest to smallest.
    """
    if alphas is None:
        alphas = np.arange(0.1,10,0.1)

    pipeline = Pipeline([
                         ('model',Lasso())
    ])
    # verbose=0 keeps the search silent; the former value of -1 is not a valid
    # verbosity level for recent scikit-learn parameter validation.
    search = GridSearchCV(pipeline,
                          {'model__alpha':alphas},
                          cv = cv, scoring="neg_mean_squared_error",verbose=0
                          )
    search.fit(X_data ,y)
    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)

    feature_score_basic = pd.DataFrame({'feature':list(X_data.keys()),'importance':importance})
    feature_score_basic_sorted = feature_score_basic.sort_values(by='importance',ascending=False)

    return feature_score_basic, feature_score_basic_sorted

In [386]:
# Lasso-based importance scores for the full feature table (unsorted and sorted).
X_train_fe_df_score, X_train_fe_df_score_sorted = getFeaturePerformance(
    X_data=X_train_fe_df,
    y=y_train,
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [337]:
# feature_score_basic = pd.DataFrame({'feature':list(X_data_to_experiment.keys()),'importance':importance})
# feature_score_basic_sorted = feature_score_basic.sort_values(by='importance',ascending=False)

In [372]:
# Show the first 50 rows of the importance-sorted score table.
feature_score_basic_sorted.head(50)

Unnamed: 0,feature,importance
463,quantile-F3 zero_crossings-FC1,0.00089
925,quantile-FC1 zero_crossings-FT10,0.000446
543,quantile-Fz zero_crossings-FC2,0.00035
3379,ptp_amp-Fz^2,0.000333
134,quantile-FP1 zero_crossings-F8,0.000325
700,quantile-F8 zero_crossings-FT10,0.000322
222,quantile-FPz zero_crossings-FC6,0.000321
692,quantile-F8 zero_crossings-Fz,0.000309
3623,variance-F4 variance-FC1,0.000288
373,quantile-F7 zero_crossings-FP1,0.000287


In [390]:
# Feature table for the complete data set (before the train/test split), all channels.
X_fe_df = extractFeatures(
    X_data=X,
    selected_funcs=selected_funcs,
    ch_names=ch_names,
)

['quantile', 'rms', 'std', 'zero_crossings', 'ptp_amp', 'variance']
('FP1', 'FPz', 'FP2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FT9', 'FC5', 'FC1', 'FC2', 'FC6', 'FT10', 'T7', 'C3', 'Cz', 'C4', 'T8', 'TP9', 'CP5', 'CP1', 'CP2', 'CP6', 'TP10', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'O2')


In [397]:
# Write the full feature table to X_fe.csv in the dataset folder.
X_fe_df.to_csv(datasetPath+'X_fe.csv')

# **Format Y data**
Original labels are 1-9.
Modified (two variants):
- 0-8 (y_9)
- 0-3 (y_4, classes merged into four groups)

In [430]:
# Wrap the labels in a one-column frame and shift every label down by one.
y_df = pd.DataFrame(y, columns=['label'])
y_df['label'] = y_df['label'].map(lambda label: label - 1)
print(y_df['label'].value_counts())

5    176
1    168
8    142
6    104
4     97
3     90
0     53
2     48
Name: label, dtype: int64


In [427]:
# Write the current label frame to y_9.csv in the dataset folder.
y_df.to_csv(datasetPath+'y_9.csv')

In [431]:
# Collapse the shifted labels into four groups:
#   0, 1, 3, 4 -> 0;   2, 5 -> 1;   6, 7 -> 2;   anything else -> 3
# NOTE: this overwrites y_df['label'] in place, so running the cell a second
# time would remap the already-collapsed labels.
label_groups = {0: 0, 1: 0, 3: 0, 4: 0, 2: 1, 5: 1, 6: 2, 7: 2}
y_df['label'] = y_df['label'].apply(lambda x: label_groups.get(x, 3))
print(y_df['label'].value_counts())


0    408
1    224
3    142
2    104
Name: label, dtype: int64


In [432]:
# Write the current label frame to y_4.csv in the dataset folder.
y_df.to_csv(datasetPath+'y_4.csv')

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8155937/#sec2-sensors-21-03414