## EEG data grouping

In [117]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Directory holding the raw EEG CSV files
eeg_folder_path = '/Users/sh_oh/Library/CloudStorage/Dropbox/Data/2023-1/BDP/ECSMP_Dataset/EEG_train_csv'

# os.listdir() returns entries in arbitrary order; sorting makes the group
# order (the insertion order of grouped_files) identical on every run.
file_list = sorted(os.listdir(eeg_folder_path))

# Group the files by the first 11 characters of their name
# (keys look like "001_video_1").
grouped_files = {}
for file_name in file_list:
    # Skip hidden/system entries (.DS_Store, ._* resource forks, ...):
    # they are not EEG recordings and would otherwise form bogus groups.
    if file_name.startswith('.'):
        continue
    group_key = file_name[:11]
    grouped_files.setdefault(group_key, []).append(file_name)

In [118]:
# One averaged signal per group, in the iteration order of grouped_files
new_data = []

# Process each group
for key, file_names in grouped_files.items():
    # Read every file of the group and keep only its first column
    group_data = []
    for file_name in file_names:
        file_path = os.path.join(eeg_folder_path, file_name)
        df = pd.read_csv(file_path, encoding='latin1')  # non-default encoding
        channel_data = df.iloc[:, 0].values  # first column only
        group_data.append(channel_data)

    # The element-wise mean below is only defined when all signals of the
    # group have the same length; fail with a message that names the group
    # instead of an opaque NumPy shape error.
    lengths = {len(signal) for signal in group_data}
    if len(lengths) != 1:
        raise ValueError(
            f"Group {key!r}: files have different lengths {sorted(lengths)}; "
            "cannot average them element-wise"
        )

    # Average the group's signals sample by sample
    new_group_data = np.mean(group_data, axis=0)
    new_data.append(new_group_data)

# Destination folder for the averaged signals
output_folder_path = '/Users/sh_oh/Library/CloudStorage/Dropbox/Data/2023-1/BDP/ECSMP_Dataset/EEG_train2_csv'

# to_csv() does not create missing directories, so make sure the folder exists
os.makedirs(output_folder_path, exist_ok=True)

# Write each averaged signal to group_<n>.csv (n starts at 1)
for i, group_data in enumerate(new_data):
    file_name = f"group_{i+1}.csv"
    file_path = os.path.join(output_folder_path, file_name)
    pd.DataFrame(group_data).to_csv(file_path, index=False)

In [119]:
# Report how many groups were formed
group_count = len(grouped_files)
print(f"그룹 개수: {group_count}")

# List the group keys in sorted order, one per line
for group_name in sorted(grouped_files):
    print(group_name)

그룹 개수: 234
001_video_1
001_video_2
001_video_3
001_video_4
001_video_5
001_video_6
005_video_1
005_video_2
005_video_3
005_video_4
005_video_5
005_video_6
006_video_1
006_video_2
006_video_3
006_video_4
006_video_5
006_video_6
009_video_1
009_video_2
009_video_3
009_video_4
009_video_5
009_video_6
010_video_1
010_video_2
010_video_3
010_video_4
010_video_5
010_video_6
011_video_1
011_video_2
011_video_3
011_video_4
011_video_5
011_video_6
012_video_1
012_video_2
012_video_3
012_video_4
012_video_5
012_video_6
013_video_1
013_video_2
013_video_3
013_video_4
013_video_5
013_video_6
015_video_1
015_video_2
015_video_3
015_video_4
015_video_5
015_video_6
021_video_1
021_video_2
021_video_3
021_video_4
021_video_5
021_video_6
022_video_1
022_video_2
022_video_3
022_video_4
022_video_5
022_video_6
023_video_1
023_video_2
023_video_3
023_video_4
023_video_5
023_video_6
024_video_1
024_video_2
024_video_3
024_video_4
024_video_5
024_video_6
026_video_1
026_video_2
026_video_3
026_video_4
026_v

# EEG preprocessing

In [157]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Directory holding the per-group averaged signals
eeg_folder_path = '/Users/sh_oh/Library/CloudStorage/Dropbox/Data/2023-1/BDP/ECSMP_Dataset/EEG_train2_csv'

# Sorted so that the row order of the resulting matrix is reproducible
# (os.listdir() order is arbitrary). Hidden entries such as .DS_Store are
# skipped because they are not CSV data and would break pd.read_csv.
file_list = sorted(
    name for name in os.listdir(eeg_folder_path) if not name.startswith('.')
)

# All signals, one 1-D array per file
data = []

# Length of the longest signal; every signal is zero-padded up to it
max_length = 0

# Load the first column of every file and track the longest signal
for file_name in file_list:
    file_path = os.path.join(eeg_folder_path, file_name)
    df = pd.read_csv(file_path)
    channel_data = df.iloc[:, 0].values  # first column only
    data.append(channel_data)
    max_length = max(max_length, len(channel_data))

if not data:
    raise FileNotFoundError(f"No input files found in {eeg_folder_path}")

# Zero-padding at the end of each signal.
# pad_sequences defaults to dtype='int32', which would silently truncate the
# floating-point signal values to integers, so a float dtype is requested.
padded_data = pad_sequences(data, maxlen=max_length, padding='post', dtype='float64')

# Min-max normalisation.
# NOTE(review): MinMaxScaler rescales each column (time step) across files,
# not each recording, and the zero padding is rescaled together with real
# samples -- confirm this is the intended normalisation.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(padded_data)

In [158]:
# Confirm the container type of the normalised data
print("Data Type:", type(normalized_data))

# Preview the first five rows
print("Sample Data:", normalized_data[:5], sep="\n")

Data Type: <class 'numpy.ndarray'>
Sample Data:
[[0.21246246 0.214514   0.21329929 ... 1.         1.         1.        ]
 [0.21638002 0.21883954 0.21866678 ... 1.         1.         1.        ]
 [0.22294567 0.22428091 0.22318315 ... 1.         1.         1.        ]
 [0.35944581 0.35381309 0.35068501 ... 1.         1.         1.        ]
 [0.8462872  0.85426769 0.8569997  ... 1.         1.         1.        ]]


# Denoising AutoEncoder(DAE) model

In [159]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.layers import Conv1D, Conv1DTranspose, GlobalAveragePooling1D
from tensorflow.keras.optimizers.legacy import Adam

# Add Gaussian noise
def add_gaussian_noise(data, noise_factor, rng=None):
    """Return ``data`` with zero-mean Gaussian noise added element-wise.

    Parameters
    ----------
    data : array-like
        Values to corrupt. The input itself is not modified.
    noise_factor : float
        Standard deviation of the noise (must be non-negative).
    rng : numpy.random.Generator, optional
        Random generator to draw the noise from, e.g.
        ``np.random.default_rng(seed)`` for reproducible noise. When omitted,
        NumPy's global random state is used.

    Returns
    -------
    The element-wise sum ``data + noise``; the noise has the shape of ``data``.
    """
    sampler = np.random if rng is None else rng
    # np.shape() also works for inputs without a .shape attribute (e.g. lists)
    noise = sampler.normal(loc=0.0, scale=noise_factor, size=np.shape(data))
    noisy_data = data + noise
    return noisy_data

In [160]:
# Build the denoising autoencoder
def create_denoising_autoencoder(input_shape, encoding_dim):
    """Build a 1-D convolutional denoising autoencoder and its encoder.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample without the batch axis. Only the first entry is
        used as the sequence length, i.e. samples are expected to be 1-D.
    encoding_dim : int
        Number of features produced by the encoder.

    Returns
    -------
    autoencoder : Model
        Maps a sample to its reconstruction; compiled with Adam
        (learning rate 0.001) and mean-squared-error loss.
    encoder : Model
        Shares the encoder layers with ``autoencoder`` and outputs a tensor
        of shape ``(batch, encoding_dim)``.
    """
    seq_len = input_shape[0]
    input_layer = Input(shape=input_shape)

    # Encoder: add a channel axis, then stack length-preserving convolutions
    encoded = Reshape((seq_len, 1))(input_layer)
    encoded = Conv1D(128, kernel_size=3, activation='relu', padding='same')(encoded)
    encoded = Conv1D(64, kernel_size=3, activation='relu', padding='same')(encoded)
    encoded = Conv1D(32, kernel_size=3, activation='relu', padding='same')(encoded)
    encoded = Conv1D(encoding_dim, kernel_size=3, activation='relu', padding='same')(encoded)
    # Collapse the time axis so the code has exactly `encoding_dim` values.
    # Flattening here would give seq_len * encoding_dim values, which is
    # larger than the input and cannot be reshaped to
    # (1, seq_len // encoding_dim) as the decoder previously attempted.
    encoded = GlobalAveragePooling1D()(encoded)

    # Decoder: expand the code back to the sequence length, then mirror the
    # encoder with transposed convolutions
    decoded = Dense(seq_len, activation='relu')(encoded)
    decoded = Reshape((seq_len, 1))(decoded)
    decoded = Conv1DTranspose(32, kernel_size=3, activation='relu', padding='same')(decoded)
    decoded = Conv1DTranspose(64, kernel_size=3, activation='relu', padding='same')(decoded)
    decoded = Conv1DTranspose(128, kernel_size=3, activation='relu', padding='same')(decoded)
    # Sigmoid keeps the reconstruction inside [0, 1]
    decoded = Conv1DTranspose(1, kernel_size=3, activation='sigmoid', padding='same')(decoded)
    decoded = Reshape(input_shape)(decoded)

    # Autoencoder
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # Encoder (reuses the layers above, so it is trained with the autoencoder)
    encoder = Model(inputs=input_layer, outputs=encoded)

    return autoencoder, encoder

In [162]:
# Seed the Python, NumPy and TensorFlow random generators so that the noise
# and the weight initialisation are the same on every run
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Shape of one input sample (batch axis dropped)
input_shape = normalized_data.shape[1:]
encoding_dim = 64

# Corrupt the inputs with Gaussian noise
noise_factor = 0.5
noisy_data = add_gaussian_noise(normalized_data, noise_factor)

# Build the denoising autoencoder
autoencoder, encoder = create_denoising_autoencoder(input_shape, encoding_dim)

# Train to reconstruct the clean data from the noisy data; the History object
# is kept so the loss curve can be inspected and the cell shows no object repr
history = autoencoder.fit(noisy_data, normalized_data, epochs=30, batch_size=16, shuffle=True, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2edc195a0>

# feature extraction

In [163]:
# Feature extraction: run the encoder on the clean (noise-free) normalised data
eeg_train_features = encoder.predict(normalized_data)



In [164]:
# Show the dimensions of the encoder output
print(eeg_train_features.shape)

(234, 64)


In [165]:
# Display the extracted feature array as the cell output
eeg_train_features

array([[11.6729355, 12.994658 ,  0.       , ...,  0.       ,  0.       ,
         9.369478 ],
       [11.948198 , 12.608524 ,  0.       , ...,  0.       ,  0.       ,
        10.511625 ],
       [11.859114 , 12.108788 ,  0.       , ...,  0.       ,  0.       ,
        10.715914 ],
       ...,
       [10.999057 ,  7.7388725,  0.       , ...,  0.       ,  0.       ,
        11.660636 ],
       [ 9.300373 ,  1.5316507,  0.       , ...,  0.       ,  3.590412 ,
        10.831481 ],
       [11.186989 , 14.51754  ,  0.       , ...,  0.       ,  0.       ,
         4.5714107]], dtype=float32)

In [166]:
# Convert the features to a table and save them as CSV (no index column)
output_file = "eeg_train_features.csv"
df_encoded = pd.DataFrame(eeg_train_features)
df_encoded.to_csv(output_file, index=False)