# 0. understanding EEG data sets

data from [UCI repository](https://archive.ics.uci.edu/ml/datasets/EEG+Database)

EEG 데이터셋에 대해 이해하기 위한 노트북

이 데이터셋은 알코올 중독자와 대조군에 대해 모집한 EEG 데이터임.

In [1]:
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [2]:
data_dir = '../../data/smni_eeg_data'
store_dir = '../../data/eeg-project/0.merge'

## 데이터 불러오기

In [3]:
datalist = [x for x in os.listdir(data_dir) if (not x.endswith('tar.gz')) & (not x.startswith('.'))]
datalist

['a_1_co2a0000364',
 'c_n_co2c0000337',
 'c_m_co2c0000337',
 'a_m_co2a0000364',
 'c_1_co2c0000337',
 'a_n_co2a0000364']

In [4]:
datapath_list = []

for dirs in datalist:
    tmp_path = os.path.join(data_dir, dirs)
    datapath_list += [os.path.join(tmp_path, x) for x in os.listdir(tmp_path)]

print(len(datapath_list))

60


In [5]:
datapath_list

['../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.012',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.024',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.022',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.014',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.000',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.020',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.018',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.028',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.010',
 '../../data/smni_eeg_data/a_1_co2a0000364/co2a0000364.rd.002',
 '../../data/smni_eeg_data/c_n_co2c0000337/co2c0000337.rd.029',
 '../../data/smni_eeg_data/c_n_co2c0000337/co2c0000337.rd.073',
 '../../data/smni_eeg_data/c_n_co2c0000337/co2c0000337.rd.087',
 '../../data/smni_eeg_data/c_n_co2c0000337/co2c0000337.rd.089',
 '../../data/smni_eeg_data/c_n_co2c0000337/co2c0000337.rd.021',
 '../../data/smni_eeg_data/c_n_co2c00003

In [6]:
data_list = []

for path in datapath_list:
    with open(path, 'r') as f:
        data_list.append(f.readlines())

len(data_list)

60

## 데이터 내용 확인하기

```
Attribute Information:

Each trial is stored in its own file and will appear in the following format. 

# co2a0000364.rd 
# 120 trials, 64 chans, 416 samples 368 post_stim samples 
# 3.906000 msecs uV 
# S1 obj , trial 0 
# FP1 chan 0 
0 FP1 0 -8.921 
0 FP1 1 -8.433 
0 FP1 2 -2.574 
0 FP1 3 5.239 
0 FP1 4 11.587 
0 FP1 5 14.028 
... 

The first four lines are header information. Line 1 contains the subject identifier and indicates if the subject was an alcholic (a) or control (c) subject by the fourth letter. Line 4 identifies the matching conditions: a single object shown (S1 obj), object 2 shown in a matching condition (S2 match), and object 2 shown in a non matching condition (S2 nomatch). 

Line 5 identifies the start of the data from sensor FP1. The four columns of data are: the trial number, sensor position, sample number (0-255), and sensor value (in micro volts). 
```

위를 통해, alcoholism 여부를 분류하는 시계열 모델을 만들 수 있으며, 이 데이터는 다변량 시계열 데이터임을 알 수 있음

In [7]:
header = ['trial_number', 'sensor_position', 'sample_number', 'sensor_value']

파일 선택

In [8]:
first_file = data_list[0]
first_file

['# co2a0000364.rd\n',
 '# 120 trials, 64 chans, 416 samples 368 post_stim samples\n',
 '# 3.906000 msecs uV\n',
 '# S1 obj , trial 12\n',
 '# FP1 chan 0\n',
 '12 FP1 0 3.052\n',
 '12 FP1 1 4.517\n',
 '12 FP1 2 4.028\n',
 '12 FP1 3 4.028\n',
 '12 FP1 4 4.517\n',
 '12 FP1 5 5.493\n',
 '12 FP1 6 5.493\n',
 '12 FP1 7 4.517\n',
 '12 FP1 8 3.052\n',
 '12 FP1 9 2.075\n',
 '12 FP1 10 3.540\n',
 '12 FP1 11 6.470\n',
 '12 FP1 12 8.423\n',
 '12 FP1 13 8.423\n',
 '12 FP1 14 5.981\n',
 '12 FP1 15 2.563\n',
 '12 FP1 16 0.610\n',
 '12 FP1 17 0.610\n',
 '12 FP1 18 2.075\n',
 '12 FP1 19 3.052\n',
 '12 FP1 20 3.052\n',
 '12 FP1 21 2.563\n',
 '12 FP1 22 2.563\n',
 '12 FP1 23 3.052\n',
 '12 FP1 24 3.540\n',
 '12 FP1 25 2.075\n',
 '12 FP1 26 0.122\n',
 '12 FP1 27 -2.808\n',
 '12 FP1 28 -3.784\n',
 '12 FP1 29 -2.808\n',
 '12 FP1 30 -0.854\n',
 '12 FP1 31 1.099\n',
 '12 FP1 32 2.075\n',
 '12 FP1 33 2.563\n',
 '12 FP1 34 1.587\n',
 '12 FP1 35 0.610\n',
 '12 FP1 36 -0.366\n',
 '12 FP1 37 -0.854\n',
 '12 FP1 3

파일에서 필요한 정보 추출하기

In [9]:
file_name = first_file[0].rstrip().split(' ')[1]
target = file_name[3]
trial_num = int(first_file[3].rstrip().split(' ')[-1])

file_info = [file_name, target, trial_num]

file_info

['co2a0000364.rd', 'a', 12]

데이터에서 행렬 데이터 추출하기

In [10]:
file_ctxt = [x.rstrip().split(' ') for x in first_file[4:] if not x.startswith('#')]

df_data = pd.DataFrame(file_ctxt, columns=header)
df_data['sample_number'] = df_data['sample_number'].astype('int')

df_pivot = df_data.pivot(index='sample_number', columns='sensor_position', values='sensor_value')
print(df_pivot.shape)
df_pivot.head()

(256, 64)


sensor_position,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,C5,...,PO8,POZ,PZ,T7,T8,TP7,TP8,X,Y,nd
sample_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.96,-0.427,6.999,2.055,2.289,-1.923,2.97,-2.228,9.43,-8.433,...,10.854,-1.16,-0.397,-0.671,9.013,-5.381,5.656,0.997,-2.38,3.031
1,3.448,1.038,6.51,4.985,2.289,-0.458,2.97,-4.669,2.594,-10.874,...,13.784,0.793,0.58,1.282,-1.241,-4.405,7.609,1.485,-3.357,4.008
2,2.96,2.991,5.534,8.403,2.777,1.984,1.994,7.05,7.965,-6.968,...,4.995,2.258,1.068,3.235,-6.612,-0.498,6.632,1.973,-1.404,4.496
3,3.448,5.432,6.022,9.867,3.754,6.866,-5.819,28.046,-13.031,2.309,...,-3.794,2.747,1.068,7.141,-5.636,3.896,3.215,4.415,4.456,4.496
4,4.425,7.385,7.487,7.914,6.195,-1.434,2.482,-12.482,3.571,10.61,...,5.971,3.235,1.068,10.559,-0.753,7.314,-0.203,7.345,10.803,4.985


In [11]:
data_info_list = []
data_eeg_list = []

data_index = []
data_header = []


for file in data_list:
    
    # file info
    file_name = file[0].rstrip().split(' ')[1]
    target = file_name[3]
    trial_num = int(file[3].rstrip().split(' ')[-1])

    file_info = [file_name, target, trial_num]

    # file contexts
    file_ctxt = [x.rstrip().split(' ') for x in file[4:] if not x.startswith('#')]

    df_data = pd.DataFrame(file_ctxt, columns=header)
    df_data['sample_number'] = df_data['sample_number'].astype('int')

    df_pivot = df_data.pivot(index='sample_number', columns='sensor_position', values='sensor_value')
    df_pivot = df_pivot.sort_index(axis=0).sort_index(axis=1)
    
    data_index = df_pivot.index
    data_header = df_pivot.columns
    
    data_info_list.append(file_info)
    data_eeg_list.append(df_pivot)

In [12]:
data_info_list = np.array(data_info_list)
data_eeg_list = np.array(data_eeg_list)

data_info_list.shape, data_eeg_list.shape

((60, 3), (60, 256, 64))

In [13]:
data_index, data_header

(Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
             ...
             246, 247, 248, 249, 250, 251, 252, 253, 254, 255],
            dtype='int64', name='sample_number', length=256),
 Index(['AF1', 'AF2', 'AF7', 'AF8', 'AFZ', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
        'CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6', 'CPZ', 'CZ', 'F1', 'F2', 'F3',
        'F4', 'F5', 'F6', 'F7', 'F8', 'FC1', 'FC2', 'FC3', 'FC4', 'FC5', 'FC6',
        'FCZ', 'FP1', 'FP2', 'FPZ', 'FT7', 'FT8', 'FZ', 'O1', 'O2', 'OZ', 'P1',
        'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'PO1', 'PO2', 'PO7', 'PO8',
        'POZ', 'PZ', 'T7', 'T8', 'TP7', 'TP8', 'X', 'Y', 'nd'],
       dtype='object', name='sensor_position'))

# 데이터 저장

In [14]:
date = dt.strftime(dt.now(), '%y-%m-%d')

np.save(os.path.join(store_dir, f'data_info_{date}.npy'), data_info_list)
np.save(os.path.join(store_dir, f'data_eeg_{date}.npy'), data_eeg_list)

In [15]:
%store data_index
%store data_header

Stored 'data_index' (Int64Index)
Stored 'data_header' (Index)
