# [운동 동작 분류 AI - Private 2위 공유 코드](https://dacon.io/codeshare/2396?dtype=recent&s_id=0)

1D-CNN 에 global-average-pooling

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random 
import time
import datetime
import warnings 
warnings.filterwarnings('ignore')

from scipy import signal
from scipy import fftpack
from tqdm import tqdm
from numpy.fft import fft, fftshift
from numpy.fft import *


from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
train = pd.read_csv('../workout/train_features.csv')
test = pd.read_csv('../workout/test_features.csv')
train_label = pd.read_csv('../workout/train_labels.csv')

In [8]:
train

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112
1,0,1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629
3,0,3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013
4,0,4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234
...,...,...,...,...,...,...,...,...
1874995,3124,595,-0.712530,-0.658357,0.293707,-29.367857,-104.013664,-76.290437
1874996,3124,596,-0.683037,-0.658466,0.329223,-30.149089,-101.796809,-76.625087
1874997,3124,597,-0.664730,-0.666625,0.364114,-27.873095,-98.776072,-79.365125
1874998,3124,598,-0.630534,-0.682565,0.373696,-23.636550,-99.139495,-80.259478


## Feature Engineering 
### 1. 에너지 변수 생성
가속도, 자이로, (자이로-가속도) 센서값을 에너지로 표현 

[<img src = "가속도에너지.png">](https://patents.google.com/patent/KR20120049076A/ko)

In [9]:
train['acc_Energy'] = ( train['acc_x'] ** 2 + train['acc_y'] ** 2 + train['acc_z'] ** 2 ) ** (1/3) # 세제곱근의 이유는??? 
train['gy_Energy'] = (train['gy_x'] ** 2 + train['gy_y'] ** 2 + train['gy_z'] ** 2) ** (1/3)
train['gy_acc_Energy'] = ((train['gy_x'] - train['acc_x']) ** 2 +\
                          (train['gy_y'] - train['acc_y']) ** 2 + \
                          (train['gy_z'] - train['acc_z']) ** 2 ) ** (1/3)    

In [10]:
test['acc_Energy'] = ( test['acc_x'] ** 2 + test['acc_y'] ** 2 + test['acc_z'] ** 2 ) ** (1/3) # 세제곱근의 이유는??? 
test['gy_Energy'] = (test['gy_x'] ** 2 + test['gy_y'] ** 2 + test['gy_z'] ** 2) ** (1/3)
test['gy_acc_Energy'] = ((test['gy_x'] - test['acc_x']) ** 2 +\
                          (test['gy_y'] - test['acc_y']) ** 2 + \
                          (test['gy_z'] - test['acc_z']) ** 2 ) ** (1/3)  

### 2. 시간 대비 변화량 변수 생성
id 별 데이터는 0.02 초마다 측정된 값임으로 이전 시간 대비 변화량을 적용

In [11]:
dt = 0.02

def jerk_signal(signal):
    a = [ (signal[i + 1] - signal[i])/dt 
         for i in range(len(signal) -1) ]
    
    return np.array(a)

In [12]:
train_df = []

for i in tqdm(train['id'].unique()): # tqbm : 상태 진행률을 알 수 있는 라이브러리 함수   
    temp = train.loc[train['id'] == i]
    
    for v in train.columns[2:]:
        values = jerk_signal(temp[v].values)
        values = np.insert(values, 0,0)
        temp.loc[:, v +'_dt'] = values
        
    train_df.append(temp)

100%|██████████| 3125/3125 [01:09<00:00, 45.06it/s]


In [17]:
train_df[1].head(3)

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_Energy,gy_Energy,gy_acc_Energy,acc_x_dt,acc_y_dt,acc_z_dt,gy_x_dt,gy_y_dt,gy_z_dt,acc_Energy_dt,gy_Energy_dt,gy_acc_Energy_dt
600,1,0,-0.211795,-0.07876,0.854627,18.231943,10.211164,7.390348,0.921085,7.890641,7.877131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601,1,1,-0.256963,-0.018555,0.802172,26.524263,0.980852,7.947258,0.892051,9.156352,9.162819,-2.258425,3.010219,-2.622744,414.615996,-461.515561,27.845534,-1.451712,63.285557,64.284406
602,1,2,-0.257005,-0.001413,0.806032,17.56939,-3.9532,1.982548,0.89451,6.898128,6.943637,-0.002074,0.857103,0.193035,-447.743657,-246.702614,-298.23549,0.122978,-112.911205,-110.959118


In [18]:
test_df = []

for i in tqdm(test['id'].unique()) : 
    temp = test.loc[test['id'] == i ]
    
    for v in test.columns[2:] : 
        values = jerk_signal(temp[v].values)
        values = np.insert(values, 0,0)
        temp.loc[:, v+'_dt'] = values 
        
    test_df.append(temp)

100%|██████████| 782/782 [00:16<00:00, 47.25it/s]


In [19]:
test_df[1].head(3)

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_Energy,gy_Energy,gy_acc_Energy,acc_x_dt,acc_y_dt,acc_z_dt,gy_x_dt,gy_y_dt,gy_z_dt,acc_Energy_dt,gy_Energy_dt,gy_acc_Energy_dt
600,3126,0,0.304222,1.529324,-0.338265,62.204081,-20.624492,-65.532643,1.365448,20.479452,20.466287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601,3126,1,0.41426,1.676796,-0.318979,58.01682,-11.494501,-64.522573,1.455745,19.713617,19.672706,5.501907,7.373596,0.964346,-209.363042,456.499577,50.503475,4.514839,-38.29175,-39.679035
602,3126,2,0.411078,1.708743,-0.34312,33.82916,11.935488,-59.098773,1.474613,16.844643,16.719243,-0.159097,1.597344,-1.207063,-1209.383042,1171.499427,271.19002,0.943391,-143.448672,-147.673179


### 3. 푸리에 변환 
[푸리에 변환(Fourier transform, FT)](https://ko.wikipedia.org/wiki/푸리에_변환)은 시간이나 공간에 대한 함수를 시간 또는 공간 주파수 성분으로 분해하는 변환.     
`fftpack`을 활용하여 고속푸리에변환(FFT, Fast Fourier Transform) 을 할 수 있음.  

In [21]:
# FFT 

def fourier_transform_one_signal(t_signal):
    complex_f_signal = fftpack.fft(t_signal)      # fft() : FFT 계산 
    amplitude_f_signal = np.abs(complex_f_signal) # 음수 처리 
    
    return amplitude_f_signal

In [22]:
train = pd.concat(train_df)
train

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_Energy,gy_Energy,gy_acc_Energy,acc_x_dt,acc_y_dt,acc_z_dt,gy_x_dt,gy_y_dt,gy_z_dt,acc_Energy_dt,gy_Energy_dt,gy_acc_Energy_dt
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112,1.146962,12.465436,12.427938,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216,1.200703,12.913284,12.865692,4.080495,-0.980114,-1.699854,44.735403,-429.504677,337.444793,2.687024,22.392358,21.887693
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629,1.217403,13.725729,13.692643,0.845632,0.192961,-3.546937,-196.018888,-249.173073,-4.620631,0.835012,40.622253,41.347563
3,0,3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013,1.209981,15.374021,15.314907,-0.575711,-1.762585,1.908626,316.513181,-473.763910,-121.719195,-0.371100,82.414636,81.113199
4,0,4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234,1.211254,16.074363,16.017964,0.389598,2.130453,-0.365665,78.686055,-215.435892,-25.361098,0.063656,35.017060,35.152822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874995,3124,595,-0.712530,-0.658357,0.293707,-29.367857,-104.013664,-76.290437,1.009050,25.963234,25.897316,1.484646,0.303666,0.800069,-150.644663,-34.630282,-8.380088,-0.679712,8.387109,8.432977
1874996,3124,596,-0.683037,-0.658466,0.329223,-30.149089,-101.796809,-76.625087,1.002827,25.784692,25.722482,1.474659,-0.005442,1.775771,-39.061611,110.842743,-16.732496,-0.311171,-8.927089,-8.741727
1874997,3124,597,-0.664730,-0.666625,0.364114,-27.873095,-98.776072,-79.365125,1.006239,25.628060,25.572145,0.915321,-0.407957,1.744566,113.799702,151.036858,-137.001896,0.170620,-7.831611,-7.516832
1874998,3124,598,-0.630534,-0.682565,0.373696,-23.636550,-99.139495,-80.259478,1.001038,25.626266,25.573288,1.709833,-0.796984,0.479107,211.827245,-18.171144,-44.717652,-0.260074,-0.089713,0.057150


In [23]:
fft = []

for i in tqdm(train['id'].unique()):
    temp = train.loc[train['id'] == i] 
    
    for i in train.columns[2:8]: 
        temp[i] = fourier_transform_one_signal(temp[i].values)
        
    fft.append(temp)

train = pd.concat(fft)
train

100%|██████████| 3125/3125 [00:15<00:00, 206.97it/s]


Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_Energy,gy_Energy,gy_acc_Energy,acc_x_dt,acc_y_dt,acc_z_dt,gy_x_dt,gy_y_dt,gy_z_dt,acc_Energy_dt,gy_Energy_dt,gy_acc_Energy_dt
0,0,0,558.797337,131.082711,222.252919,1119.161589,2015.703683,709.264425,1.146962,12.465436,12.427938,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,1,3.233175,15.689279,12.229014,221.599635,361.903330,477.080942,1.200703,12.913284,12.865692,4.080495,-0.980114,-1.699854,44.735403,-429.504677,337.444793,2.687024,22.392358,21.887693
2,0,2,4.832535,8.199566,3.901211,357.200415,430.568986,452.096143,1.217403,13.725729,13.692643,0.845632,0.192961,-3.546937,-196.018888,-249.173073,-4.620631,0.835012,40.622253,41.347563
3,0,3,5.675383,5.330015,2.527445,340.433376,787.558320,467.307109,1.209981,15.374021,15.314907,-0.575711,-1.762585,1.908626,316.513181,-473.763910,-121.719195,-0.371100,82.414636,81.113199
4,0,4,7.415275,7.980024,6.566908,128.188871,1372.095224,715.824074,1.211254,16.074363,16.017964,0.389598,2.130453,-0.365665,78.686055,-215.435892,-25.361098,0.063656,35.017060,35.152822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874995,3124,595,11.743654,3.796333,12.513870,715.873677,1124.494889,645.627066,1.009050,25.963234,25.897316,1.484646,0.303666,0.800069,-150.644663,-34.630282,-8.380088,-0.679712,8.387109,8.432977
1874996,3124,596,211.498089,82.888508,86.807874,5515.261695,28917.564390,20218.747027,1.002827,25.784692,25.722482,1.474659,-0.005442,1.775771,-39.061611,110.842743,-16.732496,-0.311171,-8.927089,-8.741727
1874997,3124,597,12.175349,6.200258,2.084554,343.695161,464.375112,78.097163,1.006239,25.628060,25.572145,0.915321,-0.407957,1.744566,113.799702,151.036858,-137.001896,0.170620,-7.831611,-7.516832
1874998,3124,598,19.116783,3.830800,6.938661,791.376179,2724.373764,1131.590078,1.001038,25.626266,25.573288,1.709833,-0.796984,0.479107,211.827245,-18.171144,-44.717652,-0.260074,-0.089713,0.057150


In [24]:
test = pd.concat(test_df)

In [25]:
fft_t = []

for i in tqdm(test['id'].unique()):
    temp = test.loc[test['id'] == i]
    
    for i in test.columns[2:8]:
        temp[i] = fourier_transform_one_signal(temp[i].values)
        
    fft_t.append(temp)
    
test = pd.concat(fft_t)

100%|██████████| 782/782 [00:02<00:00, 286.63it/s]


### 4. Standard Scaling 

In [27]:
col = train.columns
train_s = train.copy()
test_s = test.copy()

In [28]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
train_s.iloc[:,2:] = scaler.fit_transform(train_s.iloc[:,2:])
train_sc = pd.DataFrame(train_s, columns = col )

test_s.iloc[:, 2:] = scaler.fit_transform(test_s.iloc[:,2:])
test_sc = pd.DataFrame(test_s, columns = col)

In [29]:
train_sc

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_Energy,gy_Energy,gy_acc_Energy,acc_x_dt,acc_y_dt,acc_z_dt,gy_x_dt,gy_y_dt,gy_z_dt,acc_Energy_dt,gy_Energy_dt,gy_acc_Energy_dt
0,0,0,27.356382,8.807207,19.465910,0.376992,0.869226,0.150423,0.495681,-0.272719,-0.276391,0.000027,0.000298,-0.000433,0.000347,0.000373,0.000273,0.000101,0.001505,0.001501
1,0,1,-0.054866,0.833464,0.820412,-0.282128,-0.093560,0.011266,0.742974,-0.236152,-0.240632,0.416836,-0.118821,-0.255054,0.032738,-0.349095,0.377085,0.564992,0.166566,0.162871
2,0,2,0.024046,0.315921,0.081086,-0.182551,-0.053585,-0.003708,0.819822,-0.169815,-0.173080,0.086405,0.023750,-0.531727,-0.141582,-0.202368,-0.004887,0.175645,0.300944,0.306341
3,0,3,0.065632,0.117634,-0.040874,-0.194863,0.154242,0.005408,0.785669,-0.035229,-0.040560,-0.058780,-0.213920,0.285459,0.229520,-0.385106,-0.135647,-0.077915,0.609008,0.599518
4,0,4,0.151477,0.300751,0.317742,-0.350724,0.494539,0.154354,0.791528,0.021954,0.016872,0.039823,0.259227,-0.055206,0.057320,-0.174917,-0.028047,0.013483,0.259626,0.260669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874995,3124,595,0.365037,0.011656,0.845701,0.080839,0.350395,0.112282,-0.138940,0.829394,0.823900,0.151679,0.037205,0.119409,-0.108728,-0.027804,-0.009085,-0.142794,0.063329,0.063674
1874996,3124,596,10.220817,5.476964,7.441373,3.605246,16.530576,11.843241,-0.167578,0.814816,0.809618,0.150658,-0.000363,0.265559,-0.027936,0.090560,-0.018412,-0.065316,-0.064300,-0.062949
1874997,3124,597,0.386337,0.177768,-0.080193,-0.192468,-0.033904,-0.227861,-0.151875,0.802027,0.797338,0.093524,-0.049283,0.260884,0.082744,0.123264,-0.152712,0.035970,-0.056225,-0.053918
1874998,3124,598,0.728823,0.014037,0.350745,0.136284,1.281790,0.403540,-0.175811,0.801880,0.797431,0.174681,-0.096564,0.071332,0.153722,-0.014412,-0.049662,-0.054574,0.000843,0.001922


## 모델링 : 1D-CNN with Global average pooling   

[`tensorflow_addons`](https://www.tensorflow.org/addons/overview?hl=ko) 
    : 새로운 기능을 사용할 수 있도록 하는 repository 

In [41]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_addons as tfa 
import keras

from keras.models import Sequential 
from keras.layers import Conv1D, BatchNormalization, Activation, Dropout
from keras.layers import GlobalAveragePooling1D, Dense

from keras.utils import to_categorical
from keras import backend as K 
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn.model_selection import KFold, StratifiedKFold
from numpy.random import seed

In [38]:
X = np.array(train_sc.iloc[:,2:]).reshape(3125,600,-1)
X.shape

(3125, 600, 18)

In [39]:
test_x = np.array(test_sc.iloc[:,2:]).reshape(782, 600, -1)
test_x.shape

(782, 600, 18)

In [40]:
y = train_label['label'].values
y = to_categorical(y)
y.shape

(3125, 61)

### 1. CNN 모델 구조  

In [50]:
def cnn_model(input_shape, classes):
    
    seed(2021)
    tf.random.set_seed(2021)
    
    input_layer = keras.layers.Input(input_shape)
    
    # 1st
    conv1 = Conv1D(filters = 128, kernel_size = 9, padding ='same')(input_layer)
    conv1 = BatchNormalization()(conv1)
    conv1 = Activation(activation = 'relu')(conv1)
    conv1 = Dropout(rate = 0.3)(conv1)
    
    # 2nd
    conv2 = Conv1D(256, kernel_size = 6, padding = 'same')(conv1)
    conv2 = BatchNormalization()(conv2)
    conv2 = Activation('relu')(conv2)
    conv2 = Dropout(rate = 0.4)(conv2)
    
    # 3rd
    conv3 = Conv1D(128, kernel_size = 3, padding ='same')(conv2)
    conv3 = BatchNormalization()(conv3)
    conv3 = Activation('relu')(conv3)
    conv3 = Dropout(0.5)(conv3)
    
    
    gap = GlobalAveragePooling1D()(conv3)
    
    output_layer = Dense(classes, activation = 'softmax')(gap)
    
    model = keras.models.Model(inputs = input_layer, outputs = output_layer)
    
    model.compile(loss = 'categorical_crossentropy', optimizer = keras.optimizers.Adam(), metrics = ['accuracy'])
    
    return model

### 2. 10 Kold stratifiedKFlod

[`ReduceLROnPlateau()`](https://teddylee777.github.io/tensorflow/keras-콜백함수-vol-01) : 콜백함수로 학습률 조정.    
[`EarlyStopping()`](https://3months.tistory.com/424) : 효율적 운영을 위한 early stoping 

In [43]:
skf = StratifiedKFold(10, random_state = 2021, shuffle = True )
reLR = ReduceLROnPlateau(patience = 4, verbose = 1, factor = 0.5 )
es = EarlyStopping(monitor = 'val_loss', patience = 8, mode = 'min')

[`ModelCheckpoint()`](https://deep-deep-deep.tistory.com/53) : 학습 모델 저장 

In [57]:
accuracy = []
loss = []
models = []

start = time.time()

for i, (train, validation) in enumerate(skf.split(X, y.argmax(1))) : 
    
    # 모델 저장 
    mc = ModelCheckpoint(f'./workout_M/cv_study{i+1}.h5', save_best_only = True,
                        verbose = 0, monitor = 'val_loss', mode ='min', save_weights_only = 'True')
    
    print('-'*20 + 'Fold' + str(i+1) + '-'*20)
    
    model = cnn_model((600, 18), 61)
    history = model.fit(X[train], y[train], epochs = 100, 
                       validation_data = (X[validation], y[validation]), 
                       verbose = 1, batch_size = 64, callbacks = [es, mc, reLR])
    
    model.load_weights(f'./workout_M/cv_study{i+1}.h5')
    
    k_accuracy = '%.4f'%(model.evaluate(X[validation], y[validation])[1])
    k_loss = '%.4f'%(model.evaluate(X[validation], y[validation])[0])
    
    accuracy.append(k_accuracy)
    loss.append(k_loss)
    models.append(model)

sec = time.time() - start
    
print('\nk-fold CV AUC : {}'.format(accuracy))
print('\nk-fold CV Loss : {}'.format(loss))

--------------------Fold1--------------------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100

Epoch 00044: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100

Epoch 00051: ReduceLROnPlateau reducing learning rate to

In [59]:
print('모델링 수행 시간 : ' , str(datetime.timedelta(seconds = sec)).split('.'))

모델링 수행 시간 :  ['1 day, 3:58:29', '701422']


In [72]:
print('평균 정확도 : {0:.4f}'.format(sum(float(i) for i in accuracy)/10 ))
print('평균 loss : {:.4f}'.format(sum(float(i) for i in loss)/10))

평균 정확도 : 0.8531
평균 loss : 0.4854


In [73]:
test_x = np.array(test_sc.iloc[:,2:]).reshape(782, 600, -1)
test_x.shape

(782, 600, 18)

In [74]:
preds = []
for model in models:
    pred = model.predict(test_x)
    preds.append(pred)
    
pred = np.mean(preds, axis = 0)
pred

array([[9.52831033e-06, 1.82786937e-06, 1.48055761e-07, ...,
        4.71508643e-03, 1.09337225e-05, 2.10196731e-06],
       [5.15806896e-04, 2.57043866e-05, 2.40184876e-04, ...,
        1.03424727e-05, 3.31831543e-05, 1.31285633e-05],
       [1.69565273e-03, 2.89031304e-02, 1.65807760e-05, ...,
        6.97948621e-04, 1.06625026e-02, 2.08439166e-03],
       ...,
       [4.84825112e-04, 3.99380997e-06, 1.62373508e-05, ...,
        2.08291185e-05, 1.70811109e-06, 8.31775193e-04],
       [3.78538925e-06, 7.86759134e-04, 9.85837801e-07, ...,
        8.53478781e-08, 1.15160392e-05, 3.43165008e-09],
       [9.88745960e-05, 3.81942937e-06, 1.38630435e-06, ...,
        8.41892615e-05, 8.89487524e-07, 1.45653437e-04]], dtype=float32)