# 운동 동작 분류 AI 경진대회
### [ 국경원 요원 ]

# 개발 환경
- OS : Windows 10 Pro
- Python Ver : 3.8.5
- tensorflow Ver: 2.4.1

# 목차
<font color="red"><Br>
1.모듈 및 파일 로드<br>
2.설명변수 생성 및 전처리<br>
3.학습및 예측

<font color="red"><Br>
# 1.모듈 및 파일 로드

In [1]:
import pandas as pd
import numpy as np
import os
import random
import warnings

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import MultiHeadAttention

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

from tqdm import tqdm
warnings.filterwarnings(action='ignore')

print(tf.__version__)
print(keras.__version__)

2.3.0
2.4.0


In [2]:
# 매번 모델링을 할 때마다 동일한 결과를 얻으려면 아래 코드를 실행
def reset_seeds(seed, reset_graph_with_backend=None):
    if reset_graph_with_backend is not None:
        K = reset_graph_with_backend
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  

    np.random.seed(seed)
    random.seed(seed+100)
    tf.compat.v1.set_random_seed(seed+200)
    os.environ['CUDA_VISIBLE_DEVICES'] = ''  
    print("RANDOM SEEDS RESET {}".format(seed))  

In [3]:
SEED = 1
reset_seeds(SEED)

RANDOM SEEDS RESET 1


- 데이터 경로

In [4]:
DATA_PATH = "./"
SUB_PATH = "./"

In [5]:
train=pd.read_csv(f'{DATA_PATH}train_features.csv')
train_labels=pd.read_csv(f'{DATA_PATH}train_labels.csv')
test=pd.read_csv(f'{DATA_PATH}test_features.csv')
submission=pd.read_csv(f'{DATA_PATH}sample_submission.csv')

# 진행률 확인
tqdm.pandas()

<font color="red"><Br>
# 2.설명변수 생성 및 전처리

### - 별도의 히든레이어에 넣을 설명변수 생성

In [6]:
tr_id = pd.DataFrame(train.id.unique(),columns=["id"])
te_id = pd.DataFrame(test.id.unique(),columns=["id"])
ft_train = tr_id.copy()
ft_test = te_id.copy()

- 각 변수에 대한 max,min,mean,std 집계를 하여 설명변수를 생성
- 각 변수에 기울기에 대한 mean , std ,sum 집계를 하여 설명변수를 생성

In [7]:
cols = ['id']
cols.extend(train.iloc[:,2:].columns.tolist())
f_list = [
            ('max', 'max' ), 
            ('min', 'min' ),
            ('mean', 'mean' ),
            ('std', 'std' ),
            (
            'gradient_mean', 
             lambda x : np.gradient(x).mean()
            ), 
            (
            'gradient_std', 
             lambda x : np.gradient(x).std()
            ), 
            
             (
            'gradient_sum', 
             lambda x : np.gradient(x).sum()
            ),
            
        ]

In [8]:
# 학습 데이터
f_ = train[cols].groupby("id").agg(f_list)
f_.columns = [ f"{c1}_{c2}"  for c1, c2 in f_.columns ]
f_ = f_.reset_index()
f_

Unnamed: 0,id,acc_x_max,acc_x_min,acc_x_mean,acc_x_std,acc_x_gradient_mean,acc_x_gradient_std,acc_x_gradient_sum,acc_y_max,acc_y_min,...,gy_y_gradient_mean,gy_y_gradient_std,gy_y_gradient_sum,gy_z_max,gy_z_min,gy_z_mean,gy_z_std,gy_z_gradient_mean,gy_z_gradient_std,gy_z_gradient_sum
0,0,1.344268,0.591940,0.931329,0.191479,0.000044,0.036163,0.026181,0.176871,-0.624113,...,0.074032,4.366987,44.419041,55.953827,-79.930029,1.182107,25.275185,0.121532,3.727328,72.919282
1,1,1.234020,-2.156208,-0.766580,0.495528,0.000522,0.107822,0.313130,0.700065,-1.295598,...,-0.135346,16.600487,-81.207515,340.170199,-270.980823,1.393294,75.545343,-0.047440,14.139488,-28.464187
2,2,1.219836,-1.142847,0.039836,0.711972,-0.001042,0.092330,-0.625047,0.650645,-0.690990,...,0.541111,13.716743,324.666586,55.642836,-44.192071,3.053291,13.920337,0.049267,5.572570,29.560015
3,3,-0.622250,-1.417751,-0.887702,0.130899,0.000379,0.046002,0.227675,0.283721,-0.540827,...,-0.040955,8.919657,-24.573111,56.456908,-85.600536,-5.869898,23.647153,0.113235,4.752998,67.940966
4,4,0.599720,-2.429109,-0.659018,0.495170,-0.001961,0.090496,-1.176893,1.724782,-2.055076,...,-0.562629,15.199774,-337.577444,221.015193,-270.581913,4.453382,46.148326,-0.570786,14.034324,-342.471783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3120,3120,0.390798,-1.624711,-0.300454,0.403175,-0.000358,0.084204,-0.215055,0.168070,-1.289257,...,0.037671,9.503145,22.602452,121.958427,-79.392292,-0.054026,24.913819,0.026928,6.510302,16.156778
3121,3121,-0.446650,-1.575455,-0.974298,0.169963,-0.000620,0.038752,-0.371789,0.117965,-0.609743,...,0.023935,2.937453,14.361012,57.349878,-39.777626,-2.792238,12.786464,0.010601,2.786408,6.360638
3122,3122,0.744666,-2.578974,-1.114246,0.683789,-0.000555,0.129473,-0.332854,1.268138,-2.036646,...,0.940091,22.509199,564.054404,453.943910,-247.908573,-1.722830,131.916609,-0.624373,14.682723,-374.623971
3123,3123,0.915846,-0.929133,-0.111333,0.432722,-0.000964,0.057924,-0.578330,1.473727,0.272406,...,-0.315875,12.454230,-189.525145,310.558507,-206.580638,-5.930252,71.243150,0.049862,10.912310,29.917346


In [9]:
ft_train = pd.merge(ft_train,f_,how="left")

In [10]:
# 테스트 데이터
f_ = test[cols].groupby("id").agg(f_list)
f_.columns = [ f"{c1}_{c2}"  for c1, c2 in f_.columns ]
f_ = f_.reset_index()
f_

Unnamed: 0,id,acc_x_max,acc_x_min,acc_x_mean,acc_x_std,acc_x_gradient_mean,acc_x_gradient_std,acc_x_gradient_sum,acc_y_max,acc_y_min,...,gy_y_gradient_mean,gy_y_gradient_std,gy_y_gradient_sum,gy_z_max,gy_z_min,gy_z_mean,gy_z_std,gy_z_gradient_mean,gy_z_gradient_std,gy_z_gradient_sum
0,3125,-0.275446,-1.564000,-1.018731,0.236232,-0.000607,0.052546,-0.364450,0.228040,-0.470937,...,-0.128438,5.668033,-77.062604,49.981455,-35.446915,-2.000683,12.251648,-0.006010,3.289276,-3.605857
1,3126,0.627571,-1.929033,-0.522843,0.539688,-0.002408,0.065952,-1.444505,1.708743,-0.200678,...,0.073970,7.995967,44.381857,169.417650,-147.597574,-3.604579,61.604867,0.193885,7.745503,116.331005
2,3127,2.972063,-0.792916,0.506947,0.219934,-0.000619,0.136834,-0.371407,1.941820,0.219008,...,0.087458,6.366008,52.474878,97.211730,-154.477074,-0.393175,23.041463,-0.030464,9.410068,-18.278557
3,3128,0.337281,-1.045889,-0.577603,0.431713,0.000151,0.024039,0.090362,-0.258476,-1.294482,...,-0.044294,5.093294,-26.576601,167.860762,-117.297766,-0.024318,37.967372,-0.009246,4.504459,-5.547546
4,3129,0.015642,-2.153047,-0.738640,0.305797,-0.000529,0.090366,-0.317549,1.562602,-0.860883,...,-0.080270,13.603918,-48.161825,138.130133,-125.598600,5.745498,43.353007,-0.257599,9.722807,-154.559603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,3902,0.427159,-2.050254,-0.907299,0.352604,-0.000200,0.105253,-0.120011,3.057501,-1.414874,...,-0.387381,15.964694,-232.428525,214.192019,-241.419881,-0.429678,68.462972,0.078550,12.926505,47.130277
778,3903,1.659451,-1.709527,-0.608731,0.663522,0.002253,0.098773,1.351889,1.549890,-1.247963,...,0.189548,11.132063,113.728549,253.689077,-164.337764,13.272141,81.398849,-0.068578,14.228869,-41.146929
779,3904,-0.085249,-2.124959,-0.753193,0.252666,0.000170,0.067927,0.101945,1.236138,-0.443533,...,0.202383,6.845427,121.429570,251.344358,-150.012379,1.935083,48.113344,-0.146594,8.700739,-87.956111
780,3905,1.438345,0.536568,0.958903,0.164880,-0.000717,0.053917,-0.430454,0.076427,-0.580191,...,0.071671,7.192561,43.002303,58.041427,-51.905231,0.932313,26.523530,0.066879,5.021503,40.127611


In [11]:
ft_test = pd.merge(ft_test,f_,how="left")

### - 트랜스포머의 인코더에 추가로 넣을 설명변수 생성

- datamanim 님이 코드 공유 게시판에 공유해주신 설명변수 추가 (감사합니다.)

In [12]:
# 학습 데이터
train['acc_t']  =(train['acc_x']**2+train['acc_y']**2+train['acc_z']**2)**(1/3)
# 테스트 데이터
test['acc_t']  =(test['acc_x']**2+test['acc_y']**2+test['acc_z']**2)**(1/3)

- 각 변수의 차분값을 구하여 설명변수를 생성 (첫번째 값은 0으로 대체)

In [13]:
# 학습 데이터
f_ = train.groupby("id").progress_apply(
    lambda x : x.iloc[:,2:].diff().fillna(0)
).add_prefix("diff_")

train = pd.concat([train,f_],axis=1)

# 테스트 데이터
f_ = test.groupby("id").progress_apply(
    lambda x : x.iloc[:,2:].diff().fillna(0)
).add_prefix("diff_")

test = pd.concat([test,f_],axis=1)

100%|██████████| 3125/3125 [00:01<00:00, 2181.75it/s]
100%|██████████| 782/782 [00:00<00:00, 2226.19it/s]


### - 스케일 조정

In [14]:
ft_sc = StandardScaler()
train.iloc[:,2:] = ft_sc.fit_transform(train.iloc[:,2:]) # 학습 데이터
test.iloc[:,2:] = ft_sc.transform(test.iloc[:,2:]) # 테스트 데이터

In [15]:
ft_sc = StandardScaler()
ft_train = ft_sc.fit_transform(ft_train.iloc[:,1:]) # 학습 데이터
ft_test = ft_sc.transform(ft_test.iloc[:,1:]) # 테스트 데이터

### - 학습및 테스트 데이터 세팅

In [16]:
ft_cnt = train.iloc[:,2:].columns.shape[0] # 설명변수 개수
X_train = np.array(train.iloc[:,2:])
X_test = np.array(test.iloc[:,2:])

#차원 변경
X_train = X_train.reshape(-1, 600, ft_cnt)
X_test=X_test.reshape(-1, 600, ft_cnt)

y = tf.keras.utils.to_categorical(train_labels['label']) 

X_train.shape , X_test.shape , y.shape

((3125, 600, 14), (782, 600, 14), (3125, 61))

<font color="red"><Br>
# 3.학습및 예측

- 다음의 링크를 참고 하여 Transformer를 활용 하였습니다.

https://keras.io/examples/nlp/text_classification_with_transformer/

### - 모델링

In [17]:
def transformer_block(inputs,node,drop_rate,activation):
    attn_output = keras.layers.MultiHeadAttention(num_heads=2, 
                                        key_dim=node)(inputs, inputs)
    attn_output = keras.layers.Dropout(drop_rate)(attn_output)
    out1 = keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)
    ffn_output = keras.layers.Dense(node, activation=activation)(out1) #
    ffn_output = keras.layers.Dense(node)(ffn_output) #
    ffn_output = keras.layers.Dropout(drop_rate)(ffn_output)
    return keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

- 총 3개의 인풋을 받아 학습을 진행
- 2개의 인풋은 CNN 을 거쳐 트랜스포머의 인코더의 인풋으로 들어감
- 1개의 인풋은 별도의 간단한 히든레이어에 들어감
- 3개의 아웃풋을 에버리지 레이어를 통과후 소프트맥스를 통해 최종 예측값 생성

In [18]:
def my_dnn_model(node=64,activation='relu', drop_rate = 0.2 ,loss="categorical_crossentropy", 
                 optimizer="rmsprop",metrics=['accuracy']):
    
    avg_list = []
    inputs_list = []
    for i in range(3):
        if i < 2:
            inputs = keras.Input(shape=(600, 7))

            x = keras.layers.Conv1D(node*2, 5, activation=activation)(inputs)
            x = keras.layers.MaxPooling1D(3)(x)
            x = keras.layers.Dropout(drop_rate)(x)
            x = keras.layers.Conv1D(node, 5, activation=activation)(x)
            x = keras.layers.MaxPooling1D(3)(x)
            x = keras.layers.Dropout(drop_rate)(x) 

            positions = tf.range(start=0, limit=x.shape[1], delta=1,dtype="float32") 
            positions = keras.layers.Embedding(input_dim=x.shape[1], output_dim=node)(positions)
            x = x + positions

            x = transformer_block(x,node,drop_rate,activation)
            x = keras.layers.GlobalMaxPooling1D()(x)
            x = keras.layers.Dropout(drop_rate)(x)
            avg_list.append(x)
        else:
            inputs = keras.Input(shape=(42,))
            x = inputs
            x = keras.layers.Dense(node, activation=activation)(x)
            x = keras.layers.Dropout(drop_rate)(x)
            x = keras.layers.Dense(node, activation='softmax')(x)
            avg_list.append(x)
            
        inputs_list.append(inputs)
    
    x = keras.layers.Average()(avg_list)
    
    outputs = keras.layers.Dense(61, activation='softmax')(x)
    model = keras.Model(inputs=inputs_list, outputs=outputs)
    model.compile(loss=loss, optimizer=optimizer,metrics=metrics)
    return model

In [19]:
model = my_dnn_model()
model.summary()

AttributeError: module 'tensorflow.keras.layers' has no attribute 'MultiHeadAttention'

- DACON.Dobby님이 공유해주신 증강코드를 수정하여 함수로 생성(감사합니다.)

In [23]:
def aug_data(data , data_name, n=0 ,shift=False,list_ = False):
    """데이터 증강 함수
            Args:
                data (numpy array): 증강할 데이터
                data_name (str): print 용
                n (int):  증강 데이터 세트수
                shift (bool): shift 사용 여부
                list_ (bool):  list 묶음으로 반환 여부
            Returns: 
                numpy array or list:
    """
    data_ = data.copy()
    if list_:
        data_ = [data]
    print(f"##### {data_name} 데이터 {n} 개 증강... #####")
    for _ in range(n):
        if shift:
            shift_n = int(random.random()*600)
            print(f"shift num : {shift_n}")
            r_idx = np.roll(np.arange(600), shift_n)
            
            if list_:
                data_.append(np.array(data[:,r_idx], np.float32))
            else:
                data_ = np.concatenate(  ( data_, np.array(data[:,r_idx], np.float32) ),axis=0 )
        else:
            if list_:
                data_.append(data)
            else:
                data_ = np.concatenate(  ( data_, data ),axis=0 )
    print("# 완료!!")    
    return data_

### - 학습및 예측

- 학습데이터를 증강하여 학습진행(검증데이터 증강 X)
- 검증은 CV 각 폴드 과정에서 검증데이터 증강후에 데이터 각 세트에 대해 예측을 진행후 나온 예측값을 산술평균후에 logloss 값 확인 및 저장
- 테스트 데이터에 대한 각 폴드에 예측값의 경우도 검증과 동일하게 예측값 생성
- CV 5fold 로 생성된 예측값을 산술평균 후에 최종 예측값으로 생성

In [24]:
idx = int(X_train.shape[-1] / 2) # 차분한 설명변수들을 구분하는 인덱스
holdout_break = False
aug_n = 10 # 증강 데이터 세트수

final_pred_list = [] # 최종 예측값 리스트
log_loss_list = [] # logloss 스코어 리스트

# 테스트 데이터 증강
reset_seeds(SEED)    
X_test_aug = aug_data(X_test,"X_test" ,n=aug_n,shift=True,list_=True)
ft_test_aug = aug_data(ft_test,"ft_test",n=aug_n,list_=True)

model_idx = 0

kf = KFold(n_splits=5,random_state=0,shuffle=True)
for tri, tei in kf.split(X_train,y):
    reset_seeds(SEED)
    early_stop = EarlyStopping(monitor='val_loss', patience=7)
    mc = ModelCheckpoint(f'best_model{model_idx}.h5', monitor = 'val_loss', mode = 'min', 
                     verbose = 1, save_best_only = True)
    # model = my_dnn_model()
    model
    
    #학습데이터 증강
    X_tri = aug_data(X_train[tri],"X_train[tri]",n=aug_n,shift=True)
    y_tri = aug_data(y[tri],"y[tri]",n=aug_n)
    ft_train_tri = aug_data(ft_train[tri],"ft_train[tri]",n=aug_n)
    tri_list = [ X_tri[:,:,:idx] , X_tri[:,:,idx:] ,  ft_train_tri ]
    
    #검증데이터는 증강 X
    tei_list = [ X_train[tei,:,:idx] , X_train[tei,:,idx:] , ft_train[tei] ]
    
    with tf.device("/CPU:0"):
        history = model.fit(tri_list, y_tri , epochs=100, batch_size=128,callbacks=[early_stop,mc],
                            validation_data=(tei_list, y[tei]),
                            )
    
    # 검증 데이터 증강후 각 세트를 예측하여 산술평균후에 logloss 값을 확인
    reset_seeds(SEED)
    X_tei_aug = aug_data(X_train[tei],"X_train[tei]",n=aug_n,shift=True,list_=True)
    ft_train_tei_aug = aug_data(ft_train[tei],"ft_train[tei]",n=aug_n,list_=True)
    aug_preds = []
    loaded_model = load_model(f'best_model{model_idx}.h5') # 베스트 모델 로드
    for X_aug,ft_aug in zip(X_tei_aug,ft_train_tei_aug):
        aug_preds.append(
            loaded_model.predict([ X_aug[:,:,:idx] , X_aug[:,:,idx:] ,ft_aug ])
        )
    aug_preds = np.mean(aug_preds,axis=0)
    score = log_loss( y[tei] , aug_preds )
    log_loss_list.append(score)
    print(f"log_loss : {score}")
    
    # 증강된 테스트 데이터 각 세트를 예측하여 산술평균후 저장
    aug_preds = []
    for X_aug,ft_aug in zip(X_test_aug,ft_test_aug):
        aug_preds.append(
            loaded_model.predict([ X_aug[:,:,:idx] , X_aug[:,:,idx:] ,ft_aug ])
        )
    aug_preds = np.mean(aug_preds,axis=0)
    final_pred_list.append(aug_preds)
    
    
    
    model_idx += 1
    if holdout_break:
        break

RANDOM SEEDS RESET 1
##### X_test 데이터 10 개 증강... #####
shift num : 348
shift num : 116
shift num : 579
shift num : 554
shift num : 280
shift num : 398
shift num : 128
shift num : 133
shift num : 173
shift num : 415
# 완료!!
##### ft_test 데이터 10 개 증강... #####
# 완료!!
RANDOM SEEDS RESET 1


NameError: name 'model' is not defined

In [25]:
print(f"log_loss mean: {np.array(log_loss_list).mean()}")
print(f"log_loss std: {np.array(log_loss_list).std()}")
final_pred = np.mean(final_pred_list,axis=0)
log_loss_list

log_loss mean: 0.5670415227912308
log_loss std: 0.06273689175510266


[0.5389355347841017,
 0.5626589923185297,
 0.4677849093673489,
 0.6459557451124885,
 0.6198724323736853]

### - 예측파일 내보내기

In [27]:
final_submit = submission.copy()
final_submit.iloc[:,1:]=final_pred  
final_submit

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,51,52,53,54,55,56,57,58,59,60
0,3125,0.000032,2.949999e-06,5.454564e-04,9.505763e-05,8.205664e-04,4.487088e-06,2.063583e-03,7.748483e-06,1.429730e-05,...,1.455515e-03,9.858183e-04,3.187731e-04,1.422472e-04,3.740682e-04,3.997574e-06,9.240726e-08,8.820836e-03,2.654129e-06,8.349844e-06
1,3126,0.000482,2.844041e-08,5.712100e-09,2.741548e-05,4.145272e-06,1.822872e-05,6.428497e-08,7.060175e-07,6.568233e-06,...,6.431568e-08,1.870099e-10,1.665346e-07,8.111397e-08,9.185113e-07,5.942839e-08,4.254139e-08,1.043982e-07,1.151865e-08,4.035822e-06
2,3127,0.000520,2.956220e-02,5.332585e-05,1.267966e-03,3.235794e-04,1.412775e-03,7.689697e-02,1.915085e-04,1.809220e-02,...,3.021594e-04,3.923201e-05,5.857303e-05,3.292178e-03,4.637424e-05,4.203224e-04,7.404757e-07,2.774473e-04,4.495524e-03,3.666292e-04
3,3128,0.000149,6.226859e-08,2.512742e-05,1.053882e-04,3.945113e-09,1.377589e-05,1.964818e-10,1.096983e-05,2.022974e-07,...,3.561495e-08,9.028579e-09,2.420567e-07,1.945196e-06,1.174241e-04,5.800024e-07,2.547677e-05,1.284713e-07,3.553360e-10,1.274501e-04
4,3129,0.000242,2.187010e-08,1.420652e-09,1.135435e-06,9.057795e-05,2.439244e-06,2.743650e-11,2.910950e-08,4.038291e-07,...,8.178026e-11,4.656192e-12,1.370558e-09,1.270387e-11,4.026572e-09,1.740860e-09,6.382703e-05,4.002628e-08,3.543273e-09,3.630392e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,3902,0.002311,6.777510e-08,4.310060e-09,2.600348e-05,3.111136e-03,9.632684e-06,2.342603e-09,4.804718e-07,2.345243e-06,...,1.455163e-07,1.156058e-08,5.910046e-07,2.400488e-09,5.769946e-08,2.046839e-08,3.052598e-04,2.454884e-07,2.734131e-07,3.882496e-03
778,3903,0.000227,6.194850e-08,2.102190e-10,2.686587e-05,6.421692e-06,9.165403e-06,8.362011e-11,2.794029e-07,5.227568e-07,...,3.781473e-10,3.312374e-12,4.619389e-08,1.884658e-10,4.821302e-09,3.378671e-08,2.713538e-06,1.097389e-08,2.122869e-09,4.362917e-06
779,3904,0.000066,3.483316e-10,5.987153e-09,1.752979e-06,1.410900e-05,2.548866e-07,1.765932e-10,4.484379e-08,1.617948e-08,...,2.856205e-08,1.565856e-09,1.698061e-08,9.572945e-10,3.082543e-08,1.588561e-10,2.720129e-06,1.395063e-07,1.090977e-11,1.836785e-05
780,3905,0.000093,5.785850e-04,7.726508e-05,3.684409e-05,6.816952e-06,1.483243e-05,1.723245e-02,1.402121e-06,2.424638e-06,...,1.175128e-05,7.907505e-06,3.401846e-06,8.223099e-04,5.526348e-05,1.416740e-04,2.487597e-08,2.797057e-05,5.737526e-04,4.763487e-07


In [28]:
final_submit.to_csv(f'{SUB_PATH}submission_{np.array(log_loss_list).mean()}.csv', index=False)
print("끝!!")

끝!!


# End.