In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings(action='ignore')

from math import atan, sqrt
from sklearn.preprocessing import StandardScaler,MinMaxScaler


import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers as L
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Lambda, Input, Dense, Reshape, Dropout, RepeatVector, LSTM, TimeDistributed, BatchNormalization, Bidirectional
# import tensorflow_addons as tfa
from keras.utils import to_categorical
from keras import backend as K 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from sklearn.model_selection import KFold,StratifiedKFold
from numpy.random import seed
import keras

from transforms3d.axangles import axangle2mat

# 1. 데이터 불러오기

In [2]:
# Read the training CSV and the label CSV. The label table is used further down
# through its 'label' and 'label_desc' columns.
train=pd.read_csv('Data/DACON_train.csv')
train_labels=pd.read_csv('Data/train_labels_reduce.csv')

In [3]:
# Keep only the accelerometer X/Y/Z columns (positions 2-4 of the training frame).
# .copy() makes train_acc an independent frame: the in-place unit scaling applied
# to it afterwards can then neither write through to `train` nor raise
# SettingWithCopyWarning.

train_acc = train.iloc[:, 2:5].copy()

In [4]:
# Multiply every accelerometer axis by 9.8, in place
# (presumably converting g to m/s^2 — confirm the unit of the raw file).
for axis_name in ('X', 'Y', 'Z'):
    train_acc[axis_name] *= 9.8

# 1-1. 테스트할 데이터 불러오기


In [5]:
# Read the recording that the trained models will be applied to and use its
# 'Time' column as the index (the column itself is dropped from the frame).
origin_df = pd.read_csv('Data/303_1008.csv').set_index('Time', drop=True)
origin_df

Unnamed: 0_level_0,X,Y,Z
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-10-08 00:00:00,4.390830,7.597212,-4.522435
2020-10-08 00:00:01,4.433900,7.568498,-4.546363
2020-10-08 00:00:02,4.467400,7.590033,-4.532006
2020-10-08 00:00:03,4.366901,7.587641,-4.543970
2020-10-08 00:00:04,4.453043,7.623533,-4.543970
...,...,...,...
2020-10-08 23:59:55,6.001199,5.051248,5.790630
2020-10-08 23:59:56,6.022735,5.005784,5.850452
2020-10-08 23:59:57,5.986843,5.089533,5.737989
2020-10-08 23:59:58,5.955735,5.060820,5.819345


In [6]:
# Add derived columns: signal vector magnitude (SVM) plus roll and pitch in degrees.
# Vectorised NumPy calls replace the per-element `apply(lambda ...)` round trips
# through math.sqrt / math.atan, and the axes are addressed by name instead of
# by position.
acc_x, acc_y, acc_z = origin_df['X'], origin_df['Y'], origin_df['Z']
origin_df['SVM'] = np.sqrt(acc_x**2 + acc_y**2 + acc_z**2)
origin_df['roll'] = np.arctan(acc_y / np.sqrt(acc_x**2 + acc_z**2)) * 180 / np.pi
origin_df['pitch'] = np.arctan(acc_x / np.sqrt(acc_y**2 + acc_z**2)) * 180 / np.pi

In [7]:
# Create window ids: swap the time index for a plain 0..n-1 index, then give
# every run of 10 consecutive rows the same id.
origin_df.index = pd.RangeIndex(len(origin_df))

origin_df['id'] = origin_df.index // 10

In [8]:
# The data is sampled at 1 Hz, so consecutive samples are dt = 1 second apart.
dt=1
def jerk_signal(signal):
    """Return the first difference of `signal` divided by the sampling step `dt`.

    Parameters
    ----------
    signal : array-like
        Samples in time order (ndarray, list or pandas Series).

    Returns
    -------
    numpy.ndarray
        ``(signal[i + 1] - signal[i]) / dt`` for every consecutive pair, i.e. one
        element fewer than the input along the first axis. Empty when the input
        has fewer than two samples.
    """
    # np.asarray strips any pandas index, so the result never depends on index
    # labels; axis=0 matches the original element-by-element loop over signal[i].
    return np.diff(np.asarray(signal), axis=0) / dt

In [9]:
# Per-window first differences of X/Y/Z.
# groupby hands over each window exactly once instead of re-filtering the whole
# frame with a boolean mask for every id, and .copy() gives an independent frame
# to add columns to (the original wrote into a slice of origin_df).
train_dt=[]

for _, temp in origin_df.groupby('id', sort=False):
    temp = temp.copy()
    for v in origin_df.columns[:3]:
        values = jerk_signal(temp[v].values)
        # The first sample of a window has no predecessor, so its change is set to 0.
        temp[v+'_diff'] = np.insert(values, 0, 0)
    train_dt.append(temp)

train2=pd.concat(train_dt)

# Reorder the columns
train2 = train2[['id','X','Y','Z','SVM','roll','pitch','X_diff','Y_diff','Z_diff']]

2-5. StandardScaling

In [10]:
# Remember the column order and take a copy of train2; the scaling applied
# afterwards is written into this copy, not into train2.
col=train2.columns
train_s=train2.copy()
# test_s=test.copy()

In [11]:
train_s

Unnamed: 0,id,X,Y,Z,SVM,roll,pitch,X_diff,Y_diff,Z_diff
0,0,4.390830,7.597212,-4.522435,9.871648,50.317909,26.410029,0.000000,0.000000,0.000000
1,0,4.433900,7.568498,-4.546363,9.879831,50.000981,26.665632,0.043071,-0.028714,-0.023928
2,0,4.467400,7.590033,-4.532006,9.904815,50.022551,26.809991,0.033500,0.021535,0.014357
3,0,4.366901,7.587641,-4.543970,9.863559,50.287484,26.278249,-0.100499,-0.002393,-0.011964
4,0,4.453043,7.623533,-4.543970,9.929527,50.153516,26.645226,0.086142,0.035892,0.000000
...,...,...,...,...,...,...,...,...,...,...
86754,8675,6.001199,5.051248,5.790630,9.749918,31.203622,37.989191,0.057427,0.050249,-0.059821
86755,8675,6.022735,5.005784,5.850452,9.775428,30.802363,38.032587,0.021535,-0.045464,0.059821
86756,8675,5.986843,5.089533,5.737989,9.729858,31.539349,37.974175,-0.035892,0.083749,-0.112463
86757,8675,5.955735,5.060820,5.819345,9.744098,31.290190,37.677378,-0.031108,-0.028714,0.081356


In [13]:
#### fit_transform on train, transform only on test
# Standard Scaler
# Removes the mean and scales the data to unit variance. Outliers affect the mean and the standard deviation, so the spread of the transformed data can change a lot.
# A balanced scale therefore cannot be guaranteed when outliers are present.

# MinMax Scaler
# Rescales every feature into the 0~1 range. With outliers present, the transformed values can be squeezed into a very narrow range.
# In other words, MinMaxScaler is also very sensitive to outliers.

# NOTE(review): train_s is a copy of train2, which is built from origin_df (the
# recording loaded for testing), so this scaler is fitted on the data that will be
# predicted, not on the training data. That contradicts the rule stated in the
# first comment of this cell — confirm whether a scaler fitted on the training
# data was intended here.
scaler = StandardScaler()

# Scale every column except the first one ('id').
train_s.iloc[:,1:]= scaler.fit_transform(train_s.iloc[:,1:])
train_sc = pd.DataFrame(data = train_s,columns =col)

In [14]:
# Cut a signal into fixed-size windows, moving the window start by `slide` each step
def process_window(signal, size, slide=None):
    """Split `signal` into windows of `size` rows and stack them.

    Parameters
    ----------
    signal : sliceable sequence (ndarray, DataFrame, ...)
        Windows are taken as ``signal[start:start + size]``.
    size : int
        Number of rows per window.
    slide : int, optional
        Step between consecutive window starts; 4 when omitted.

    Returns
    -------
    numpy.ndarray
        The windows stacked vertically with ``np.vstack``, with a new axis
        inserted at position 2. An empty 1-D array when no complete window fits.
    """
    # Default step when the caller does not give one
    if slide is None:
        slide = 4

    # "+ 1" keeps the window that ends exactly on the last row; without it a
    # signal of exactly `size` rows produced no window at all.
    windows = [signal[start:start + size]
               for start in range(0, len(signal) - size + 1, slide)]

    if not windows:
        return np.array(windows)

    # np.vstack is the non-deprecated spelling of np.row_stack.
    return np.expand_dims(np.vstack(windows), 2)

In [15]:
# Window the scaled features (every column except 'id') into non-overlapping
# blocks of 10 rows — the same 10-second length as the Dacon data — and shape
# the result as (windows, 10 time steps, 9 features).
x_train = process_window(train_sc.iloc[:,1:], 10, 10).reshape(-1, 10, 9)

# 2. 모델링 및 학습

In [16]:
# %pip install tensorflow_addons

2-2. LSTM 모델 구성 (함수명은 gru_model이지만 실제 레이어는 LSTM)

In [17]:
def gru_model(input_shape, classes):
    """Build and compile the stacked recurrent classifier.

    Despite the function name, the recurrent layers are LSTMs (64, 32 and 16
    units). The first two are each followed by a max-pooling and an
    average-pooling branch whose outputs are concatenated; the last one feeds a
    global average pooling and a softmax layer with `classes` outputs.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed straight to the Keras Input layer.
    classes : int
        Number of output units of the softmax layer.

    Returns
    -------
    tf.keras.Model
        Model compiled with categorical cross-entropy, Adam and the accuracy metric.
    """
    # Seed NumPy and TensorFlow before any layer is created.
    seed(2021)
    tf.random.set_seed(2021)

    def pooled(sequence):
        """Concatenate the max-pooled and average-pooled versions of `sequence`."""
        max_branch = L.MaxPool1D(padding='same')(sequence)
        avg_branch = L.AveragePooling1D(padding='same')(sequence)
        return L.Concatenate()([max_branch, avg_branch])

    inputs = tf.keras.layers.Input(input_shape)

    # A 128-unit first layer with dropout 0.5 and an L1(0.05) kernel regularizer
    # was tried previously and left disabled in favour of the layer below.
    recurrent1 = L.LSTM(64, return_sequences=True, dropout=0.4)(inputs)
    stage1 = pooled(recurrent1)

    recurrent2 = L.LSTM(32, return_sequences=True, dropout=0.3)(stage1)
    stage2 = pooled(recurrent2)

    recurrent3 = L.LSTM(16, return_sequences=True)(stage2)
    averaged = L.GlobalAveragePooling1D()(recurrent3)

    outputs = L.Dense(classes, activation="softmax")(averaged)

    model = tf.keras.models.Model(inputs, outputs)

    adam = tf.keras.optimizers.Adam()
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

In [18]:
def Rotation(X):
    """Multiply X by a rotation matrix built from a random axis and a random angle.

    The axis has one uniform(-1, 1) component per column of X and the angle is
    uniform in (-pi, pi); both come from NumPy's global random state.
    """
    n_components = X.shape[1]
    # Draw the axis before the angle so a seeded run consumes the random stream
    # in the same order as before.
    random_axis = np.random.uniform(low=-1, high=1, size=n_components)
    random_angle = np.random.uniform(low=-np.pi, high=np.pi)
    rotation_matrix = axangle2mat(random_axis, random_angle)
    return np.matmul(X, rotation_matrix)

def get_svm(data):
    """Return the vector magnitude of data[0], data[1] and data[2].

    The three components are looked up with the keys 0, 1 and 2, squared, summed
    and square-rooted. The result is a one-column DataFrame named 'acc_svm'.
    """
    squared_sum = data[0] ** 2
    for key in (1, 2):
        squared_sum = squared_sum + data[key] ** 2

    svm = pd.DataFrame(squared_sum ** (1/2))
    svm.columns = ['acc_svm']
    return svm

def get_roll_pitch(data):
    """Compute roll and pitch angles (degrees) from the first three columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns at positions 0, 1, 2 are used as the x, y and z components.

    Returns
    -------
    pandas.DataFrame
        Columns 'acc_roll' = atan(y / sqrt(x^2 + z^2)) and
        'acc_pitch' = atan(x / sqrt(y^2 + z^2)), both converted to degrees.
    """
    x, y, z = (data.iloc[:, position] for position in range(3))

    # Vectorised NumPy ufuncs replace the per-element apply(lambda ...) calls to
    # math.sqrt / math.atan.
    roll = np.arctan(y / np.sqrt(x**2 + z**2)) * 180 / np.pi
    pitch = np.arctan(x / np.sqrt(y**2 + z**2)) * 180 / np.pi

    roll_pitch = pd.concat([roll, pitch], axis=1)
    roll_pitch.columns = ['acc_roll','acc_pitch']
    return roll_pitch

def setting(data, data_, case = 0):
    """Overwrite the leading entries of every 10-row block of `data`, in place.

    Parameters
    ----------
    data : pandas.Series
        Differenced values; modified in place and returned.
    data_ : pandas.Series
        The undifferenced values the replacements are computed from.
    case : int
        0: the first entry of each block becomes ``data_[i] - data_[i + 9]``.
        Anything else: the first five entries of each block become
        ``data_[i:i+5] - data_[i+4:i+9]`` (element-wise on the raw values).

    Returns
    -------
    pandas.Series
        The same object that was passed as `data`.
    """
    block_starts = range(0, data.shape[0], 10)   # one start per 10-row block (per id)

    if case != 0:
        for start in block_starts:
            head_end = start + 5
            # NOTE(review): the subtrahend starts 4 rows ahead although get_diff
            # uses a lag of 5 for this case — possible off-by-one, confirm intent.
            data[start:head_end] = data_[start:head_end].values - data_[start + 4:start + 9].values
        return data

    for start in block_starts:
        data[start] = data_[start] - data_[start + 9]
    return data


def get_diff(data, case = 0):
    """Difference the first three columns of `data` and patch the block starts.

    With ``case == 0`` each column is differenced with a lag of 1, otherwise with
    a lag of 5 rows. Every differenced column is then passed through `setting`
    together with its undifferenced source and the same `case`.

    Returns a DataFrame with the columns 'X_diff', 'Y_diff' and 'Z_diff'.
    """
    lag = 1 if case == 0 else 5

    patched = []
    for position in range(3):
        component = data.iloc[:, position]
        patched.append(setting(component.diff(lag), component, case))

    diff = pd.concat(patched, axis=1)
    diff.columns = ['X_diff','Y_diff','Z_diff']
    return diff
    

In [19]:
def train_dataset(acc_data):
    """Assemble the derived feature columns for a frame of accelerometer rows.

    Concatenates, side by side, the outputs of get_svm, get_roll_pitch and
    get_diff (default case) and names the six resulting columns
    'svm', 'roll', 'pitch', 'X_diff', 'Y_diff', 'Z_diff'.
    """
    feature_parts = [
        get_svm(acc_data),
        get_roll_pitch(acc_data),
        get_diff(acc_data),
    ]
    features = pd.concat(feature_parts, axis=1)
    features.columns = ['svm', 'roll','pitch','X_diff','Y_diff','Z_diff']
    return features

In [20]:
# Group the flat accelerometer rows into windows: (n_windows, 10 rows, 3 axes).
train_acc_ = np.array(train_acc).reshape(-1, 10, 3)

In [21]:
train_acc_.shape

(3125, 10, 3)

In [22]:
# One-hot encode the integer labels (the raw label array used to be assigned to
# `y` first and then immediately overwritten by this encoding).
y = tf.keras.utils.to_categorical(train_labels['label'].values)
y.shape

(3125, 11)

In [25]:
# Stratified K-fold: every fold takes one turn as the validation split.
skf = StratifiedKFold(n_splits = 5, random_state = 2048, shuffle = True)

# ReduceLROnPlateau multiplies the learning rate by `factor` when the monitored
# training loss has not improved for `patience` epochs.
reLR = ReduceLROnPlateau(monitor = 'loss', patience = 5,verbose = 1,factor = 0.8)

# EarlyStopping ends the fit once the training loss has not improved for 10 epochs.
es = EarlyStopping(monitor='loss', patience=10, mode='min')

window = 10
columns = 9

# Per-fold results. Metrics are stored as '%.4f' strings, exactly as before,
# because the summary cell converts them back with float().
accuracy = []
losss = []
models = []


def _build_fold_features(acc_windows):
    """Flatten (n, window, 3) accelerometer windows and append the derived features.

    Returns an unscaled DataFrame with one row per sample: the three raw axes
    followed by the columns produced by train_dataset.
    """
    raw = pd.DataFrame(acc_windows.reshape(-1, 3))
    return pd.concat([raw, train_dataset(raw)], axis=1)


# The index arrays are called train_idx / valid_idx so that the loop no longer
# overwrites the `train` DataFrame loaded at the top of the notebook.
for i, (train_idx, valid_idx) in enumerate(skf.split(train_acc_, y.argmax(1))) :

    print("-" * 20 +"Fold_"+str(i+1)+ "-" * 20)

    # A fresh scaler per fold, fitted on the training part only and merely
    # applied to the validation part.
    scaler = StandardScaler()

    x_fold = scaler.fit_transform(_build_fold_features(train_acc_[train_idx]))
    x_fold = x_fold.reshape(-1, window, columns)
    y_fold = pd.DataFrame(y[train_idx])

    x_valid = scaler.transform(_build_fold_features(train_acc_[valid_idx]))
    x_valid = x_valid.reshape(-1, window, columns)
    y_valid = pd.DataFrame(y[valid_idx])

    # New model per fold; the number of classes is the width of the one-hot labels.
    model = gru_model((window,columns), y.shape[1])

    history = model.fit(x_fold, y_fold,
                        epochs = 500,
                        verbose=0,
                        batch_size=128,
                        callbacks=[es, reLR])

    # Evaluate once and reuse the result: with metrics=['accuracy'] the returned
    # list is [loss, accuracy]. (The original ran evaluate twice per fold.)
    fold_scores = model.evaluate(x_valid, y_valid)
    k_accuracy = '%.4f' % (fold_scores[1])
    k_loss = '%.4f' % (fold_scores[0])

    accuracy.append(k_accuracy)
    losss.append(k_loss)
    models.append(model)

# The value collected above is the accuracy metric, so it is labelled as such
# (the original message called it "Auc").
print('\nK-fold cross validation accuracy: {}'.format(accuracy))
print('\nK-fold cross validation loss: {}'.format(losss))

--------------------Fold_1--------------------

Epoch 00083: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.

Epoch 00097: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.

Epoch 00102: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.

Epoch 00109: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.

Epoch 00124: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.

Epoch 00136: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.

Epoch 00145: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.

Epoch 00150: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
--------------------Fold_2--------------------

Epoch 00082: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.

Epoch 00095: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.

Epoch 00116: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.

Epoch 00144:

In [26]:
# Mean accuracy and mean loss over the folds that were actually recorded.
# Dividing by the list length instead of a hard-coded 5 keeps the mean correct
# when the number of folds changes or a run stops early.
print(sum([float(i) for i in accuracy])/len(accuracy))
print()
print(sum([float(i) for i in losss])/len(losss))

0.8000399999999999

0.64336


2-3. accuracy / loss plot

# 4. predict

In [27]:
# Collect the class probabilities predicted by every fold model and average
# them element-wise across the models.
preds = [fold_model.predict(x_train) for fold_model in models]
pred = np.mean(preds, axis=0)

In [28]:
# Wrap the averaged predictions in a DataFrame (one row per window).
df = pd.DataFrame(pred)

In [29]:
# Predicted class per window = column position of the largest value in its row.
# One vectorised argmax replaces the Python loop of row-wise .iloc lookups.
temp_list = df.to_numpy().argmax(axis=1).tolist()

In [30]:
# Rebind df to a one-column frame holding the predicted class of every window.
df = pd.DataFrame(temp_list)

In [31]:
# Keep the first row of every distinct label, which gives one label_desc per label.
desc_df = train_labels.drop_duplicates('label')
desc_df

Unnamed: 0,id,label,label_desc
0,0,6,moving_type(chest press)
1,1,0,ambiguous
2,2,2,arm_moving_biceps
12,12,3,Statit
22,22,4,dynamic
23,23,9,Running (treadmill)
26,26,8,moving_type(Squrt)
30,30,5,moving_type(sit up)
49,49,1,moving_type(rowing_machine)
52,52,10,moving_type(Triceps Kickback)


In [32]:
# Spot check: description of the class predicted for the window at position 8674
# (the position is hard-coded).
desc_df[desc_df['label']==pred.argmax(1)[8674]]['label_desc'].values[0]

'Statit'

In [33]:
final_df = origin_df.copy()

# Predicted class per window; the Series index is the window id.
pred_labels = pd.Series(pred.argmax(axis=1))
# label -> description lookup built from the de-duplicated label table.
label_to_desc = desc_df.set_index('label')['label_desc']
# Description per window, looked up from that window's own predicted label.
# The original looked up pred.argmax(1)[8674] for every id, so all rows received
# the description of window 8674 regardless of their label.
pred_descs = pred_labels.map(label_to_desc)

# Ids without a prediction (beyond len(pred)) map to NaN. This replaces the bare
# `except:` that silently turned any error into None. A label with no entry in
# desc_df keeps its label and gets a missing description.
final_df['Dacon_label'] = final_df['id'].map(pred_labels).astype(float)
final_df['Dacon_desc'] = final_df['id'].map(pred_descs)

In [34]:
# for i in final_df['id'].unique():
#     final_df[final_df['id'] == int(i)]['label'] = int(pd.DataFrame(pred.argmax(1))[0][i])
final_df.head(20)

Unnamed: 0,X,Y,Z,SVM,roll,pitch,id,Dacon_label,Dacon_desc
0,4.39083,7.597212,-4.522435,9.871648,50.317909,26.410029,0,3.0,Statit
1,4.4339,7.568498,-4.546363,9.879831,50.000981,26.665632,0,3.0,Statit
2,4.4674,7.590033,-4.532006,9.904815,50.022551,26.809991,0,3.0,Statit
3,4.366901,7.587641,-4.54397,9.863559,50.287484,26.278249,0,3.0,Statit
4,4.453043,7.623533,-4.54397,9.929527,50.153516,26.645226,0,3.0,Statit
5,4.323831,7.661818,-4.587041,9.921689,50.554494,25.835944,0,3.0,Statit
6,4.400401,7.649854,-4.522435,9.916461,50.482457,26.343207,0,3.0,Statit
7,4.414758,7.554141,-4.54397,9.859147,50.014427,26.601526,0,3.0,Statit
8,4.455436,7.573284,-4.565506,9.901988,49.891385,26.74071,0,3.0,Statit
9,4.383651,7.659425,-4.508078,9.909892,50.615661,26.253983,0,3.0,Statit


In [None]:
# Plot the predicted labels next to the raw X/Y/Z signal, `term` windows
# (= term * 10 samples) per figure.
term = 50

# Built once instead of on every iteration.
predicted_labels = pd.DataFrame(temp_list)

for (t1, t2) in zip(range(0, len(temp_list), term), range(0, len(origin_df), term*10)):
    # Explicit figure and axes: the size now applies to the first figure too
    # (setting rcParams after plt.figure(1) did not resize the figure that call
    # had already created).
    fig, (ax_labels, ax_signal) = plt.subplots(2, 1, figsize=(15, 5))
    # Titles follow `term` instead of the hard-coded 50 / 500.
    predicted_labels.iloc[t1:t1+term, :].plot(ax=ax_labels, title = f'{t1}:{t1+term}')
    origin_df.iloc[t2:t2+term*10, :3].plot(ax=ax_signal, title = f'{t2}:{t2+term*10}')
    plt.show()
    # Release the figure so a long run does not accumulate open figures.
    plt.close(fig)

    print("==============================================================================================================")