In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Set the base working directory; the relative paths used below are resolved against it
import os
os.chdir('/content/drive/MyDrive/Monthly_Workout')

In [1]:
# 모듈 불러오기
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from math import pi

In [4]:
# Load the data
path = './data/'  # sub-path appended to the base working directory

train = pd.read_csv(f'{path}train_features.csv')
train_labels = pd.read_csv(f'{path}train_labels.csv')
test = pd.read_csv(f'{path}test_features.csv')
submission = pd.read_csv(f'{path}sample_submission.csv')

In [5]:
# Column groups: accelerometer axes, gyroscope axes, and every column after the first two
acc_list = ['acc_x', 'acc_y', 'acc_z']
gy_list = ['gy_x', 'gy_y', 'gy_z']
act_list = train.columns[2:]
act_list

Index(['acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z'], dtype='object')

In [6]:
# Split the data into accelerometer (acc) and gyroscope (gy) arrays
def sensor_split(data):
    """Split a long-format sensor frame into per-id acc and gy arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'id' column plus the columns named in the
        module-level ``acc_list`` and ``gy_list``.

    Returns
    -------
    (X_acc, X_gy) : tuple of numpy.ndarray
        Each is reshaped to (-1, 600, 3); samples follow the order in which
        ids first appear in ``data``.
    """
    X_acc = []
    X_gy = []

    # A single groupby pass replaces two full boolean-mask scans of the frame
    # per id. sort=False keeps groups in first-appearance order, i.e. the same
    # order data['id'].unique() yields.
    for _, group in tqdm(data.groupby('id', sort=False)):
        X_acc.append(group.loc[:, acc_list].to_numpy())
        X_gy.append(group.loc[:, gy_list].to_numpy())

    # The fixed (600, 3) layout is kept from the original code.
    # NOTE(review): assumes every id has exactly 600 rows - confirm.
    X_acc = np.array(X_acc).reshape(-1, 600, 3)
    X_gy = np.array(X_gy).reshape(-1, 600, 3)

    return X_acc, X_gy

In [7]:
# 데이터 증강

def permutation(X, nPerm=4, minSegLength=10):
    """Cut ``X`` into ``nPerm`` segments along axis 0 and return them in random order.

    Parameters
    ----------
    X : numpy.ndarray
        Array with at least two dimensions; it is sliced along its first axis.
    nPerm : int
        Number of segments.
    minSegLength : int
        Every segment must be strictly longer than this many rows.

    Returns
    -------
    numpy.ndarray
        Float array (allocated with ``np.zeros``) of the same shape as ``X``
        holding the shuffled segments.

    Raises
    ------
    ValueError
        If ``nPerm`` is below 1, or if ``X`` is too short to hold ``nPerm``
        segments that are each longer than ``minSegLength``. Without this
        check the rejection-sampling loop below would never terminate.
    """
    n_rows = X.shape[0]
    if nPerm < 1:
        raise ValueError("nPerm must be at least 1")
    if nPerm * (minSegLength + 1) > n_rows:
        raise ValueError(
            "X has %d rows, which cannot hold %d segments longer than %d"
            % (n_rows, nPerm, minSegLength)
        )

    X_new = np.zeros(X.shape)
    # Drawn before the cut points so the random-number order matches the original code.
    idx = np.random.permutation(nPerm)

    # Rejection sampling: redraw the cut points until every segment is long enough.
    while True:
        segs = np.zeros(nPerm + 1, dtype=int)
        segs[1:-1] = np.sort(np.random.randint(minSegLength, n_rows - minSegLength, nPerm - 1))
        segs[-1] = n_rows
        if np.min(segs[1:] - segs[0:-1]) > minSegLength:
            break

    # Copy the segments into the output in the shuffled order.
    pp = 0
    for ii in range(nPerm):
        x_temp = X[segs[idx[ii]]:segs[idx[ii] + 1], :]
        X_new[pp:pp + len(x_temp), :] = x_temp
        pp += len(x_temp)
    return X_new

def aug(data, uid, shift):
    """Return sample ``uid`` of ``data`` cyclically shifted by ``shift`` positions along its first axis."""
    sample = data[uid]
    return np.roll(sample, shift, axis=0)
def rolling(data):
    """Cyclically shift every sample of ``data`` by its own random offset.

    Parameters
    ----------
    data : numpy.ndarray
        Array of shape (n_samples, seq_len, ...).

    Returns
    -------
    numpy.ndarray
        Array of the same shape; sample ``i`` is ``data[i]`` rolled along its
        first axis by a random integer in ``[0, seq_len)``.
    """
    # The bound is taken from the data instead of a hard-coded 600; for
    # 600-step sequences the drawn shifts are the same as before.
    seq_len = data.shape[1]
    return np.array([
        np.roll(data[i], int(random.random() * seq_len), axis=0)
        for i in range(data.shape[0])
    ])

In [8]:
# Data augmentation (adjust `rep` to repeat as many times as you like)
def start_augmentation(train, train_labels, rep=3):
    """Augment the training sequences with random rolling and segment permutation.

    Each repetition appends two augmented copies of the original samples:
    a rolled copy and a rolled-then-permuted copy.

    Parameters
    ----------
    train : pandas.DataFrame
        Long-format sensor readings, passed to ``sensor_split`` after merging.
    train_labels : pandas.DataFrame
        Must contain 'id' and 'label' columns.
    rep : int, default 3
        Number of augmentation repetitions. The original note warns that
        values of 5 or more are likely to reset the runtime (memory).

    Returns
    -------
    (X_train_acc, X_train_gy, y_train_total)
        The sensor arrays hold ``(1 + 2 * rep)`` times the original samples,
        ordered as [original, roll_1, roll+perm_1, roll_2, roll+perm_2, ...];
        ``y_train_total`` repeats the labels in the same order.
    """
    # Split into acc / gy arrays
    X_train_mod = pd.merge(train, train_labels, how='left', on='id')
    X_train_acc, X_train_gy = sensor_split(X_train_mod)

    # Stack both sensors channel-wise so each augmentation uses the same shift
    # and the same segment order for acc and gy. Augmenting them separately
    # would desynchronise the two sensors in time.
    n_acc = X_train_acc.shape[-1]
    X_base = np.concatenate([X_train_acc, X_train_gy], axis=-1)

    # NOTE(review): assumes train_labels rows are in the same order as ids
    # first appear in `train` - confirm.
    y_train = np.asarray(train_labels['label'])

    X_parts = [X_base]
    y_parts = [y_train]
    for _ in range(rep):
        X_roll = rolling(X_base)
        # permutation() shuffles along axis 0, so it must be applied to one
        # sample at a time. Passing the whole batch would reorder the samples
        # themselves and break their alignment with the labels.
        X_rp = np.array([permutation(sample) for sample in rolling(X_base)])  # rolling + permutation

        # Append the augmented data (and matching labels) after the original data
        X_parts.extend([X_roll, X_rp])
        y_parts.extend([y_train, y_train])

    X_total = np.concatenate(X_parts, axis=0)
    y_train_total = np.concatenate(y_parts, axis=0)

    X_train_acc = X_total[..., :n_acc]
    X_train_gy = X_total[..., n_acc:]
    return X_train_acc, X_train_gy, y_train_total

In [9]:
# Run the augmentation pipeline, then display the shapes of the resulting arrays
X_train_acc, X_train_gy, y_train_total = start_augmentation(train, train_labels)

X_train_acc.shape, X_train_gy.shape, y_train_total.shape

100%|██████████| 3125/3125 [00:19<00:00, 157.52it/s]


((21875, 600, 3), (21875, 600, 3), (21875,))

In [10]:
# Convert the numpy arrays into a DataFrame
def np_to_df(X_train_acc, X_train_gy):
    """Flatten the acc and gy sample arrays into one long-format DataFrame.

    Parameters
    ----------
    X_train_acc, X_train_gy : array-like
        Arrays of shape (n_samples, n_steps, 3).

    Returns
    -------
    pandas.DataFrame
        One row per (sample, step) with columns
        acc_x, acc_y, acc_z, gy_x, gy_y, gy_z.
    """
    acc = np.asarray(X_train_acc)
    gy = np.asarray(X_train_gy)

    # A single reshape yields the same (n_samples * n_steps, 3) table as
    # collecting every row in a Python list and stacking it, without creating
    # millions of small array objects.
    df_report_acc = acc.reshape(-1, acc.shape[-1])
    df_report_gy = gy.reshape(-1, gy.shape[-1])

    df_acc = pd.DataFrame(df_report_acc, columns=['acc_x', 'acc_y', 'acc_z'])
    df_gy = pd.DataFrame(df_report_gy, columns=['gy_x', 'gy_y', 'gy_z'])

    # Merge the acc and gy frames side by side
    df_aug_result = pd.concat([df_acc, df_gy], axis=1)

    return df_aug_result

In [11]:
# Rebind `train` to the augmented long-format frame (this replaces the frame read from CSV)
train = np_to_df(X_train_acc, X_train_gy)
train

Unnamed: 0,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112
1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216
2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629
3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013
4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234
...,...,...,...,...,...,...
13124995,-0.768373,-0.333116,0.319087,-1.333427,-24.402227,-17.841986
13124996,-0.817662,-0.347440,0.304618,-2.577915,-19.726341,-13.977766
13124997,-0.830871,-0.335602,0.270336,-2.948882,-16.471874,-7.634414
13124998,-0.799299,-0.334281,0.262900,-5.115464,-12.390227,-2.914647


In [12]:
# Magnitude features, added in place to both train and test.
for frame in (train, test):
    acc_sq_norm = (frame['acc_x'] ** 2) + (frame['acc_y'] ** 2) + (frame['acc_z'] ** 2)
    gy_sq_norm = (frame['gy_x'] ** 2) + (frame['gy_y'] ** 2) + (frame['gy_z'] ** 2)

    # Cube root of the squared norm. `**` binds tighter than `+`, so the
    # exponent has to wrap the whole sum; written without the parentheses it
    # applied to the z term only.
    frame['acc_t'] = acc_sq_norm ** (1/3)
    frame['gy_t'] = gy_sq_norm ** (1/3)

    # Squared norm: amplifies the signal (useful for catching peaks)
    frame['acc_mag'] = acc_sq_norm
    frame['gy_mag'] = gy_sq_norm

In [13]:
# Vector features, added in place to both train and test
for frame in (train, test):
    # Euclidean norm of each sensor's three axes
    frame['acc_vec'] = np.sqrt((frame['acc_x'] ** 2) + (frame['acc_y'] ** 2) + (frame['acc_z'] ** 2))
    frame['gy_vec'] = np.sqrt((frame['gy_x'] ** 2) + (frame['gy_y'] ** 2) + (frame['gy_z'] ** 2))

    # Gyroscope "center of gravity": mean of the three gyroscope axes
    frame['gy_gravity'] = (frame['gy_x'] + frame['gy_y'] + frame['gy_z']) / 3

In [14]:
# roll & pitch
def _tilt(numerator, a, b):
    """Return arctan(numerator / sqrt(a**2 + b**2)) in radians."""
    return np.arctan(numerator / np.sqrt(a ** 2 + b ** 2))

# The same eight columns are added to train and test, first for the
# accelerometer axes (no prefix) and then for the gyroscope axes ('gy_' prefix).
for frame in (train, test):
    for prefix, sensor in (('', 'acc'), ('gy_', 'gy')):
        ax_x = frame[sensor + '_x']
        ax_y = frame[sensor + '_y']
        ax_z = frame[sensor + '_z']

        # radians
        frame[prefix + 'roll'] = _tilt(ax_y, ax_x, ax_z)
        frame[prefix + 'pitch'] = _tilt(ax_x, ax_y, ax_z)
        # degrees; math_roll uses the negated x axis
        frame[prefix + 'math_roll'] = _tilt(- ax_x, ax_y, ax_z) * (180/pi)
        frame[prefix + 'math_pitch'] = _tilt(ax_y, ax_x, ax_z) * (180/pi)

print(train.shape)
train

(13125000, 21)


In [None]:
# Scaling: use whichever scaler you prefer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# Capture the training column order before `train` is rebound to an ndarray.
feature_cols = list(train.columns)

scaler = RobustScaler()
train = scaler.fit_transform(train)
# Selecting by name leaves out the columns of `test` that are not training
# features (e.g. 'id', 'time') without mutating `test` in place, and guarantees
# that test features are in the same order as the ones the scaler was fitted on.
test = scaler.transform(test[feature_cols])
train

Error: Session cannot generate requests

In [16]:
import tensorflow as tf 
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Input
from keras.layers import TimeDistributed
from keras.layers import Activation, GlobalAveragePooling1D
from keras.layers import Dense, Flatten, BatchNormalization
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from keras.models import load_model
from keras.layers.merge import concatenate

In [17]:
# Regroup the flat (rows, features) array into sequences of 600 rows each
len_features = train.shape[1] # number of features
X = train.reshape(-1, 600, len_features)
X.shape

(28125, 600, 21)

In [18]:
# Encode the labels with keras' to_categorical and display the resulting shape
y = to_categorical(y_train_total) 
y.shape

(28125, 61)

In [19]:
epochs, batch_size = 100, 128 # with 24 GB of RAM, a batch size of 256 or more is likely to reset the runtime

In [20]:
# Regroup the scaled test array into sequences of 600 rows each, like X
test_X = test.reshape(-1, 600, len_features)
test_X.shape

(782, 600, 21)

In [None]:
# Three values need three targets; unpacking them into two names raises ValueError.
# NOTE(review): `verbose` is assumed to be the missing first name, and this line
# overrides the epochs/batch_size set in an earlier cell - confirm that is intended.
verbose, epochs, batch_size = 0, 25, 64
# Use the last axis so the feature count stays correct if this cell is re-run
# after X has already been reshaped to 4-D.
n_features, n_outputs = X.shape[-1], y.shape[1]
# reshape data into time steps of sub-sequences
n_steps, n_length = 6, 100
# `trainX` is not defined anywhere in this notebook; the training array is `X`.
X = X.reshape((X.shape[0], n_steps, n_length, n_features))
test_X = test_X.reshape((test_X.shape[0], n_steps, n_length, n_features))

In [26]:
# Vary the layer sizes across the variants below so they can be stacked together
def define_model_0():
    """Build and compile CNN-LSTM variant 0.

    Layers: Conv1D(64, k=3) -> Conv1D(64, k=3) -> Dropout(0.5) -> MaxPooling1D(2)
    -> Flatten (all time-distributed) -> LSTM(32) -> Dropout(0.5) -> Dense(128)
    -> softmax. Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_1():
    """Build and compile CNN-LSTM variant 1.

    Layers: Conv1D(32, k=3) -> Dropout(0.5) -> Conv1D(32, k=3) -> MaxPooling1D(2)
    -> Flatten (all time-distributed) -> LSTM(32) -> Dropout(0.5) -> Dense(128)
    -> softmax. Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu')),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_2():
    """Build and compile CNN-LSTM variant 2.

    Layers: Conv1D(64, k=6) -> Conv1D(64, k=3) -> Dropout(0.5) -> MaxPooling1D(2)
    -> Flatten (all time-distributed) -> LSTM(32) -> Dropout(0.5) -> Dense(128)
    -> softmax. Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=64, kernel_size=6, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_3():
    """Build and compile CNN-LSTM variant 3.

    Layers: Conv1D(32, k=3) -> Conv1D(64, k=6) -> Dropout(0.5) -> MaxPooling1D(2)
    -> Flatten (all time-distributed) -> LSTM(32) -> Dropout(0.3) -> Dense(128)
    -> softmax. Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(Conv1D(filters=64, kernel_size=6, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(32),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_4():
    """Build and compile CNN-LSTM variant 4.

    Layers: Conv1D(32, k=3) -> Conv1D(32, k=3) -> Dropout(0.5)
    -> GlobalAveragePooling1D -> Flatten (all time-distributed) -> LSTM(32)
    -> Dropout(0.5) -> Dense(128) -> softmax.
    Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(GlobalAveragePooling1D()),
        TimeDistributed(Flatten()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_5():
    """Build and compile CNN-LSTM variant 5.

    Layers: Conv1D(32, k=3) -> BatchNormalization -> Conv1D(64, k=6)
    -> Dropout(0.5) -> GlobalAveragePooling1D (all time-distributed) -> LSTM(32)
    -> Dropout(0.5) -> Dense(128) -> softmax.
    Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=32, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(BatchNormalization()),
        TimeDistributed(Conv1D(filters=64, kernel_size=6, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(GlobalAveragePooling1D()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_6():
    """Build and compile CNN-LSTM variant 6.

    Layers: Conv1D(64, k=3) -> BatchNormalization -> Conv1D(64, k=3)
    -> Dropout(0.5) -> GlobalAveragePooling1D (all time-distributed) -> LSTM(32)
    -> Dropout(0.3) -> Dense(128) -> softmax.
    Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(BatchNormalization()),
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(GlobalAveragePooling1D()),
        LSTM(32),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

def define_model_7():
    """Build and compile CNN-LSTM variant 7.

    Layers: Conv1D(64, k=6) -> BatchNormalization -> Conv1D(64, k=3)
    -> Dropout(0.5) -> GlobalAveragePooling1D (all time-distributed) -> LSTM(32)
    -> Dropout(0.5) -> Dense(128) -> softmax.
    Reads n_length, n_features and n_outputs from module scope.
    """
    cnn_lstm = Sequential([
        TimeDistributed(Conv1D(filters=64, kernel_size=6, activation='relu'),
                        input_shape=(None, n_length, n_features)),
        TimeDistributed(BatchNormalization()),
        TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')),
        TimeDistributed(Dropout(0.5)),
        TimeDistributed(GlobalAveragePooling1D()),
        LSTM(32),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    ])
    cnn_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn_lstm

In [None]:
# Train each of the eight CNN-LSTM variants in turn and save it under models/.
# model.save() does not create missing parent directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
for i in range(8):
    # getattr() needs both an object and an attribute name; the builder
    # functions live in this notebook's global namespace, so look them up there.
    model = globals()[f'define_model_{i}']()
    checkpoint_path = "checkpoint/cp.ckpt"
    cp_callback = ModelCheckpoint(filepath=checkpoint_path, monitor='loss', 
                                verbose=1, save_weights_only=True, 
                                save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='loss', patience=30, mode='min')
    model.fit(X, y, epochs=epochs, batch_size=batch_size, 
            validation_split=0.2, callbacks=[early_stopping, cp_callback])
    model.save(f'models/model_{i}.h5')
    # Drop the finished model's graph state before building the next one.
    tf.keras.backend.clear_session()

In [None]:
# Load the saved models back as model0 ... model7.
# All eight are needed by the stacking cell, and the path must match the
# 'models/model_{i}.h5' files written by the training loop.
for i in range(8):
    loaded = load_model(f'models/model_{i}.h5')
    # NOTE(review): every sub-model was built right after clear_session(), so
    # they presumably all carry the same auto-generated name. Keras requires
    # unique layer names inside one functional Model, so give each a distinct
    # name before stacking - confirm `_name` is honoured on the Keras version in use.
    loaded._name = f'model_{i}'
    globals()[f'model{i}'] = loaded

In [None]:
# Stacked ensemble: feed the same input to all eight base models, concatenate
# their softmax outputs and learn a small dense head on top.
# X was reshaped to (samples, n_steps, n_length, n_features) and the base
# models were built for that 4-D layout, so the shared input must match it.
inputs = Input(shape=(n_steps, n_length, n_features))

merge = concatenate([model0(inputs), model1(inputs), model2(inputs), model3(inputs), 
                    model4(inputs), model5(inputs), model6(inputs), model7(inputs)])
hidden = Dense(10, activation='relu')(merge)
# Use the class count derived from y instead of a hard-coded 61.
output = Dense(n_outputs, activation='softmax')(hidden)

model = Model(inputs=inputs, outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the stacked model, checkpointing the best weights by validation loss.
checkpoint_path = "checkpoint/cp.ckpt"
cp_callback = ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

early_stopping = EarlyStopping(monitor='val_loss', patience=30, mode='min')
# The checkpoint callback defined above is `cp_callback`; `cp_callback_e` does not exist.
model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_split=0.02, callbacks=[early_stopping, cp_callback])

In [None]:
# Predict on the reshaped test set with the stacked model and display the output shape
prediction = model.predict(test_X)
prediction.shape

In [None]:
# Display the submission frame before it is filled in
submission

In [None]:
# Overwrite every column after the first with the predictions
submission.iloc[:,1:]=prediction

In [None]:
# Display the submission frame after the predictions were written into it
submission

In [None]:
# to_csv does not create missing parent directories, so make sure the folder exists.
os.makedirs('submission', exist_ok=True)
submission.to_csv('submission/cnn_LSTM_stacked_5(jun).csv', index=False)