#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [552]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 머신 러닝 모델
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 딥러닝 모델
from keras.backend import clear_session
from keras.models import Sequential, Model
from keras.layers import Dense, Input, SimpleRNN, LSTM
from keras.optimizers import Adam

# 모델 평가
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from google.colab import drive
import joblib

In [553]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [554]:
project_path = '/content/drive/MyDrive/미니 프로젝트/'

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - data01_test.csv 를 불러와 'new_data' 이름으로 저장합니다.


In [555]:
data = pd.read_csv(project_path+'data01_train.csv')
new_data = pd.read_csv(project_path+'data01_test.csv')

In [556]:
data.drop('subject', axis=1, inplace=True)
new_data.drop('subject', axis=1, inplace=True)

## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [557]:
# data_copy = data.copy()
# new_data_copy = data.copy()
data['Activity_dynamic'] = data['Activity'].replace(['STANDING', 'SITTING', 'LAYING'], 0)
data.loc[data['Activity_dynamic']!=0,'Activity_dynamic'] = 1
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1


In [558]:
target1 = 'Activity'
target2 = 'Activity_dynamic'

# top 100 data
# ft_data = joblib.load(project_path + 'features_df.pkl')
# top100_feature = list(ft_data.sort_values(by='activity_importance', ascending=False)['feature_name'][:100].values)
# top100_feature = list(set(top100_feature))
# top100_feature += [target1, target2]
# data = data.loc[:, top100_feature]

# data
x = data.drop(columns=[target1, target2], axis=1)
y1 = data.loc[:, target1]
y2 = data.loc[:, target2].astype(int)
print(x.shape, y1.shape)

from sklearn.model_selection import train_test_split

x_train, x_test, y1_train, y1_test = train_test_split(x, y1, test_size=0.1)
x_train, x_test, y2_train, y2_test = train_test_split(x, y2, test_size=0.1)
print(x_train.shape, y1_test.shape)

(5881, 561) (5881,)
(5292, 561) (589,)


#### 스케일링

In [559]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
# joblib.dump(scaler, 'scaler.pkl')
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [560]:
algorithms = ['LR', 'KNN', 'DT', 'RF', 'XGB', 'LGBM', 'SVM', 'DL'] # 분류
result = pd.DataFrame({'name':algorithms})
result

Unnamed: 0,name
0,LR
1,KNN
2,DT
3,RF
4,XGB
5,LGBM
6,SVM
7,DL


In [561]:
y2_test.value_counts()

0    312
1    277
Name: Activity_dynamic, dtype: int64

In [562]:
# 성능 저장하는 함수
def get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    if not score_name in result.columns:
      result[score_name] = 0
    result.loc[result['name']==name, score_name] = accuracy
    print(f'{name} accuracy score : {accuracy}')
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

#### 1) 알고리즘1 : LR

In [563]:
score_name = 'Activity_dynamic'
name = 'LR'
model = LogisticRegression(max_iter=500)
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)
result

LR accuracy score : 0.99830220713073
[[311   1]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



Unnamed: 0,name,Activity_dynamic
0,LR,0.998302
1,KNN,0.0
2,DT,0.0
3,RF,0.0
4,XGB,0.0
5,LGBM,0.0
6,SVM,0.0
7,DL,0.0


#### 2) 알고리즘2 : KNN

In [564]:
name = 'KNN'
model = KNeighborsClassifier()
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)

KNN accuracy score : 1.0
[[312   0]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [565]:
# DT
name = 'DT'
model = DecisionTreeClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)

DT accuracy score : 0.99830220713073
[[311   1]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [566]:
# RF
name = 'RF'
model = RandomForestClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)
# joblib.dump(model, 'is_dynamic.pkl')

RF accuracy score : 0.99830220713073
[[311   1]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [567]:
# svm
name = 'SVM'
model = SVC(kernel='linear')
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)
# joblib.dump(model, 'is_dynamic_svm.pkl')

SVM accuracy score : 1.0
[[312   0]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [568]:
# xgb
name = 'XGB'
model = XGBClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)

XGB accuracy score : 0.99830220713073
[[311   1]
 [  0 277]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [569]:
# lgbm
name = 'LGBM'
model = LGBMClassifier(max_depth=5, verbose=-1)
get_score_dynamic(name ,model, x_train, y2_train, x_test, y2_test, score_name)

[LightGBM] [Info] Number of positive: 2370, number of negative: 2922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140141
[LightGBM] [Info] Number of data points in the train set: 5292, number of used features: 561
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.447846 -> initscore=-0.209378
[LightGBM] [Info] Start training from score -0.209378
LGBM accuracy score : 0.9966044142614601
[[311   1]
 [  1 276]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       312
           1       1.00      1.00      1.00       277

    accuracy                           1.00       589
   macro avg       1.00      1.00      1.00       589
weighted avg       1.00      1.00      1.00       589



In [570]:
result

Unnamed: 0,name,Activity_dynamic
0,LR,0.998302
1,KNN,1.0
2,DT,0.998302
3,RF,0.998302
4,XGB,0.998302
5,LGBM,0.996604
6,SVM,1.0
7,DL,0.0


### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [571]:
data_static = data.loc[data['Activity_dynamic']==0,:]
data_static.loc[:, 'Activity'].replace({'LAYING':0, 'SITTING':1, 'STANDING':2},inplace=True)
# x, y  분리
x = data_static.drop(columns=[target1, target2], axis=1)
y = data_static.loc[:, target1] # Activity

# 훈련 데이터 , 학습 데이터 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_static.loc[:, 'Activity'].replace({'LAYING':0, 'SITTING':1, 'STANDING':2},inplace=True)


In [572]:
score_name = 'only_static'

In [573]:
name = 'LR'
model = LogisticRegression(max_iter=500)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

LR accuracy score : 0.9660493827160493
[[113   0   0]
 [  0 101   3]
 [  0   8  99]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.93      0.97      0.95       104
           2       0.97      0.93      0.95       107

    accuracy                           0.97       324
   macro avg       0.97      0.97      0.97       324
weighted avg       0.97      0.97      0.97       324



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [574]:
name = 'KNN'
model = KNeighborsClassifier()
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

KNN accuracy score : 0.9444444444444444
[[112   1   0]
 [  0  94  10]
 [  0   7 100]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       113
           1       0.92      0.90      0.91       104
           2       0.91      0.93      0.92       107

    accuracy                           0.94       324
   macro avg       0.94      0.94      0.94       324
weighted avg       0.94      0.94      0.94       324



In [575]:
# DT
name = 'DT'
model = DecisionTreeClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

DT accuracy score : 0.9382716049382716
[[113   0   0]
 [  0  97   7]
 [  0  13  94]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.88      0.93      0.91       104
           2       0.93      0.88      0.90       107

    accuracy                           0.94       324
   macro avg       0.94      0.94      0.94       324
weighted avg       0.94      0.94      0.94       324



In [576]:
# RF
name = 'RF'
model = RandomForestClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

RF accuracy score : 0.9475308641975309
[[113   0   0]
 [  0  97   7]
 [  0  10  97]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.91      0.93      0.92       104
           2       0.93      0.91      0.92       107

    accuracy                           0.95       324
   macro avg       0.95      0.95      0.95       324
weighted avg       0.95      0.95      0.95       324



In [577]:
# svm
name = 'SVM'
model = SVC(kernel='linear')
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

SVM accuracy score : 0.9722222222222222
[[113   0   0]
 [  0 100   4]
 [  0   5 102]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.95      0.96      0.96       104
           2       0.96      0.95      0.96       107

    accuracy                           0.97       324
   macro avg       0.97      0.97      0.97       324
weighted avg       0.97      0.97      0.97       324



In [578]:
# xgb
name = 'XGB'
model = XGBClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)
# joblib.dump(model, 'only_static.pkl')

XGB accuracy score : 0.9876543209876543
[[113   0   0]
 [  0 104   0]
 [  0   4 103]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.96      1.00      0.98       104
           2       1.00      0.96      0.98       107

    accuracy                           0.99       324
   macro avg       0.99      0.99      0.99       324
weighted avg       0.99      0.99      0.99       324



In [579]:
x_test

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
5784,0.277646,-0.011806,-0.100328,-0.996564,-0.972248,-0.976464,-0.996565,-0.972251,-0.976928,-0.943374,...,0.180809,-0.436570,-0.822592,0.025468,-0.022118,-0.362214,-0.235848,-0.873367,0.172610,-0.030471
4246,0.283627,-0.003634,-0.113162,-0.993716,-0.933247,-0.937935,-0.994316,-0.927433,-0.931832,-0.934076,...,0.241658,-0.216179,-0.527463,0.025485,-0.110183,0.124452,0.695723,-0.786662,0.244272,0.038093
1941,0.273215,-0.017626,-0.109536,-0.989177,-0.995486,-0.989666,-0.989727,-0.994684,-0.988868,-0.931245,...,0.073132,-0.467592,-0.841995,0.239544,0.017011,-0.462243,-0.193406,0.678570,-0.582228,-0.416075
5205,0.280025,-0.016467,-0.099707,-0.998510,-0.989748,-0.980353,-0.998534,-0.989768,-0.980262,-0.942569,...,0.326831,-0.679869,-0.906166,-0.056807,0.147142,-0.030637,0.054931,-0.844499,0.197941,0.059498
3282,0.277957,-0.017634,-0.110310,-0.997951,-0.992788,-0.985679,-0.998208,-0.993112,-0.986222,-0.940541,...,0.045654,-0.046989,-0.417805,0.075910,-0.185529,-0.511842,-0.057414,-0.739847,-0.050085,-0.092297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738,0.276512,-0.016519,-0.108759,-0.996508,-0.990530,-0.981242,-0.997235,-0.989855,-0.978639,-0.937401,...,0.312999,-0.579166,-0.822822,0.225943,0.297958,0.437339,0.551523,-0.626728,-0.112376,-0.138055
3950,0.278583,-0.009776,-0.103464,-0.996529,-0.970977,-0.995630,-0.997017,-0.971456,-0.995814,-0.939260,...,0.064118,-0.233548,-0.634068,-0.296167,0.159006,0.027545,-0.009900,-0.564490,-0.151292,-0.155448
3943,0.283156,-0.022120,-0.124696,-0.994269,-0.946728,-0.979608,-0.995052,-0.940315,-0.979405,-0.937920,...,0.439220,-0.619295,-0.884802,0.043939,-0.039754,0.056114,-0.305584,-0.718426,0.104661,-0.186138
4994,0.271983,-0.013392,-0.112468,-0.972604,-0.904900,-0.954860,-0.976562,-0.921570,-0.961489,-0.912017,...,-0.325685,0.313204,-0.003194,0.084514,-0.134997,0.173917,0.308334,-0.545415,-0.212295,-0.047621


In [580]:
import re
x_train = x_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)) # lgbm이 feature에 특수문자가 있는 데이터를 학습 못시킨다고 에러내서 없애는 코드;;

# lgbm
name = 'LGBM'
model = LGBMClassifier(max_depth=5, verbose=-1)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)
# joblib.dump(model, 'only_static_lgbm.pkl')

LGBM accuracy score : 0.9783950617283951
[[113   0   0]
 [  0 102   2]
 [  0   5 102]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       0.95      0.98      0.97       104
           2       0.98      0.95      0.97       107

    accuracy                           0.98       324
   macro avg       0.98      0.98      0.98       324
weighted avg       0.98      0.98      0.98       324



['only_static_lgbm.pkl']

In [581]:
result

Unnamed: 0,name,Activity_dynamic,only_static
0,LR,0.998302,0.966049
1,KNN,1.0,0.944444
2,DT,0.998302,0.938272
3,RF,0.998302,0.947531
4,XGB,0.998302,0.987654
5,LGBM,0.996604,0.978395
6,SVM,1.0,0.972222
7,DL,0.0,0.0


### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [582]:
data_d = data.loc[data['Activity_dynamic']==1,:]
data_d.loc[:, 'Activity'].replace({'WALKING':0, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':2},inplace=True)
# x, y  분리
x = data_d.drop(columns=[target1, target2], axis=1)
y = data_d.loc[:, target1] # Activity

# 훈련 데이터 , 학습 데이터 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_d.loc[:, 'Activity'].replace({'WALKING':0, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':2},inplace=True)


In [583]:
score_name = 'only_dynamic'

# LR
name = 'LR'
model = LogisticRegression(max_iter=500)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

# KNN
name = 'KNN'
model = KNeighborsClassifier()
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)
# joblib.dump(model, 'only_dynamic_knn.pkl')

# DT
name = 'DT'
model = DecisionTreeClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

# RF
name = 'RF'
model = RandomForestClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

# svm
name = 'SVM'
model = SVC(kernel='linear')
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)
# joblib.dump(model, 'only_dynamic_svm.pkl')

# xgb
name = 'XGB'
model = XGBClassifier(max_depth=5)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)

import re
x_train = x_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)) # lgbm이 feature에 특수문자가 있는 데이터를 학습 못시킨다고 에러내서 없애는 코드;;

# lgbm
name = 'LGBM'
model = LGBMClassifier(max_depth=5, verbose=-1)
get_score_dynamic(name ,model, x_train, y_train, x_test, y_test, score_name)
# joblib.dump(model, 'only_dynamic_lgbm.pkl')

LR accuracy score : 0.9962264150943396
[[92  1  0]
 [ 0 91  0]
 [ 0  0 81]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        93
           1       0.99      1.00      0.99        91
           2       1.00      1.00      1.00        81

    accuracy                           1.00       265
   macro avg       1.00      1.00      1.00       265
weighted avg       1.00      1.00      1.00       265

KNN accuracy score : 0.9962264150943396
[[93  0  0]
 [ 0 91  0]
 [ 0  1 80]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        93
           1       0.99      1.00      0.99        91
           2       1.00      0.99      0.99        81

    accuracy                           1.00       265
   macro avg       1.00      1.00      1.00       265
weighted avg       1.00      1.00      1.00       265

DT accuracy score : 0.939622641509434
[[86  4  3]
 [ 4 85  2]
 [ 2  1 78]]
          

In [584]:
result

Unnamed: 0,name,Activity_dynamic,only_static,only_dynamic
0,LR,0.998302,0.966049,0.996226
1,KNN,1.0,0.944444,0.996226
2,DT,0.998302,0.938272,0.939623
3,RF,0.998302,0.947531,0.94717
4,XGB,0.998302,0.987654,0.984906
5,LGBM,0.996604,0.978395,0.996226
6,SVM,1.0,0.972222,1.0
7,DL,0.0,0.0,0.0


### 딥러닝

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.284379,-0.021981,-0.116683,-0.992490,-0.979640,-0.963321,-0.992563,-0.977304,-0.958142,-0.938850,...,-0.850065,-0.018043,0.092304,0.074220,-0.714534,-0.671943,-0.018351,-0.185733,22,SITTING
1,0.277440,-0.028086,-0.118412,-0.996620,-0.927676,-0.972294,-0.997346,-0.931405,-0.971788,-0.939837,...,-0.613367,-0.022456,-0.155414,0.247498,-0.112257,-0.826816,0.184489,-0.068699,15,STANDING
2,0.305833,-0.041023,-0.087303,0.006880,0.182800,-0.237984,0.005642,0.028616,-0.236474,0.016311,...,0.394388,-0.362616,0.171069,0.576349,-0.688314,-0.743234,0.272186,0.053101,22,WALKING
3,0.276053,-0.016487,-0.108381,-0.995379,-0.983978,-0.975854,-0.995877,-0.985280,-0.974907,-0.941425,...,-0.841455,0.289548,0.079801,-0.020033,0.291898,-0.639435,-0.111998,-0.123298,8,SITTING
4,0.271998,0.016904,-0.078856,-0.973468,-0.702462,-0.869450,-0.979810,-0.711601,-0.856807,-0.920760,...,0.214219,0.010111,0.114179,-0.830776,-0.325098,-0.840817,0.116237,-0.096615,5,STANDING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466,0.278725,-0.015262,-0.106398,-0.993625,-0.982845,-0.974745,-0.993963,-0.981100,-0.974596,-0.939303,...,-0.874066,-0.054788,0.712975,0.300318,-0.319188,-0.857336,0.120184,0.119276,14,SITTING
1467,0.275803,-0.019257,-0.109078,-0.998614,-0.991621,-0.987403,-0.998813,-0.991503,-0.986802,-0.945442,...,-0.721050,0.076333,-0.021599,-0.277268,0.754011,-0.764185,0.212111,0.138595,16,STANDING
1468,0.240402,0.006361,-0.121377,-0.045805,0.189930,0.332664,-0.114706,0.157771,0.195271,0.210139,...,-0.615554,0.330378,-0.667635,0.806563,-0.850113,-0.639564,0.185363,0.260201,8,WALKING_DOWNSTAIRS
1469,0.135873,-0.020675,-0.116644,-0.960526,-0.955134,-0.985818,-0.963115,-0.971338,-0.988261,-0.946289,...,-0.422383,-0.048474,0.236761,-0.186581,0.396648,0.790877,-0.474618,-0.505953,19,LAYING


In [585]:
# import joblib
# ft_data = joblib.load(project_path + 'features_df.pkl')
# ft_data.head()
# def pp(data, target):
#   x = data.drop(target, axis=1)
#   y = data.loc[:, target].astype(int)
#   x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
#   # 스케일링
#   scaler = MinMaxScaler()
#   x_train = scaler.fit_transform(x_train)
#   x_test = scaler.transform(x_test)

#   return x_train, x_test, y_train, y_test

### [선택사항] (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류 모델 합치기

In [586]:
def pipeline(data, is_d, only_d, only_s):
  # 데이터 전처리
  del_cols =['subject']
  data = data.drop(columns = del_cols, axis=1) # subject col 제거

  target = 'Activity'
  x = data.drop(target, axis=1) # x, y 나누기
  y = data.loc[:, target]
  print(f'x shape : {x.shape} y shape : {y.shape}')

  x = scaler.transform(x)  # 스케일링

  # 모델 추론
  y_pred1 = is_d.predict(x)  # 동적 행동 여부 모델 결과 저장
  data['is_dynamic_pred'] = y_pred1

  only_d_pred = data.loc[data['is_dynamic_pred']==1, :] # 동적 행동 여부에 따라 데이터 나누기
  only_s_pred = data.loc[data['is_dynamic_pred']==0, :]

  only_d_pred = only_d_pred.iloc[:, :-2] # feature가 아닌 열은 삭제
  only_s_pred = only_s_pred.iloc[:, :-2]

  y_pred2 = only_d.predict(only_d_pred) # is_dynamic에 따라 다른 모델 사용하고 결과 저장
  y_pred3 = only_s.predict(only_s_pred)

  # 결과 저장
  only_d_pred['Activity_pred'] = y_pred2 # 'WALKING':0, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':2
  only_s_pred['Activity_pred'] = y_pred3 # 'LAYING':0, 'SITTING':1, 'STANDING':2

  # 각각 결과 문자열로 변환
  only_d_pred['Activity_pred'] = only_d_pred['Activity_pred'].replace({0:'WALKING', 1:'WALKING_UPSTAIRS', 2:'WALKING_DOWNSTAIRS'})
  only_s_pred['Activity_pred'] = only_s_pred['Activity_pred'].replace({0:'LAYING', 1:'SITTING', 2:'STANDING'})

  # 하나로 합치기
  print('concat')
  pred = pd.concat([only_d_pred, only_s_pred], axis=0).sort_index() # 합치고 index 정렬
  y_pred = pred['Activity_pred']

  # 평가
  print('accuracy_score :', accuracy_score(y, y_pred))
  print(confusion_matrix(y, y_pred))
  print(classification_report(y, y_pred))

In [587]:
# 모델 불러오기
is_d_rf = joblib.load(project_path+'is_dynamic_rf.pkl')
only_d_knn = joblib.load(project_path+'only_dynamic_knn.pkl')
only_d_lgbm = joblib.load(project_path+'only_dynamic_lgbm.pkl')
only_d_svm = joblib.load(project_path+'only_dynamic_svm.pkl')
only_s_xgb = joblib.load(project_path+'only_static_xgb.pkl')
only_s_lgbm = joblib.load(project_path+'only_static_lgbm.pkl')
scaler = joblib.load(project_path+'scaler.pkl')

# test 데이터 불러오기
data = pd.read_csv(project_path+'data01_test.csv')

In [589]:
data2 = data.copy()
pipeline(data2, is_d_rf, only_d_lgbm, only_s_lgbm)

x shape : (1471, 561) y shape : (1471,)
concat
accuracy_score : 0.991162474507138
[[289   3   0   0   0   0]
 [  0 248   6   0   0   0]
 [  0   2 285   0   0   0]
 [  0   0   0 227   0   1]
 [  0   0   0   1 194   0]
 [  0   0   0   0   0 215]]
                    precision    recall  f1-score   support

            LAYING       1.00      0.99      0.99       292
           SITTING       0.98      0.98      0.98       254
          STANDING       0.98      0.99      0.99       287
           WALKING       1.00      1.00      1.00       228
WALKING_DOWNSTAIRS       1.00      0.99      1.00       195
  WALKING_UPSTAIRS       1.00      1.00      1.00       215

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471

