#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.





### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - data01_test.csv 를 불러와 'new_data' 이름으로 저장합니다.


In [3]:
data = pd.read_csv('/content/drive/MyDrive/KTaivle/3차미니프로젝트/data01_train.csv')
new_data = pd.read_csv('/content/drive/MyDrive/KTaivle/3차미니프로젝트/data01_test.csv')

In [4]:
data.drop(columns=["subject"], inplace=True)

In [34]:
new_data.drop(columns=["subject"], inplace=True)

## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [25]:
new_data['Activity_dynamic'] = new_data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS']).astype(int)

In [6]:
data['Activity_dynamic'] = data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS']).astype(int)

In [8]:
from sklearn.model_selection import train_test_split

x = data.drop(columns=['Activity', 'Activity_dynamic'])
y1 = data['Activity']
y2 = data['Activity_dynamic']

x_train, x_val, y1_train, y1_val = train_test_split(x, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_val = train_test_split(x, y2, test_size=0.2, random_state=42)

In [48]:
test_x = new_data.drop(columns=['Activity', 'Activity_dynamic'])
test_y1 = new_data['Activity']
test_y2 = new_data['Activity_dynamic']

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) 알고리즘1 :

In [317]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(x_train, y2_train)

pred_rf = model_rf.predict(x_val)

accuracy_rf = accuracy_score(y2_val, pred_rf)

print("RandomForest Activity Model Accuracy:", accuracy_rf)
print(confusion_matrix(y2_val, pred_rf))
print(classification_report(y2_val, pred_rf))

RandomForest Activity Model Accuracy: 1.0
[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [318]:
t_pred_rf = model_rf.predict(test_x)
test_accuracy_rf = accuracy_score(test_y2, t_pred_rf)

print("TEST RandomForest Activity Model Accuracy:", test_accuracy_rf)
print(confusion_matrix(test_y2, t_pred_rf))
print(classification_report(test_y2, t_pred_rf))

TEST RandomForest Activity Model Accuracy: 1.0
[[833   0]
 [  0 638]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       833
           1       1.00      1.00      1.00       638

    accuracy                           1.00      1471
   macro avg       1.00      1.00      1.00      1471
weighted avg       1.00      1.00      1.00      1471



#### 2) 알고리즘2 :

In [319]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(random_state=42)

model_lr.fit(x_train, y2_train)

pred_lr = model_lr.predict(x_val)

accuracy_lr = accuracy_score(y2_val, pred_lr)

print("Logistic Regression Activity Model Accuracy:", accuracy_lr)
print(confusion_matrix(y2_val, pred_lr))
print(classification_report(y2_val, pred_lr))

Logistic Regression Activity Model Accuracy: 1.0
[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [320]:
t_pred_lr = model_lr.predict(test_x)
test_accuracy_lr = accuracy_score(test_y2, t_pred_lr)

print("TEST Logistic Regression Activity Model Accuracy:", test_accuracy_lr)
print(confusion_matrix(test_y2, t_pred_lr))
print(classification_report(test_y2, t_pred_lr))

TEST Logistic Regression Activity Model Accuracy: 0.9993201903467029
[[832   1]
 [  0 638]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       833
           1       1.00      1.00      1.00       638

    accuracy                           1.00      1471
   macro avg       1.00      1.00      1.00      1471
weighted avg       1.00      1.00      1.00      1471



#### 3) 알고리즘3 :

In [321]:
from sklearn.svm import SVC

model_svm = SVC(random_state=42)

model_svm.fit(x_train, y2_train)

pred_svm = model_svm.predict(x_val)

accuracy_svm = accuracy_score(y2_val, pred_svm)

print("SVM Activity Model Accuracy:", accuracy_svm)
print(confusion_matrix(y2_val, pred_svm))
print(classification_report(y2_val, pred_svm))

SVM Activity Model Accuracy: 1.0
[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [322]:
t_pred_svm = model_svm.predict(test_x)
test_accuracy_svm = accuracy_score(test_y2, t_pred_svm)

print("TEST SVM Activity Model Accuracy:", test_accuracy_svm)
print(confusion_matrix(test_y2, t_pred_svm))
print(classification_report(test_y2, t_pred_svm))

TEST SVM Activity Model Accuracy: 1.0
[[833   0]
 [  0 638]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       833
           1       1.00      1.00      1.00       638

    accuracy                           1.00      1471
   macro avg       1.00      1.00      1.00      1471
weighted avg       1.00      1.00      1.00      1471



### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [324]:
subset_data = data[data['Activity'].isin(['LAYING', 'SITTING', 'STANDING'])]

X_subset = subset_data.drop(columns=['Activity', 'Activity_dynamic'])
y_subset = subset_data['Activity']

X_train, X_val, y_train, y_val = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

In [330]:
t_subset_data = new_data[new_data['Activity'].isin(['LAYING', 'SITTING', 'STANDING'])]

t_X_subset = t_subset_data.drop(columns=['Activity', 'Activity_dynamic'])
t_y_subset = t_subset_data['Activity']

In [331]:
rf_classifier = RandomForestClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("RandomForest Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

RandomForest Model Accuracy: 0.9799072642967542
[[221   0   0]
 [  0 202   3]
 [  0  10 211]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.95      0.99      0.97       205
    STANDING       0.99      0.95      0.97       221

    accuracy                           0.98       647
   macro avg       0.98      0.98      0.98       647
weighted avg       0.98      0.98      0.98       647



In [332]:
t_y_pred = rf_classifier.predict(t_X_subset)
t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST RandomForest Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST RandomForest Model Accuracy: 0.9651860744297719
[[291   1   0]
 [  0 242  12]
 [  0  16 271]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       292
     SITTING       0.93      0.95      0.94       254
    STANDING       0.96      0.94      0.95       287

    accuracy                           0.97       833
   macro avg       0.96      0.96      0.96       833
weighted avg       0.97      0.97      0.97       833



In [333]:
lr_classifier = LogisticRegression(random_state=42)

lr_classifier.fit(X_train, y_train)

y_pred = lr_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("LogisticRegression Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

LogisticRegression Model Accuracy: 0.9428129829984544
[[221   0   0]
 [  0 189  16]
 [  0  21 200]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.90      0.92      0.91       205
    STANDING       0.93      0.90      0.92       221

    accuracy                           0.94       647
   macro avg       0.94      0.94      0.94       647
weighted avg       0.94      0.94      0.94       647



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [334]:
t_y_pred = lr_classifier.predict(t_X_subset)
t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST LogisticRegression Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST LogisticRegression Model Accuracy: 0.9303721488595438
[[290   2   0]
 [  0 227  27]
 [  0  29 258]]
              precision    recall  f1-score   support

      LAYING       1.00      0.99      1.00       292
     SITTING       0.88      0.89      0.89       254
    STANDING       0.91      0.90      0.90       287

    accuracy                           0.93       833
   macro avg       0.93      0.93      0.93       833
weighted avg       0.93      0.93      0.93       833



In [335]:
svm_classifier = SVC(random_state=42)

svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("SVC Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

SVC Model Accuracy: 0.9211746522411128
[[221   0   0]
 [  0 178  27]
 [  0  24 197]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.88      0.87      0.87       205
    STANDING       0.88      0.89      0.89       221

    accuracy                           0.92       647
   macro avg       0.92      0.92      0.92       647
weighted avg       0.92      0.92      0.92       647



In [336]:
t_y_pred = svm_classifier.predict(t_X_subset)
t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST LogisticRegression Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST LogisticRegression Model Accuracy: 0.9111644657863145
[[292   0   0]
 [  0 212  42]
 [  0  32 255]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       292
     SITTING       0.87      0.83      0.85       254
    STANDING       0.86      0.89      0.87       287

    accuracy                           0.91       833
   macro avg       0.91      0.91      0.91       833
weighted avg       0.91      0.91      0.91       833



### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [337]:
dynamic_data = data[data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

X_subset = dynamic_data.drop(columns=['Activity', 'Activity_dynamic'])
y_subset = dynamic_data['Activity']

X_train, X_val, y_train, y_val = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

In [338]:
t_dynamic_data = new_data[new_data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

t_X_subset = dynamic_data.drop(columns=['Activity', 'Activity_dynamic'])
t_y_subset = dynamic_data['Activity']

In [339]:
rf_classifier = RandomForestClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("RandomForest Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

RandomForest Model Accuracy: 0.9849056603773585
[[193   3   0]
 [  1 159   3]
 [  1   0 170]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.98      0.99       196
WALKING_DOWNSTAIRS       0.98      0.98      0.98       163
  WALKING_UPSTAIRS       0.98      0.99      0.99       171

          accuracy                           0.98       530
         macro avg       0.98      0.98      0.98       530
      weighted avg       0.98      0.98      0.98       530



In [340]:
t_y_pred = rf_classifier.predict(t_X_subset)

t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST RandomForest Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST RandomForest Model Accuracy: 0.9969777106157914
[[995   3   0]
 [  1 787   3]
 [  1   0 857]]
                    precision    recall  f1-score   support

           WALKING       1.00      1.00      1.00       998
WALKING_DOWNSTAIRS       1.00      0.99      1.00       791
  WALKING_UPSTAIRS       1.00      1.00      1.00       858

          accuracy                           1.00      2647
         macro avg       1.00      1.00      1.00      2647
      weighted avg       1.00      1.00      1.00      2647



In [341]:
lr_classifier = LogisticRegression(random_state=42)

lr_classifier.fit(X_train, y_train)

y_pred = lr_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("LogisticRegression Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

LogisticRegression Model Accuracy: 0.9811320754716981
[[193   2   1]
 [  0 160   3]
 [  4   0 167]]
                    precision    recall  f1-score   support

           WALKING       0.98      0.98      0.98       196
WALKING_DOWNSTAIRS       0.99      0.98      0.98       163
  WALKING_UPSTAIRS       0.98      0.98      0.98       171

          accuracy                           0.98       530
         macro avg       0.98      0.98      0.98       530
      weighted avg       0.98      0.98      0.98       530



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [342]:
t_y_pred = lr_classifier.predict(t_X_subset)

t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST RandomForest Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST RandomForest Model Accuracy: 0.9871552701171137
[[988   4   6]
 [  3 782   6]
 [ 11   4 843]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.99      0.99       998
WALKING_DOWNSTAIRS       0.99      0.99      0.99       791
  WALKING_UPSTAIRS       0.99      0.98      0.98       858

          accuracy                           0.99      2647
         macro avg       0.99      0.99      0.99      2647
      weighted avg       0.99      0.99      0.99      2647



In [343]:
svm_classifier = SVC(random_state=42)

svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)

print("SVC Model Accuracy:", accuracy)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

SVC Model Accuracy: 0.9811320754716981
[[195   0   1]
 [  0 160   3]
 [  5   1 165]]
                    precision    recall  f1-score   support

           WALKING       0.97      0.99      0.98       196
WALKING_DOWNSTAIRS       0.99      0.98      0.99       163
  WALKING_UPSTAIRS       0.98      0.96      0.97       171

          accuracy                           0.98       530
         macro avg       0.98      0.98      0.98       530
      weighted avg       0.98      0.98      0.98       530



In [344]:
t_y_pred = svm_classifier.predict(t_X_subset)

t_accuracy = accuracy_score(t_y_subset, t_y_pred)

print("TEST RandomForest Model Accuracy:", t_accuracy)
print(confusion_matrix(t_y_subset, t_y_pred))
print(classification_report(t_y_subset, t_y_pred))

TEST RandomForest Model Accuracy: 0.9845107669059312
[[988   1   9]
 [  4 779   8]
 [ 11   8 839]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.99      0.99       998
WALKING_DOWNSTAIRS       0.99      0.98      0.99       791
  WALKING_UPSTAIRS       0.98      0.98      0.98       858

          accuracy                           0.98      2647
         macro avg       0.98      0.98      0.98      2647
      weighted avg       0.98      0.98      0.98      2647



### [선택사항] (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

In [345]:
x = data.drop(columns=['Activity'])
y = data['Activity']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(x_train, y_train)

importance = rf_model.feature_importances_
names = x_train.columns

feature_importance_df = pd.DataFrame({'Feature': names, 'Importance': importance})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

#### 1) 함수 만들어서 분류 모델 합치기

In [101]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [346]:
data = pd.read_csv('/content/drive/MyDrive/KTaivle/3차미니프로젝트/data01_train.csv')
top_n_features = feature_importance_df['Feature'].head(300).tolist()
top_n_features.append("Activity")
top_n_features.append("Activity_dynamic")

data.drop(columns=["subject"], inplace=True)
data['Activity_dynamic'] = data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS']).astype(int)
columns_to_drop = [col for col in data.columns if col not in top_n_features]
data.drop(columns=columns_to_drop, inplace=True)

x = data.drop(columns=['Activity', 'Activity_dynamic'])

y1 = data['Activity']
y2 = data['Activity_dynamic']

x_train, x_val, y1_train, y1_val = train_test_split(x, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_val = train_test_split(x, y2, test_size=0.2, random_state=42)

In [347]:
static_data = data[data['Activity'].isin(['LAYING', 'SITTING', 'STANDING'])]

s_X_subset = static_data.drop(columns=['Activity', 'Activity_dynamic'])
s_y_subset = static_data['Activity']

x_train_static, X_val_static, y_train_static, y_val_static = train_test_split(s_X_subset, s_y_subset, test_size=0.2, random_state=42)

In [348]:
dynamic_data = new_data[data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

d_X_subset = dynamic_data.drop(columns=['Activity', 'Activity_dynamic'])
d_y_subset = dynamic_data['Activity']

x_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(d_X_subset, d_y_subset, test_size=0.2, random_state=42)

  dynamic_data = new_data[data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]


In [349]:
from sklearn.model_selection import train_test_split

x = data.drop(columns=['Activity', 'Activity_dynamic'])
y1 = data['Activity']
y2 = data['Activity_dynamic']

x_train, x_val, y1_train, y1_val = train_test_split(x, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_val = train_test_split(x, y2, test_size=0.2, random_state=42)

In [350]:
numeric_features = [col for col in x.columns]

# 전처리 및 모델링을 위한 파이프라인 구성
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # 숫자형 변수 스케일링
    ])

# 파이프라인 구성
pipeline_isdynamic_classification = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # 랜덤 포레스트 분류기
])

In [351]:
# 파이프라인 훈련
pipeline_isdynamic_classification.fit(x_train, y2_train)

In [352]:
# 예측
y_isdynamic_pred = pipeline_isdynamic_classification.predict(x_val)

# 동적인 데이터를 분류하는 결과를 바탕으로 해당 데이터가 동적인 경우 동적 활동을 분류하는 모델과 정적 활동을 분류하는 모델을 선택
dynamic_indices = np.where(y_isdynamic_pred == 1)[0]
static_indices = np.where(y_isdynamic_pred == 0)[0]

In [353]:
x_val.reset_index(drop=True, inplace=True)

x_val_dynamic, x_val_static = x_val.iloc[dynamic_indices], x_val.iloc[static_indices]

In [354]:
# 동적 활동 분류 모델 구성
pipeline_dynamic_activity = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # 랜덤 포레스트 분류기
])

# 정적 활동 분류 모델 구성
pipeline_static_activity = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))  # 로지스틱 회귀 분류기
])

# 동적 활동 분류 모델 훈련
pipeline_dynamic_activity.fit(x_train_dynamic, y_train_dynamic)

# 정적 활동 분류 모델 훈련
pipeline_static_activity.fit(x_train_static, y_train_static)

# 예측
y_dynamic_activity_pred = pipeline_dynamic_activity.predict(x_val_dynamic)
y_static_activity_pred = pipeline_static_activity.predict(x_val_static)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [355]:
final_prediction = []
d = 0
s = 0

for idx in range(len(y_isdynamic_pred)):
    if y_isdynamic_pred[idx] == 1:
        final_prediction.append(y_dynamic_activity_pred[d])
        d += 1
    else:
        final_prediction.append(y_static_activity_pred[s])
        s += 1

In [356]:
ac = accuracy_score(final_prediction, y1_val)

In [357]:
print(ac)

0.6652506372132541


In [358]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

print(confusion_matrix(y1_val, final_prediction))
print(classification_report(y1_val, final_prediction))

[[231   0   0   0   0   0]
 [  0 196   4   0   0   0]
 [  0  15 211   0   0   0]
 [  0   0   0   0 198   0]
 [  0   0   0   0 145   0]
 [  0   0   0   0 177   0]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       231
           SITTING       0.93      0.98      0.95       200
          STANDING       0.98      0.93      0.96       226
           WALKING       0.00      0.00      0.00       198
WALKING_DOWNSTAIRS       0.28      1.00      0.44       145
  WALKING_UPSTAIRS       0.00      0.00      0.00       177

          accuracy                           0.67      1177
         macro avg       0.53      0.65      0.56      1177
      weighted avg       0.58      0.67      0.60      1177



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 테스트 데이터

In [359]:
new_data = pd.read_csv('/content/drive/MyDrive/KTaivle/3차미니프로젝트/data01_test.csv')

new_data.drop(columns=["subject"], inplace=True)
new_data['Activity_dynamic'] = new_data['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS']).astype(int)
columns_to_drop = [col for col in new_data.columns if col not in top_n_features]
new_data.drop(columns=columns_to_drop, inplace=True)

test_x = new_data.drop(columns=['Activity', 'Activity_dynamic'])

test_y1 = new_data['Activity']
test_y2 = new_data['Activity_dynamic']

In [266]:
numeric_features = [col for col in test_x.columns]

In [360]:
test_isdynamic_pred = pipeline_isdynamic_classification.predict(test_x)

test_dynamic_indices = np.where(test_isdynamic_pred == 1)[0]
test_static_indices = np.where(test_isdynamic_pred == 0)[0]

In [361]:
test_x.reset_index(drop=True, inplace=True)

test_x_dynamic, test_x_static = test_x.iloc[test_dynamic_indices], test_x.iloc[test_static_indices]

In [362]:
test_dynamic_activity_pred = pipeline_dynamic_activity.predict(test_x_dynamic)
test_static_activity_pred = pipeline_static_activity.predict(test_x_static)

In [363]:
test_final_prediction = []
d = 0
s = 0

for idx in range(len(test_isdynamic_pred)):
    if test_isdynamic_pred[idx] == 1:
        test_final_prediction.append(test_dynamic_activity_pred[d])
        d += 1
    else:
        test_final_prediction.append(test_static_activity_pred[s])
        s += 1

In [364]:
test_ac = accuracy_score(test_final_prediction, test_y1)

In [365]:
print(test_ac)

0.4738273283480625


In [366]:
print(confusion_matrix(test_y1, test_final_prediction))
print(classification_report(test_y1, test_final_prediction))

[[211   0   0  42  15  24]
 [  0 145  10  56   9  34]
 [  0  11 146  82   3  45]
 [  0   0   0   0 228   0]
 [  0   0   0   0 195   0]
 [  0   0   0   0 215   0]]
                    precision    recall  f1-score   support

            LAYING       1.00      0.72      0.84       292
           SITTING       0.93      0.57      0.71       254
          STANDING       0.94      0.51      0.66       287
           WALKING       0.00      0.00      0.00       228
WALKING_DOWNSTAIRS       0.29      1.00      0.45       195
  WALKING_UPSTAIRS       0.00      0.00      0.00       215

          accuracy                           0.47      1471
         macro avg       0.53      0.47      0.44      1471
      weighted avg       0.58      0.47      0.48      1471

