In [1]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
X = pd.read_csv("train.csv")

In [3]:
y = X["Activity"]

In [4]:
for column in X.columns[:-1]:  # 마지막 열은 'Activity'이므로 제외
    mask = X[column] == -1
    for activity in X['Activity'].unique():
        activity_mean = X.loc[X['Activity'] == activity, column].mean()
        X[column] = X[column].astype(float)  # 해당 column을 float64로 변환
        X.loc[mask & (X['Activity'] == activity), column] = activity_mean

In [5]:
# 'Activity' 열을 제외한 나머지 데이터로 X를 갱신
X = X.drop(columns=["Activity"])

In [6]:
print(X["fBodyAcc-max()-Y"][56])
print(y)

-0.941740922657427
0               STANDING
1               STANDING
2               STANDING
3               STANDING
4               STANDING
              ...       
7347    WALKING_UPSTAIRS
7348    WALKING_UPSTAIRS
7349    WALKING_UPSTAIRS
7350    WALKING_UPSTAIRS
7351    WALKING_UPSTAIRS
Name: Activity, Length: 7352, dtype: object


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
rf_model = RandomForestClassifier(n_estimators=296, random_state=42)
cv = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy_values = cross_val_score(rf_model, X, y, cv=cv, scoring='accuracy')

In [9]:
y_pred = cross_val_predict(rf_model, X, y, cv=cv)

In [10]:
conf_matrix = confusion_matrix(y, y_pred)


In [13]:
print("Confusion Matrix:\n", conf_matrix)
print("Average Accuracy:", np.mean(accuracy_values))


Confusion Matrix:
 [[1407    0    0    0    0    0]
 [   0 1285    0    0    0    1]
 [   0    0 1374    0    0    0]
 [   0    0    0 1226    0    0]
 [   0    0    0    0  984    2]
 [   0    0    0    0    2 1071]]
Average Accuracy: 0.9993197278911564
