In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Adjust these paths to wherever the CSV files are stored.
train_path, test_path = 'train_ds.csv', 'test_ds.csv'

# Read the training set first, then the test set.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Count how many training rows carry each class in the 'label' column.
# Summing a boolean mask counts the matches without building a filtered frame.
g_count = int((train_df['label'] == 'g').sum())
b_count = int((train_df['label'] == 'b').sum())

# Each message must report the counter that matches the label it names
# (previously the 'b' line printed g_count and the 'g' line printed b_count).
print(f"'b'의 갯수: {b_count}")
print(f"'g'의 갯수: {g_count}")

'b'의 갯수: 158
'g'의 갯수: 87


In [None]:
# Feature matrix: every column of the training frame except the target.
features = train_df.drop(columns=['label'])

# NOTE(review): the scaler is fitted on all training rows here, before the
# train/validation split performed later in this notebook, so validation rows
# contribute to the scaling statistics — confirm this is intended.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Rebuild a DataFrame with the original column names and re-attach the
# label column positionally (via .values) as the last column.
standardized_df = pd.DataFrame(
    standardized_features, columns=features.columns
).assign(label=train_df['label'].values)

In [None]:
# Hold out 20% of the standardized rows for validation.
# shuffle + a fixed random_state keep the split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    standardized_df.drop(columns=["label"]),  # features
    standardized_df["label"],                 # target
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [None]:
# Decision-tree classifier; random_state is pinned for reproducibility.
model = DecisionTreeClassifier(random_state=400)

In [None]:
# Train on the training split; missing feature values are replaced with 0.
model.fit(X=x_train.fillna(value=0), y=y_train)

In [None]:
def get_clf_eval(y_test, y_pred):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    The confusion matrix is requested with the label order ['b', 'g'];
    precision, recall and F1 are requested with 'g' as the positive label.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        None. The metrics are only printed.
    """
    class_order = ['b', 'g']
    positive = dict(labels=class_order, pos_label='g')

    # Compute every metric before printing anything.
    matrix = confusion_matrix(y_test, y_pred, labels=class_order)
    scores = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, **positive)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred, **positive)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, **positive)),
    ]

    print("오차행렬:\n", matrix)
    for template, value in scores:
        print(template.format(value))

# Predict on the validation split (missing values replaced with 0, as when
# fitting) and print the evaluation metrics.
pred = model.predict(X=x_val.fillna(value=0))
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:
 [[13  2]
 [ 3 31]]

정확도: 0.8980
정밀도: 0.9394
재현율: 0.9118
F1: 0.9254


In [None]:
# NOTE(review): this repeats the validation-set evaluation already run in the
# previous cell with the same inputs — confirm whether a different dataset
# was intended here.
val_inputs = x_val.fillna(0)
pred = model.predict(val_inputs)
get_clf_eval(y_val, pred)

오차행렬:
 [[13  2]
 [ 3 31]]

정확도: 0.8980
정밀도: 0.9394
재현율: 0.9118
F1: 0.9254
