In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import logging

# 로깅 설정
logging.basicConfig(filename='DT/training.log', level=logging.INFO, format='%(asctime)s %(message)s')

# 데이터셋 로드
data = pd.read_csv('data/train.csv')

# 필요한 열 선택
selected_columns = ['OC', 'sido', 'sgg', 'openDate', 'bedCount', 'instkind', 'revenue1', 'salescost1', 'sga1', 'salary1',
                    'noi1', 'noe1', 'interest1', 'ctax1', 'profit1', 'liquidAsset1', 'quickAsset1', 'receivableS1', 'inventoryAsset1',
                    'nonCAsset1', 'tanAsset1', 'OnonCAsset1', 'receivableL1', 'debt1', 'liquidLiabilities1', 'shortLoan1', 'NCLiabilities1',
                    'longLoan1', 'netAsset1', 'surplus1', 'revenue2', 'salescost2', 'sga2', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2',
                    'profit2', 'liquidAsset2', 'quickAsset2', 'receivableS2', 'inventoryAsset2', 'nonCAsset2', 'tanAsset2', 'OnonCAsset2',
                    'receivableL2', 'debt2', 'liquidLiabilities2', 'shortLoan2', 'NCLiabilities2', 'longLoan2', 'netAsset2', 'surplus2',
                    'employee1', 'employee2', 'ownerChange']

# 데이터 전처리
# - 결측치 처리
# data = data.dropna()  # 결측치가 있는 행 제거 또는 다른 방식으로 처리
# data = data.fillna(0)
# - 범주형 데이터 인코딩
categorical_cols = ['OC', 'sido', 'instkind', 'ownerChange']  # 범주형 열을 선택하여 리스트로 작성
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])
data = data.fillna(0)

# - 수치형 데이터 표준화
numeric_cols = ['sgg', 'openDate', 'bedCount', 'revenue1', 'salescost1', 'sga1', 'salary1', 'noi1', 'noe1', 'interest1',
                'ctax1', 'profit1', 'liquidAsset1', 'quickAsset1', 'receivableS1', 'inventoryAsset1', 'nonCAsset1', 'tanAsset1',
                'OnonCAsset1', 'receivableL1', 'debt1', 'liquidLiabilities1', 'shortLoan1', 'NCLiabilities1', 'longLoan1', 'netAsset1',
                'surplus1', 'revenue2', 'salescost2', 'sga2', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2', 'profit2', 'liquidAsset2',
                'quickAsset2', 'receivableS2', 'inventoryAsset2', 'nonCAsset2', 'tanAsset2', 'OnonCAsset2', 'receivableL2', 'debt2',
                'liquidLiabilities2', 'shortLoan2', 'NCLiabilities2', 'longLoan2', 'netAsset2', 'surplus2', 'employee1', 'employee2']

scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# 특성(X)과 라벨(y) 분리
X = data.drop('OC', axis=1).values
y = data['OC'].values

# 훈련 데이터와 검증 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1123)

# Decision Tree 모델 정의
model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'max_depth': [None, 1, 2, 3, 5, 7, 10],
    'min_samples_split': [1, 2, 3, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}


# 그리드 서치 객체 정의
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
grid_search.fit(X_train, y_train)

# 최적의 모델 선택
best_model = grid_search.best_estimator_

# 테스트 데이터에 대한 예측
y_pred = best_model.predict(X_test)

# 평가 지표 계산
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
pr_auc = average_precision_score(y_test, y_pred)

# 결과 출력
print("Best Model Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)
print("PR AUC:", pr_auc)


Best Model Parameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Accuracy: 0.8852459016393442
F1 Score: 0.9391304347826087
Precision: 0.9473684210526315
Recall: 0.9310344827586207
ROC AUC: 0.46551724137931033
PR AUC: 0.9476064383683913


350 fits failed out of a total of 2450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
350 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hjjung113/anaconda3/envs/hospital/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hjjung113/anaconda3/envs/hospital/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/hjjung113/anaconda3/envs/hospital/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/hjjung113/anaconda3/envs/hospital/lib/python3.11/site-packages/sklearn/base.py", line