In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import label_binarize
import numpy as np

# 로깅 설정
logging.basicConfig(filename='IF/training.log', level=logging.INFO, format='%(asctime)s %(message)s')

# 데이터셋 로드
data = pd.read_csv('data/train.csv')

# 필요한 열 선택
selected_columns = ['OC', 'sido', 'sgg', 'openDate', 'bedCount', 'instkind', 'revenue1', 'salescost1', 'sga1', 'salary1',
                    'noi1', 'noe1', 'interest1', 'ctax1', 'profit1', 'liquidAsset1', 'quickAsset1', 'receivableS1', 'inventoryAsset1',
                    'nonCAsset1', 'tanAsset1', 'OnonCAsset1', 'receivableL1', 'debt1', 'liquidLiabilities1', 'shortLoan1', 'NCLiabilities1',
                    'longLoan1', 'netAsset1', 'surplus1', 'revenue2', 'salescost2', 'sga2', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2',
                    'profit2', 'liquidAsset2', 'quickAsset2', 'receivableS2', 'inventoryAsset2', 'nonCAsset2', 'tanAsset2', 'OnonCAsset2',
                    'receivableL2', 'debt2', 'liquidLiabilities2', 'shortLoan2', 'NCLiabilities2', 'longLoan2', 'netAsset2', 'surplus2',
                    'employee1', 'employee2', 'ownerChange']

# 데이터 전처리
# - 결측치 처리
data = data.dropna()  # 결측치가 있는 행 제거 또는 다른 방식으로 처리
# - 범주형 데이터 인코딩
categorical_cols = ['OC', 'sido', 'instkind', 'ownerChange']  # 범주형 열을 선택하여 리스트로 작성
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])
# - 수치형 데이터 표준화
numeric_cols = ['sgg', 'openDate', 'bedCount', 'revenue1', 'salescost1', 'sga1', 'salary1', 'noi1', 'noe1', 'interest1',
                'ctax1', 'profit1', 'liquidAsset1', 'quickAsset1', 'receivableS1', 'inventoryAsset1', 'nonCAsset1', 'tanAsset1',
                'OnonCAsset1', 'receivableL1', 'debt1', 'liquidLiabilities1', 'shortLoan1', 'NCLiabilities1', 'longLoan1', 'netAsset1',
                'surplus1', 'revenue2', 'salescost2', 'sga2', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2', 'profit2', 'liquidAsset2',
                'quickAsset2', 'receivableS2', 'inventoryAsset2', 'nonCAsset2', 'tanAsset2', 'OnonCAsset2', 'receivableL2', 'debt2',
                'liquidLiabilities2', 'shortLoan2', 'NCLiabilities2', 'longLoan2', 'netAsset2', 'surplus2', 'employee1', 'employee2']

scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# 특성(X)과 라벨(y) 분리
X = data.drop('OC', axis=1).values
y = data['OC'].values

# 훈련 데이터와 검증 데이터 분할
# 레이블이 0인 데이터와 1인 데이터를 분리
data_zero = data[data['OC'] == 0]
data_one = data[data['OC'] == 1]

# 특성(X)과 라벨(y) 분리
X_zero = data_zero.drop('OC', axis=1).values
y_zero = data_zero['OC'].values
X_one = data_one.drop('OC', axis=1).values
y_one = data_one['OC'].values

# 레이블이 0인 데이터를 훈련 데이터와 검증 데이터로 분할
X_train, X_val_zero, y_train, y_val_zero = train_test_split(X_zero, y_zero, test_size=0.2, random_state=1123)

# 레이블이 1인 데이터를 검증 데이터로 사용
X_val_one = X_one
y_val_one = y_one

# 검증 데이터셋 합치기
X_val = np.concatenate([X_val_zero, X_val_one])
y_val = np.concatenate([y_val_zero, y_val_one])
# Isolation Forest 모델 생성
clf = IsolationForest(contamination='auto', random_state=1123)

# 모델 훈련
clf.fit(X_train)

# 훈련 세트에 대한 예측 및 검증 세트에 대한 예측
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)

# y_train_pred_bin = label_binarize(y_train_pred, [-1, 1])
y_val_pred_bin = label_binarize(y_val_pred, [-1, 1])

# 성능 평가
print("Train set performance:")
print(classification_report(y_train, y_train_pred))

print("Validation set performance:")
print(classification_report(y_val, y_val_pred))

# 혼동 행렬 출력
print("Train confusion matrix:")
print(confusion_matrix(y_train, y_train_pred))

print("Validation confusion matrix:")
print(confusion_matrix(y_val, y_val_pred))

# 로깅
logging.info("Train set performance:")
logging.info(classification_report(y_train, y_train_pred))

logging.info("Validation set performance:")
logging.info(classification_report(y_val, y_val_pred))

logging.info("Train confusion matrix:")
logging.info(confusion_matrix(y_train, y_train_pred))

logging.info("Validation confusion matrix:")
logging.info(confusion_matrix(y_val, y_val_pred))

# 성능 지표 계산
f1 = f1_score(y_val, y_val_pred_bin)
precision = precision_score(y_val, y_val_pred_bin)
recall = recall_score(y_val, y_val_pred_bin)
roc_auc = roc_auc_score(y_val, y_val_pred_bin)
pr_auc = average_precision_score(y_val, y_val_pred_bin)

# 결과 출력
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")

# 로깅
logging.info(f"F1 Score: {f1}")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"ROC AUC: {roc_auc}")
logging.info(f"PR AUC: {pr_auc}")

TypeError: label_binarize() takes 1 positional argument but 2 were given