In [1]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import *
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from IPython.core.interactiveshell import InteractiveShell

# Install a blanket "ignore" filter so warnings raised from here on are not shown.
# NOTE(review): this also hides deprecation/convergence warnings emitted by the
# libraries imported above — confirm that suppressing everything is intended.
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide presentation settings; the three statements are independent.

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Keep warnings out of the cell output.
warnings.filterwarnings(action='ignore')

# Render matplotlib text with the GULIM font family
# (presumably so Korean labels display correctly — confirm the font is installed).
plt.rc('font', family='GULIM')

In [3]:
# Read the held-out feature table and its labels; in both files the first
# CSV column is used as the row index.
X_test_self, y_test_self = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('Database/X_self_test.csv', 'Database/y_self_test.csv')
)

In [4]:
# Choose which sampling variant's saved models to evaluate: `i` indexes into
# `sampling_result`, and the selected name is embedded in each file path.
i = 0
sampling_result = ['original']

# Deserialize the four saved models in the order DT, LightGBM, XGBoost, CatBoost.
# NOTE(review): joblib.load unpickles the file contents, which can execute
# arbitrary code — only load model files that come from a trusted source.
best_dt, best_lgb, best_xgb, best_cat = (
    joblib.load(f'File/dt_{sampling_result[i]}_model.pkl'),
    joblib.load(f'File/lgb_{sampling_result[i]}_model.pkl'),
    joblib.load(f'File/xgb_{sampling_result[i]}_model.pkl'),
    joblib.load(f'File/cat_{sampling_result[i]}_model.pkl'),
)

In [5]:
# Class-probability output of each model on the test features,
# in the order DT, LightGBM, XGBoost, CatBoost.
proba1, proba2, proba3, proba4 = (
    fitted_model.predict_proba(X_test_self)
    for fitted_model in (best_dt, best_lgb, best_xgb, best_cat)
)

# Soft voting: unweighted mean of the four probability arrays.
# NOTE(review): assumes every model orders its probability columns by the same
# classes — confirm against each model's `classes_`.
average_proba = (proba1 + proba2 + proba3 + proba4) / 4

# Predicted class per row = index of the highest-probability column.
dt_result, lgb_result, xgb_result, cat_result, soft_voting_result = (
    np.argmax(class_proba, axis=1)
    for class_proba in (proba1, proba2, proba3, proba4, average_proba)
)



In [6]:
# Empty results table: one row per metric, one column per model.
# The cells start out missing and are filled in later.
metric_df = pd.DataFrame(
    index=['Accuracy', 'Recall', 'Precision', 'F1-Score', 'AUC'],
    columns=['DT', 'LGBM', 'XGB', 'CAT', 'Soft Voting'],
)

In [7]:
def calculate_metrics(metric_df, y_true, y_pred, proba, model_name):
    """Fill one model's column of ``metric_df`` with five evaluation scores.

    Each score is stored as a string formatted to five decimal places.
    ``metric_df`` is modified in place and nothing is returned.

    Parameters
    ----------
    metric_df : table with rows 'Accuracy', 'Recall', 'Precision',
        'F1-Score' and 'AUC'; the column ``model_name`` is written.
    y_true : ground-truth labels.
    y_pred : predicted labels, scored with ``average='binary'`` for
        recall, precision and F1.
    proba : 2-D probability array; column index 1 is used as the score for AUC
        (presumably the positive-class probability — confirm class order).
    model_name : label of the column to write.
    """
    # Scores are evaluated lazily, one at a time, and written immediately, so
    # rows are filled top to bottom in the order listed here.
    scorers = (
        ('Accuracy', lambda: accuracy_score(y_true, y_pred)),
        ('Recall', lambda: recall_score(y_true, y_pred, average='binary')),
        ('Precision', lambda: precision_score(y_true, y_pred, average='binary')),
        ('F1-Score', lambda: f1_score(y_true, y_pred, average='binary')),
        ('AUC', lambda: roc_auc_score(y_true, proba[:, 1])),
    )
    for row_label, compute_score in scorers:
        metric_df.loc[row_label, model_name] = f"{compute_score():.5f}"

# Compute the metrics for every model.
# Each entry maps a metric_df column label to (predicted labels, probabilities).
models = dict([
    ('DT', (dt_result, proba1)),
    ('LGBM', (lgb_result, proba2)),
    ('XGB', (xgb_result, proba3)),
    ('CAT', (cat_result, proba4)),
    ('Soft Voting', (soft_voting_result, average_proba)),
])

# Fill metric_df one column at a time, in the insertion order above.
for model_name in models:
    y_pred, proba = models[model_name]
    calculate_metrics(metric_df, y_test_self, y_pred, proba, model_name)

In [9]:
# metric_df.to_csv(f'File/metric_{sampling_result[i]}.csv')

In [None]:
# Extract the feature-importance vector exposed by each fitted model.
dt_importances, lgb_importances, xgb_importances = (
    estimator.feature_importances_ for estimator in (best_dt, best_lgb, best_xgb)
)
cat_importances = best_cat.get_feature_importance()

# Gather the importances into one DataFrame, one row per column of X_test_self.
# NOTE(review): assumes the column order of X_test_self matches the feature
# order the models were fitted with — confirm.
features = X_test_self.columns
model_list = ['Decision Tree', 'LightGBM', 'XGBoost', 'CatBoost']
importance_df = pd.DataFrame(
    dict(
        zip(
            ['Feature', *model_list],
            [features, dt_importances, lgb_importances, xgb_importances, cat_importances],
        )
    )
)

# Min-max scale each model's column separately so the models share one axis.
scaler = MinMaxScaler()
importnce_values = scaler.fit_transform(importance_df[model_list].to_numpy())

# Put the feature names back beside the scaled values, then reshape to long
# form: one row per (Feature, Model) pair with the value in 'Importance'.
scaled_importance_df = pd.concat(
    [importance_df['Feature'], pd.DataFrame(data=importnce_values, columns=model_list)],
    axis=1,
).melt(id_vars=['Feature'], var_name='Model', value_name='Importance')

# Visualization: horizontal bars of the scaled importances.
# NOTE(review): only the LightGBM rows are plotted although the title says
# "Comparison" and all four models were prepared — confirm this is intended.
plt.figure(figsize=(12, 8))
sns.barplot(
    data=scaled_importance_df.loc[scaled_importance_df['Model'].eq('LightGBM')],
    x='Importance',
    y='Feature',
    hue='Model',
)
plt.title('Feature Importance Comparison')
plt.legend(loc='upper right')
plt.show()