In [None]:
# 기본 라이브러리
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 전처리 라이브러리
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# 모델링 라이브러리
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Silence library warnings to keep cell output clean (note: this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data and add the derived features in one chained step.
#   Discount10 : 1 when Discount_offered is greater than 10, else 0
#   Weight_log : log(1 + Weight_in_gms)
#   BlockMode  : "<Warehouse_block>_<Mode_of_Shipment>" combined label
train_df = pd.read_csv('data/Train.csv').assign(
    Discount10=lambda frame: frame['Discount_offered'].gt(10).astype(int),
    Weight_log=lambda frame: np.log1p(frame['Weight_in_gms']),
    BlockMode=lambda frame: (
        frame['Warehouse_block'].astype(str) + '_' + frame['Mode_of_Shipment'].astype(str)
    ),
)

In [None]:
# Convert the categorical columns to integer codes.
# One LabelEncoder instance is refitted for every column, so once the loop
# finishes `labelEn` only remembers the classes of the last column ('BlockMode').
# NOTE(review): the encoded values overwrite train_df in place, so running this
# cell a second time re-encodes the integer codes (as strings) instead of the
# original labels.
labelEn = LabelEncoder()
categorical_cols = ('Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender', 'BlockMode')
for col in categorical_cols:
    as_text = train_df[col].astype(str)
    train_df[col] = labelEn.fit_transform(as_text)

# Feature columns passed to the model
columns = [
    'Discount10',
    'Weight_log',
    'BlockMode',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Product_importance',
    'Discount_offered',
    'Customer_care_calls',
    'Customer_rating',
]

# Target vector and feature matrix
y = train_df['Reached.on.Time_Y.N']
X = train_df[columns]

In [None]:
# Hold-out split: X_test / y_test are reserved for the final evaluation only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Carve a validation set out of the training portion.
# CatBoost uses eval_set to pick the best iteration (use_best_model defaults to
# True whenever an eval_set is supplied), so passing the test set there would
# select the model on the very data that is later used to report the AUC.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, random_state = 42, stratify = y_train
)

# Class-imbalance handling: oversample only the rows the model is fitted on,
# so the validation and test sets keep their original class distribution.
smote = SMOTE(random_state = 42)
X_train_smt, y_train_smt = smote.fit_resample(X_fit, y_fit)

# Modeling
# CatBoost, LightGBM and XGBoost (plus an ensemble of them) were tried; CatBoost performed best.
cat_model = CatBoostClassifier(
    iterations = 500,
    learning_rate = 0.05,
    depth = 6,
    eval_metric = 'AUC',
    random_seed = 42,
    verbose = 100
)

# Monitor AUC / select the best iteration on the validation split, not on the test set.
cat_model.fit(X_train_smt, y_train_smt, eval_set = (X_val, y_val))

In [None]:
# 1. ROC curve (checks how well the model separates the two classes on the test set)
y_probs = cat_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
auc_score = roc_auc_score(y_test, y_probs)

# One figure, two side-by-side panels, drawn through explicit Axes handles.
fig, (ax_roc, ax_imp) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve plus the diagonal of a random classifier for reference
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {auc_score:.4f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlabel('False Positive Rate (실수율)')
ax_roc.set_ylabel('True Positive Rate (재현율)')
ax_roc.set_title('Receiver Operating Characteristic (ROC)')
ax_roc.legend(loc="lower right")

# 2. Feature importance (which features the model relied on most)
# Importances come from CatBoost and are labelled with the training column names.
feature_names = X_train.columns
importances = cat_model.get_feature_importance()
feature_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Right panel: horizontal bars, most important feature first
sns.barplot(x=feature_imp, y=feature_imp.index, palette='viridis', ax=ax_imp)
ax_imp.set_xlabel('Feature Importance Score')
ax_imp.set_ylabel('Features')
ax_imp.set_title('CatBoost Feature Importance')

fig.tight_layout()
plt.show()