In [7]:
# Import Library
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score
from imblearn.over_sampling import SMOTE

In [8]:
# Load Data
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [9]:
# Data Preprocessing
# SUBCLASS가 범주형이기 때문에 LabelEncoder 사용
le_subclass = LabelEncoder()
train['SUBCLASS'] = le_subclass.fit_transform(train['SUBCLASS'])

In [10]:
# 특성 및 타겟 변수 분리
X = train.drop(columns=['SUBCLASS', 'ID'])
y_subclass = train['SUBCLASS']

In [11]:
# 범주형 변수 식별 및 Frequency Encoding
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    freq = X[col].value_counts()
    X[col] = X[col].map(freq)
    test[col] = test[col].map(freq)

In [12]:
### SMOTE를 사용한 오버샘플링 ###
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_subclass)

In [13]:
### 1. RandomForest 대신 CatBoost 사용 ###
# CatBoost, XGBoost, LightGBM 모델을 사용한 앙상블 구성
catboost_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    random_seed=42,
    verbose=0
)

xgb_model = xgb.XGBClassifier(
    n_estimators=233,
    max_depth=5,
    learning_rate=0.09066897671128973,
    subsample=0.7445604876879595,
    colsample_bytree=0.6030050347739594,
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=259,
    max_depth=3,
    learning_rate=0.040719770187575215,
    subsample=0.9981840021295828,
    colsample_bytree=0.9255086919552659,
    random_state=42
)

In [15]:
from sklearn.linear_model import LogisticRegression

### 2. Stacking 앙상블 ###
# StackingClassifier 정의
estimators = [
    ('catboost', catboost_model),
    ('xgb', xgb_model),
    ('lgb', lgb_model)
]

stacking_model = StackingClassifier(
    estimators=estimators, 
    final_estimator=LogisticRegression(),  # 최종 메타 모델
    cv=5
)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# 스태킹 모델 훈련
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196293
[LightGBM] [Info] Number of data points in the train set: 16348, number of used features: 4072
[LightGBM] [Info] Start training from score -3.281866
[LightGBM] [Info] Start training from score -3.241956
[LightGBM] [Info] Start training from score -3.257730
[LightGBM] [Info] Start training from score -3.262510
[LightGBM] [Info] Start training from score -3.268921
[LightGBM] [Info] Start training from score -3.267314
[LightGBM] [Info] Start training from score -3.254555
[LightGBM] [Info] Start training from score -3.256141
[LightGBM] [Info] Start training from score -3.272141
[LightGBM] [Info] Start training from score -3.248236
[LightGBM] [Info] Start training from score -3.248236
[LightGBM] [Info] Start training from score -3.262510
[LightGBM] [Info] Start training from score -3.265710
[Ligh

In [None]:
# 검증 세트에 대한 예측 및 성능 평가
y_val_pred = stacking_model.predict(X_val)
y_val_pred_proba = stacking_model.predict_proba(X_val)
validation_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Log Loss: {log_loss(y_val, y_val_pred_proba)}")
print(f"Validation Macro F1 Score: {validation_macro_f1}")

In [None]:
# Inference
X_test = test.drop(columns=['ID'])

In [None]:
# 최종 예측 수행
predictions = stacking_model.predict(X_test)
original_labels = le_subclass.inverse_transform(predictions)

In [None]:
# Submission
submission = pd.read_csv("./sample_submission.csv")
submission["SUBCLASS"] = original_labels
submission.to_csv('./submission_with_catboost_0.7.csv', encoding='UTF-8-sig', index=False)