In [None]:
# Thyroid Cancer Classification - Soft-Voting Ensemble (XGBoost + LightGBM + RandomForest) + SMOTE

# ==============================================
# 개발환경 및 정보
# ==============================================
# OS: Windows 10 (제출 시 Linux 호환 필요)
# Python: 3.13.2
# pandas==2.2.3
# numpy==2.2.4
# scikit-learn==1.6.1
# xgboost==2.1.1
# lightgbm==4.3.0
# matplotlib==3.10.0
# seaborn==0.13.2
# imbalanced-learn==0.13.0
# catboost (imported below; version not recorded)

In [8]:
# ==============================================
# 1. 라이브러리 로딩
# ==============================================
import pandas as pd
import numpy as np
import os
import platform
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Report the interpreter version and the host OS so the run environment is visible in the cell output.
print(f"Python Version: {sys.version}")
print(f"OS: {platform.system()}")

Python Version: 3.13.2 | packaged by Anaconda, Inc. | (main, Feb  6 2025, 18:49:14) [MSC v.1929 64 bit (AMD64)]
OS: Windows


In [10]:
# ==============================================
# 2. 데이터 로드
# ==============================================

# Resolve data-file paths so the notebook runs both locally and in the submission environment.
def get_path(filename):
    """Return the location of ``filename`` inside the data directory.

    The absolute ``/data`` directory is used when it exists on this machine;
    otherwise the path is relative to a ``data`` directory under the current
    working directory. ``filename`` is appended as-is with a ``/`` separator.
    """
    if os.path.exists("/data"):
        base_dir = "/data/"
    else:
        base_dir = "data/"
    return base_dir + filename

# Read the training set, the test set and the submission template (in that order)
# from the directory chosen by get_path.
train, test, sample_submission = (
    pd.read_csv(get_path(csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [11]:
# ==============================================
# 3. Preprocessing
# ==============================================
# Split the training frame into features and target; 'ID' is dropped from both feature frames.
X = train.drop(columns=['ID', 'Cancer'])
y = train['Cancer']
X_test = test.drop(columns=['ID'])

# Integer-encode every object-dtype column. Each encoder is fitted on the training
# column only, so the test set never influences the code assignment.
categorical_cols = X.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

    # LabelEncoder encodes a label as its index in le.classes_. Build that lookup
    # explicitly and apply it with a vectorised Series.map instead of scanning
    # le.classes_ once per row and patching the fitted encoder's classes_ afterwards.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}

    # Test values never seen during fitting come back from map() as NaN; they all
    # share one extra code placed directly after the last training code.
    unknown_code = len(le.classes_)
    X_test[col] = X_test[col].map(code_by_label).fillna(unknown_code).astype('int64')


In [12]:
# ==============================================
# 4. Train/Validation Split + SMOTE
# ==============================================
# Hold out 20% of the rows for validation, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

# Oversample the training portion only; the validation rows are left as they are.
# NOTE(review): the object columns of X were label-encoded to integer codes before
# this point, and SMOTE is given those codes as ordinary numeric features —
# confirm this is intended (imbalanced-learn also offers SMOTENC for categoricals).
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [14]:
# ==============================================
# 5. Ensemble model training (XGB + LGBM + RF + CatBoost)
# ==============================================
# All four classifiers share the same seed and are trained on the SMOTE-resampled split.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
lgbm = LGBMClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
cat = CatBoostClassifier(random_state=42, verbose=0)

# Fit in the original order: XGBoost, LightGBM, RandomForest ...
for estimator in (xgb, lgbm, rf):
    estimator.fit(X_train_res, y_train_res)

# ... and CatBoost last, kept as the cell's final expression so the notebook
# still echoes the fitted estimator.
cat.fit(X_train_res, y_train_res)

[LightGBM] [Info] Number of positive: 61360, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1125
[LightGBM] [Info] Number of data points in the train set: 122720, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


<catboost.core.CatBoostClassifier at 0x18f6d13f5c0>

In [15]:
# ==============================================
# 6. Threshold optimisation (soft voting)
# ==============================================
# Probability of class 1 from each voter on the held-out validation split.
# NOTE(review): `cat` is fitted in the training cell but is not part of this
# average (nor of the test-time average) — confirm the omission is intentional.
xgb_pred = xgb.predict_proba(X_val)[:, 1]
lgbm_pred = lgbm.predict_proba(X_val)[:, 1]
rf_pred = rf.predict_proba(X_val)[:, 1]

# Soft voting: unweighted mean of the three probabilities.
ensemble_prob = (xgb_pred + lgbm_pred + rf_pred) / 3

precisions, recalls, thresholds = precision_recall_curve(y_val, ensemble_prob)
# The 1e-8 term keeps the ratio defined when precision and recall are both 0.
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)

# precision_recall_curve returns one more precision/recall point than thresholds:
# the final (precision=1, recall=0) point has no threshold attached. Search only
# the entries that have one, so best_idx is always a valid index into thresholds.
best_idx = np.argmax(f1s[:-1])
best_threshold = thresholds[best_idx]
print(f"Best threshold: {best_threshold:.4f}, Best F1 Score: {f1s[best_idx]:.4f}")

Best threshold: 0.5293, Best F1 Score: 0.3461


In [16]:
# ==============================================
# 7. Validation metrics
# ==============================================
# Turn the ensemble probabilities into 0/1 labels at the tuned threshold.
# NOTE(review): best_threshold was chosen on this same validation split, so the
# scores printed here are likely optimistic for unseen data.
y_val_pred = np.greater_equal(ensemble_prob, best_threshold).astype(int)

val_report = classification_report(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)

print(val_report)
print("Confusion Matrix:\n", val_confusion)

              precision    recall  f1-score   support

           0       0.92      0.86      0.89     15340
           1       0.29      0.43      0.35      2092

    accuracy                           0.81     17432
   macro avg       0.60      0.64      0.62     17432
weighted avg       0.84      0.81      0.82     17432

Confusion Matrix:
 [[13174  2166]
 [ 1201   891]]


In [17]:
# ==============================================
# 8. Test-set prediction and submission file
# ==============================================
# Class-1 probability from the same three voters used for threshold tuning,
# evaluated in the same order (XGBoost, LightGBM, RandomForest).
xgb_test, lgbm_test, rf_test = (
    voter.predict_proba(X_test)[:, 1] for voter in (xgb, lgbm, rf)
)

# Unweighted soft vote, then binarise at the threshold tuned on the validation split.
ensemble_test_prob = (xgb_test + lgbm_test + rf_test) / 3
final_pred = np.greater_equal(ensemble_test_prob, best_threshold).astype(int)

# Resolve the output location once and reuse it for both the write and the message.
# NOTE(review): this is the same directory get_path uses for the input CSVs —
# confirm it is writable in the submission environment.
submission_path = get_path("submission.csv")
sample_submission['Cancer'] = final_pred
sample_submission.to_csv(submission_path, index=False)

print("\n제출 파일이 저장되었습니다 →", submission_path)



제출 파일이 저장되었습니다 → data/submission.csv
