In [39]:
# Thyroid Cancer Classification - XGBoost + SMOTE

# ==============================================
# Development environment and information
# ==============================================
# OS: Windows 10
# Python: 3.13.2 (as reported by sys.version in the library-loading cell's output)
# Libraries and versions:
# pandas==2.2.3
# numpy==2.2.4
# scikit-learn==1.6.1
# imbalanced-learn==0.13.0
# xgboost==2.1.1
# matplotlib==3.10.0
# seaborn==0.13.2
# Other: ipykernel, joblib, scipy, etc.
# Submission file: submission.csv (same format as sample_submission.csv)

In [40]:
# ==============================================
# 1. Load libraries
# ==============================================
import pandas as pd
import numpy as np
import platform
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Report the interpreter version and the operating system this notebook ran on.
for label, value in (("Python Version:", sys.version), ("OS:", platform.system())):
    print(label, value)

Python Version: 3.13.2 | packaged by Anaconda, Inc. | (main, Feb  6 2025, 18:49:14) [MSC v.1929 64 bit (AMD64)]
OS: Windows


In [41]:
# ==============================================
# 2. Load data
# ==============================================

# Resolve data paths automatically so the notebook works both locally and in the submission environment.
def get_path(filename):
    """Return the path of `filename` inside the data directory.

    The absolute "/data/" directory is used when it exists; otherwise the
    relative "data/" directory is used. The result is a plain string built
    by concatenation, always with a forward slash.
    """
    if os.path.exists("/data"):
        base_dir = "/data/"
    else:
        base_dir = "data/"
    return base_dir + filename

# Read the train, test and sample-submission CSVs (in that order) from the resolved data directory.
train, test, sample_submission = (
    pd.read_csv(get_path(csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [42]:
# ==============================================
# 3. Preprocessing
# ==============================================
# Features are every training column except the ID and the target; the test frame only drops the ID.
X = train.drop(columns=['ID', 'Cancer'])
y = train['Cancer']
X_test = test.drop(columns=['ID'])

# Integer-encode each object-dtype column, fitting the encoder on the training data only.
categorical_cols = X.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # Replace test categories the encoder never saw with the '<UNK>' sentinel.
    # isin/where is vectorised, so le.classes_ is no longer rescanned once per row
    # from a Python lambda.
    # NOTE: isin treats a missing value as known when the training column also
    # contained one, so such rows get the same code in train and test.
    X_test[col] = X_test[col].where(X_test[col].isin(le.classes_), '<UNK>')
    # Register the sentinel as an extra class so transform() accepts it.
    le.classes_ = np.append(le.classes_, '<UNK>')
    X_test[col] = le.transform(X_test[col])

In [43]:
# ==============================================
# 4. Train/Validation Split + SMOTE
# ==============================================
# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample the training portion only; X_val / y_val are left as split.
# NOTE(review): X_train holds label-encoded categorical columns (see the preprocessing
# cell). SMOTE presumably synthesises rows by interpolating between neighbours, which can
# yield non-integer category codes; consider SMOTENC. Confirm before changing.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [44]:
# ==============================================
# 5. Model training
# ==============================================
# Fit an XGBoost classifier on the SMOTE-resampled training data.
xgb_params = {"eval_metric": 'logloss', "random_state": 42}
model = XGBClassifier(**xgb_params)
model.fit(X_train_res, y_train_res)
# The message below reads "model training complete".
print("모델 학습 완료")

모델 학습 완료


In [45]:
# ==============================================
# 6. Threshold optimisation
# ==============================================
# Predicted probability of the positive class (column 1) for every validation row.
y_val_prob = model.predict_proba(X_val)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_prob)
# F1 at every point of the curve; the epsilon keeps the division defined when
# precision and recall are both zero.
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)

# precision_recall_curve returns one more precision/recall point than thresholds
# (the final point has no matching threshold). Search only the indices that exist
# in `thresholds` so the lookup below can never run past its end.
best_idx = np.argmax(f1s[:len(thresholds)])
best_threshold = thresholds[best_idx]
print(f"Best Threshold: {best_threshold:.4f}, Best F1 Score: {f1s[best_idx]:.4f}")

Best Threshold: 0.6728, Best F1 Score: 0.3444


In [46]:
# ==============================================
# 7. Print evaluation metrics
# ==============================================
# A row is labelled positive when its predicted probability reaches the tuned threshold.
is_positive = y_val_prob >= best_threshold
y_val_pred = is_positive.astype(int)

val_report = classification_report(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)
print(val_report)
print("Confusion Matrix:\n", val_confusion)


              precision    recall  f1-score   support

           0       0.91      0.91      0.91     15340
           1       0.35      0.34      0.34      2092

    accuracy                           0.84     17432
   macro avg       0.63      0.63      0.63     17432
weighted avg       0.84      0.84      0.84     17432

Confusion Matrix:
 [[14000  1340]
 [ 1378   714]]


In [47]:
# ==============================================
# 8. Predict on the test data and save the submission
# ==============================================
# Apply the threshold chosen on the validation set to the test probabilities.
test_prob = model.predict_proba(X_test)[:, 1]
final_pred = (test_prob >= best_threshold).astype(int)
sample_submission['Cancer'] = final_pred

# Resolve the output location once and reuse it for both the write and the message.
submission_path = get_path("submission.csv")
sample_submission.to_csv(submission_path, index=False)

# The message below reads "the submission file has been saved".
print("\n제출 파일이 저장되었습니다 →", submission_path)


제출 파일이 저장되었습니다 → data/submission.csv
