In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

In [5]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Age,Gender,Country,Race,Family_Background,Radiation_History,Iodine_Deficiency,Smoke,Weight_Risk,Diabetes,Nodule_Size,TSH_Result,T4_Result,T3_Result,Cancer
0,TRAIN_00000,80,M,CHN,ASN,Positive,Exposed,Sufficient,Non-Smoker,Not Obese,No,0.650355,2.784735,6.744603,2.57582,1
1,TRAIN_00001,37,M,NGA,ASN,Positive,Unexposed,Sufficient,Smoker,Obese,No,2.95043,0.911624,7.303305,2.505317,1
2,TRAIN_00002,71,M,CHN,MDE,Positive,Unexposed,Sufficient,Non-Smoker,Not Obese,Yes,2.200023,0.717754,11.137459,2.38108,0
3,TRAIN_00003,40,F,IND,HSP,Negative,Unexposed,Sufficient,Non-Smoker,Obese,No,3.370796,6.84638,10.175254,0.753023,0
4,TRAIN_00004,53,F,CHN,CAU,Negative,Unexposed,Sufficient,Non-Smoker,Not Obese,No,4.230048,0.439519,7.19445,0.569356,1


In [6]:
df.shape

(87159, 16)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87159 entries, 0 to 87158
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 87159 non-null  object 
 1   Age                87159 non-null  int64  
 2   Gender             87159 non-null  object 
 3   Country            87159 non-null  object 
 4   Race               87159 non-null  object 
 5   Family_Background  87159 non-null  object 
 6   Radiation_History  87159 non-null  object 
 7   Iodine_Deficiency  87159 non-null  object 
 8   Smoke              87159 non-null  object 
 9   Weight_Risk        87159 non-null  object 
 10  Diabetes           87159 non-null  object 
 11  Nodule_Size        87159 non-null  float64
 12  TSH_Result         87159 non-null  float64
 13  T4_Result          87159 non-null  float64
 14  T3_Result          87159 non-null  float64
 15  Cancer             87159 non-null  int64  
dtypes: float64(4), int64(2

In [8]:
df.isnull().sum()

ID                   0
Age                  0
Gender               0
Country              0
Race                 0
Family_Background    0
Radiation_History    0
Iodine_Deficiency    0
Smoke                0
Weight_Risk          0
Diabetes             0
Nodule_Size          0
TSH_Result           0
T4_Result            0
T3_Result            0
Cancer               0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,Age,Nodule_Size,TSH_Result,T4_Result,T3_Result,Cancer
count,87159.0,87159.0,87159.0,87159.0,87159.0,87159.0
mean,50.860244,2.508098,5.056871,8.248796,2.005002,0.119999
std,21.638687,1.441928,2.861229,2.165975,0.867013,0.324962
min,14.0,0.0,0.1,4.5,0.5,0.0
25%,32.0,1.270332,2.582845,6.372031,1.254836,0.0
50%,51.0,2.520941,5.059085,8.236789,2.004101,0.0
75%,70.0,3.760938,7.541586,10.126981,2.758032,0.0
max,88.0,5.0,10.0,12.0,3.5,1.0


In [10]:
df.columns

Index(['ID', 'Age', 'Gender', 'Country', 'Race', 'Family_Background',
       'Radiation_History', 'Iodine_Deficiency', 'Smoke', 'Weight_Risk',
       'Diabetes', 'Nodule_Size', 'TSH_Result', 'T4_Result', 'T3_Result',
       'Cancer'],
      dtype='object')

In [11]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_thyroid_data(train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Target
    y_train = train['Cancer']
    train.drop(columns=['Cancer'], inplace=True)

    # 저장용 ID
    train_ids = train['ID']
    test_ids = test['ID']

    # ID 제거
    train.drop(columns=['ID'], inplace=True)
    test.drop(columns=['ID'], inplace=True)

    # -------------------------
    # 범주형 인코딩
    # -------------------------
    categorical_cols = train.select_dtypes(include='object').columns
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train[col], test[col]]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
        label_encoders[col] = le

    # -------------------------
    # 수치형 정규화 (표준화)
    # -------------------------
    numeric_cols = train.select_dtypes(include='number').columns
    scaler = StandardScaler()
    train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
    test[numeric_cols] = scaler.transform(test[numeric_cols])

    return train, y_train, test, train_ids, test_ids

In [12]:
X_train, y_train, X_test, train_ids, test_ids = preprocess_thyroid_data('train.csv', 'test.csv')

In [19]:
import pandas as pd
y_train.value_counts(normalize=True)

Cancer
0    0.880001
1    0.119999
Name: proportion, dtype: float64

In [24]:
# -----------------------
# 모델 학습 함수 (SMOTE + class_weight)
# -----------------------
def train_with_smote(X, y, X_test, n_splits=5):
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    feature_importance = pd.DataFrame()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n🟢 Fold {fold + 1}")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # ✅ SMOTE 적용
        sm = SMOTE(random_state=42)
        X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

        model = lgb.LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.02,
            max_depth=5,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            class_weight='balanced',  # ✅ 불균형 대응
            random_state=42
        )

        model.fit(
            X_resampled, y_resampled,
            eval_set=[(X_val, y_val)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=50)
            ]
        )

        val_pred = model.predict(X_val)
        oof_preds[val_idx] = val_pred
        test_preds += model.predict(X_test) / n_splits

        print(classification_report(y_val, val_pred, digits=4))

        # feature importance 저장
        fold_imp = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importances_,
            'fold': fold + 1
        })
        feature_importance = pd.concat([feature_importance, fold_imp], axis=0)

    overall_f1 = f1_score(y, oof_preds)
    print(f"\n🎯 Overall F1 Score: {overall_f1:.4f}")

    return np.round(test_preds).astype(int), feature_importance

In [25]:
X_train, y_train, X_test, train_ids, test_ids = preprocess_thyroid_data("train.csv", "test.csv")
y_pred, feature_importance = train_with_smote(X_train, y_train, X_test)

# 제출 파일 저장
submission = pd.DataFrame({
    'ID': test_ids,
    'Cancer': y_pred
})
submission.to_csv("submission_smote.csv", index=False)
print("✅ submission_smote.csv 저장 완료")



🟢 Fold 1
[LightGBM] [Info] Number of positive: 61360, number of negative: 61360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3213
[LightGBM] [Info] Number of data points in the train set: 122720, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 50 rounds
[50]	valid_0's binary_logloss: 0.531857
[100]	valid_0's binary_logloss: 0.469028
[150]	valid_0's binary_logloss: 0.43042
[200]	valid_0's binary_logloss: 0.405119
[250]	valid_0's binary_logloss: 0.390555
[300]	valid_0's binary_logloss: 0.379688
[350]	valid_0's binary_logloss: 0.371215
[400]	valid_0's binary_logloss: 0.364408
[450]	valid_0's binary_logloss: 0.359589
[500]	valid_0's binary_logloss: 0.355971
[550]	valid_0's binary_logloss: 0.351936
[600]	valid_0's binary_logloss: 0.349389