### Import

In [47]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score

### Data Load

In [48]:
# Read the training and test CSVs from ./dataset in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

### Data Preprocessing

In [49]:
# Target column for training.
y = train['Cancer']

# Model inputs: everything except the row identifier (and, for train, the target).
X = train.drop(['ID', 'Cancer'], axis=1)
x_test = test.drop(columns=['ID'])

In [50]:
# Raw value -> integer code for each two-level column.
binary_encodings = {
    "Family_Background": {"Positive": 1, "Negative": 0},
    "Gender": {"M": 0, "F": 1},
    "Radiation_History": {"Exposed": 1, "Unexposed": 0},
    "Iodine_Deficiency": {"Deficient": 1, "Sufficient": 0},
    "Smoke": {"Smoker": 1, "Non-Smoker": 0},
    "Weight_Risk": {"Obese": 1, "Not Obese": 0},
    "Diabetes": {"Yes": 1, "No": 0},
}

# NOTE: this cell mutates X and x_test in place and is not idempotent.
# Running it twice without rebuilding X / x_test divides Age again and maps
# the already-encoded values to NaN.
for frame in (X, x_test):
    # Integer-divide Age by 10 (e.g. 47 -> 4).
    frame['Age'] = frame['Age'] // 10
    for column, codes in binary_encodings.items():
        # Series.map turns any value that is not a key of `codes` into NaN.
        frame[column] = frame[column].map(codes)

categorical_features = ['Country', 'Race']
for feature in categorical_features:
    # Learn the label -> integer mapping from the training frame only.
    encoder = LabelEncoder().fit(X[feature])
    X[feature] = encoder.transform(X[feature])
    # Reuse the same mapping for the test frame; transform() raises
    # ValueError if x_test contains a label that was not present in X.
    x_test[feature] = encoder.transform(x_test[feature])

In [51]:
# Derived features, added in place to both the training and the test frame.
# The new columns are created in the same order on both frames.
risk_flag_columns = [
    "Family_Background",
    "Radiation_History",
    "Iodine_Deficiency",
    "Smoke",
    "Diabetes",
]

for frame in (X, x_test):
    t3 = frame["T3_Result"]
    t4 = frame["T4_Result"]
    tsh = frame["TSH_Result"]
    nodule_size = frame["Nodule_Size"]

    # T3 / T4
    frame["T3_T4_Ratio"] = t3 / t4
    # Nodule size / TSH
    frame["N_size_TSH_Ratio"] = nodule_size / tsh
    frame["Sum_Hormone"] = t3 + t4 + tsh

    # Add up the 0/1 risk indicators from left to right.
    risk_score = frame[risk_flag_columns[0]]
    for flag in risk_flag_columns[1:]:
        risk_score = risk_score + frame[flag]
    frame["Risk_Score"] = risk_score

    # Floor the nodule size to a whole number and store it as an integer bucket.
    frame["Nodule_Size_Group"] = (nodule_size // 1).astype(int)


### Train

In [52]:
def train_and_eval(X_tr, y_tr, X_val, y_val, label, params=None):
    """Fit an LGBMClassifier and report its F1-score on a validation set.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels, passed unchanged to ``model.fit``.
    X_val, y_val
        Validation features and their true labels, used only for scoring.
    label : str
        Tag printed in front of the score so runs can be told apart.
    params : dict, optional
        Keyword arguments for ``LGBMClassifier`` that override or extend the
        default hyperparameters below. ``None`` (the default) keeps the
        defaults unchanged.

    Returns
    -------
    tuple
        ``(model, f1)``: the fitted classifier and the value returned by
        ``f1_score(y_val, model.predict(X_val))``.
    """
    # Defaults used by the notebook; callers can now tune them via `params`
    # instead of copying this function.
    model_params = {
        "n_estimators": 500,
        "learning_rate": 0.01,
        "num_leaves": 31,
        "max_depth": 6,
        "class_weight": "balanced",
        "random_state": 42,
    }
    if params:
        model_params.update(params)

    model = LGBMClassifier(**model_params)
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"[{label}] Validation F1-score: {f1:.4f}")
    return model, f1

### Train / Validation Split (SMOTE not applied)

In [53]:
# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [54]:
# (1) Baseline without SMOTE: fit on the training split, score on the hold-out.
model_raw, f1_raw = train_and_eval(
    X_tr=X_train,
    y_tr=y_train,
    X_val=X_val,
    y_val=y_val,
    label="RAW",
)

# Unrounded score (train_and_eval already prints it to four decimals).
print(f"f1 raw : {f1_raw}")


[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1832
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[RAW] Validation F1-score: 0.4669
f1 raw : 0.4669024759979788


In [55]:
# Train the final model on every labelled row (training + validation splits).
X_final = X
y_final = y

# Same hyperparameter values as the ones inside train_and_eval.
final_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    class_weight='balanced',
    random_state=42,
)
final_model = LGBMClassifier(**final_params)
final_model.fit(X_final, y_final)


[LightGBM] [Info] Number of positive: 10459, number of negative: 76700
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1833
[LightGBM] [Info] Number of data points in the train set: 87159, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


### Validation

In [56]:
# NOTE: final_model was fit on all of X, and X_val is a subset of X, so this
# is an in-sample sanity check rather than a held-out estimate. The held-out
# figure is the "[RAW] Validation F1-score" printed earlier.
val_result = final_model.predict(X_val)
# f1_score expects (y_true, y_pred); the arguments were previously reversed.
score = f1_score(y_val, val_result)
print(score)

0.47364438839848677


### Predict

In [57]:
# Predict a label for every test row, then print how many rows were
# predicted as class 1.
final_pred = final_model.predict(x_test)
positive_count = int((final_pred == 1).sum())
print(positive_count)

5811


In [43]:
# Class counts of a previously saved submission file. NOTE(review): no
# visible cell in this notebook writes 'LGBM_nodule_size_group.csv', so this
# cell only works if the file already exists on disk.
submission = pd.read_csv('LGBM_nodule_size_group.csv')
cancer_labels = submission['Cancer']
print(int((cancer_labels == 0).sum()))
print(int((cancer_labels == 1).sum()))

40393
5811


### Submission

In [44]:
# Load the sample submission file (presumably the template whose 'Cancer'
# column gets overwritten with the predictions).
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [45]:
# Put the test-set predictions into the 'Cancer' column. Values are assigned
# by position, so the rows must be in the same order as x_test.
submission = submission.assign(Cancer=final_pred)

In [46]:
# Save the filled-in submission without writing the DataFrame index.
submission.to_csv(path_or_buf='new.csv', index=False)