### Import

In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

from sklearn.metrics import f1_score

### Data Load

In [25]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

### Data Preprocessing

In [26]:
X = train.drop(columns=['ID', 'Cancer'])
y = train['Cancer']

x_test = test.drop('ID', axis=1)

In [27]:
categorical_features = [col for col in X.columns if X[col].dtype == 'object']
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # for val in np.unique(x_test[col]):
    #     if val not in le.classes_:
    #         le.classes_ = np.append(le.classes_, val)
    x_test[col] = le.transform(x_test[col])

In [37]:
# T3/T4
# N_size/TSH
X["T3_T4_Ratio"] = X["T3_Result"] / X["T4_Result"]
X["N_size_TSH_Ratio"] = X["Nodule_Size"] / X["TSH_Result"]

x_test["T3_T4_Ratio"] = x_test["T3_Result"] / x_test["T4_Result"]
x_test["N_size_TSH_Ratio"] = x_test["Nodule_Size"] / x_test["TSH_Result"]


### Train

In [None]:
def train_and_eval(X_tr, y_tr, X_val, y_val, label):
    model = XGBClassifier(random_state=42)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"[{label}] Validation F1-score: {f1:.4f}")
    return model, f1

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [39]:
# (1) SMOTE 미적용
model_raw, f1_raw = train_and_eval(X_train, y_train, X_val, y_val, "RAW")

# (2) SMOTE 적용
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
model_smote, f1_smote = train_and_eval(X_train_smote, y_train_smote, X_val, y_val, "SMOTE")

[RAW] Validation F1-score: 0.2972




[SMOTE] Validation F1-score: 0.3215


In [40]:
# SMOTE 적용 여부에 따라 최종 학습 데이터 구성
if f1_smote >= f1_raw:
    smote_full = SMOTE(random_state=42)
    X_final, y_final = smote_full.fit_resample(X, y)
else:
    X_final, y_final = X, y

# 최종 모델 학습
final_model = XGBClassifier(random_state=42)
final_model.fit(X_final, y_final)




In [34]:
print(len(X[y==1]))
print(len(X[y==0]))
print(len(X))

print("------------------------------------")
print(len(X_final[y_final==1]))
print(len(X_final[y_final==0]))
print(len(X_final))

10459
76700
87159
------------------------------------
76700
76700
153400


### Val

In [41]:
val_result = final_model.predict(X_val)
score = f1_score(val_result, y_val)
print(score)

0.4023952095808383


### Predict

In [42]:
final_pred = final_model.predict(x_test)
print(final_pred)

[0 0 1 ... 0 0 0]


### Submission

In [43]:
submission = pd.read_csv('sample_submission.csv')

In [44]:
submission['Cancer'] = final_pred

In [45]:
submission.to_csv('feature_submit.csv', index=False)