## Import

In [None]:
import os
import torch
import random

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
from xgboost import XGBClassifier

In [None]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets ``PYTHONHASHSEED`` in the process environment and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Value applied to every generator; it is stringified for the
            environment variable.
    """
    # NOTE(review): torch is imported by this file but is not seeded here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # pin the seed

In [None]:
# Absolute locations of the train / test CSV files.
train_path = '/content/train.csv'
test_path = '/content/test.csv'

# Read both files, train first and then test.
train_df, test_df = map(pd.read_csv, (train_path, test_path))

# The submission frame starts with the test ids; predictions are added later.
submission = pd.DataFrame({"id": test_df['id']})

# Show (rows, columns) of each raw frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

## Data

In [None]:
class data_processing:
    """Drop unused columns and build a train/validation hold-out split.

    Every setting has a default equal to the value that used to be
    hard-coded, so ``data_processing()`` behaves exactly as before.

    Args:
        columns_to_drop: Column name(s) removed by ``preprocessing``.
            Defaults to ``['id']``.
        label_idx: Column name(s) holding the target. Defaults to
            ``['smoking']``. It is always stored as a list, so the targets
            returned by ``split_data`` are DataFrames, not Series.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
            Defaults to ``0.2``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
            Defaults to ``42``.
    """

    def __init__(self, columns_to_drop=None, label_idx=None,
                 test_size=0.2, random_state=42):
        self.columns_to_drop = self._as_list(columns_to_drop, ['id'])

        self.label_idx = self._as_list(label_idx, ['smoking'])
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def _as_list(value, default):
        """Return ``value`` as a new list, falling back to ``default``."""
        if value is None:
            return list(default)
        if isinstance(value, str):
            # A bare column name must not be split into characters.
            return [value]
        return list(value)

    def preprocessing(self, data_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data_df`` without the configured drop columns.

        The input frame is not modified.

        Raises:
            KeyError: If a configured column is missing from ``data_df``.
        """
        return data_df.drop(columns=self.columns_to_drop)

    def split_data(self, data_df: pd.DataFrame):
        """Split ``data_df`` into train and validation features and targets.

        The rows are split with ``train_test_split`` using the configured
        ``test_size`` and ``random_state``; no stratification is applied.

        Returns:
            ``(X_train, y_train, X_val, y_val)``. The ``y`` parts are
            DataFrames containing only the label column(s).
        """
        train_df, val_df = train_test_split(
            data_df,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        X_train = train_df.drop(columns=self.label_idx)
        y_train = train_df[self.label_idx]

        X_val = val_df.drop(columns=self.label_idx)
        y_val = val_df[self.label_idx]

        return X_train, y_train, X_val, y_val

In [None]:
# NOTE: `df` is the data_processing helper instance, not a DataFrame.
df = data_processing()

# Apply the same column clean-up to the train frame, then the test frame.
train_df, test_df = (df.preprocessing(frame) for frame in (train_df, test_df))

# Only the training frame is split into train / validation parts.
X_train, y_train, X_val, y_val = df.split_data(train_df)

In [None]:
# Sanity check: print the shape of every frame produced so far.
for caption, frame in (
    ("train_df : ", train_df),
    ("test_df : ", test_df),
    ("X_train : ", X_train),
    ("y_train : ", y_train),
    ("X_val : ", X_val),
    ("y_val : ", y_val),
):
    print(caption, frame.shape)

## Model

In [None]:
# Hyper-parameters for the XGBoost classifier, gathered in one place.
xgb_params = {
    "random_state": 42,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "objective": "binary:logistic",
}

xgbmodel = XGBClassifier(**xgb_params)

## Train & Validation

In [None]:
# Train the classifier on the training split only; no evaluation set or
# other fit options are passed here.
xgbmodel.fit(X_train, y_train)

In [None]:
def validation(model, X, y) :
    """Print the ROC-AUC of ``model`` on ``(X, y)`` and plot its ROC curve.

    Args:
        model: Classifier exposing ``predict_proba``.
        X: Feature rows to score.
        y: True labels matching ``X``.

    The AUC is printed to four decimals, then a matplotlib figure with the
    ROC curve and a dashed diagonal is shown. Nothing is returned.
    """
    # Column 1 of predict_proba is used as the score for both ROC computations.
    positive_scores = model.predict_proba(X)[:, 1]

    auc = roc_auc_score(y, positive_scores)
    print(f"AUC: {auc:.4f}")
    print("\n")

    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)

    plt.figure(figsize=(10, 7))
    plt.plot(false_pos_rate, true_pos_rate, label=f'AUC = {auc:.2f}')
    plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
    plt.title('ROC Curve', fontsize=15)
    plt.xlabel('False Positive Rate (FPR)', fontsize=12)
    plt.ylabel('True Positive Rate (TPR)', fontsize=12)
    plt.grid(True)
    plt.legend(loc='lower right')
    plt.show()


# Evaluate the fitted model on the held-out validation split.
validation(xgbmodel, X_val, y_val)

## Submission

In [None]:
# Score the preprocessed test frame and keep column 1 of the probabilities.
test_proba = xgbmodel.predict_proba(test_df)
pred = test_proba[:, 1]

In [None]:
# Attach the predicted probabilities next to the ids and write the CSV
# with a header row and without the index column.
submission = submission.assign(smoking=pred)
submission.to_csv("submission.csv", index=False, header=True)