In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [13]:
# Load the training table; the path is resolved relative to the working directory.
df = pd.read_csv("smoking_drinking_train.csv")

# Binary target: 'Y' -> 1, 'N' -> 0 (Series.map turns any other label into NaN).
y = df['DRK_YN'].map({'Y': 1, 'N': 0})

# Feature frame: every column except the target.
X = df.drop(columns=['DRK_YN'])

# Preprocessing Pipeline

In [15]:
# Feature groups routed to the different encoders of the ColumnTransformer.
# Every column belongs to exactly one group so that it is encoded exactly once.

# Columns handled by the OrdinalEncoder.
ordinal_cols = [
    "hear_left",
    "hear_right",
    "urine_protein",
]

# Columns handled by the OneHotEncoder.
# NOTE(review): SMK_stat_type_cd is listed only here (not in ordinal_cols) so it
# is not fed to the model twice; one-hot makes no ordering assumption about its
# codes. Move it to ordinal_cols instead if the codes are known to be ordered.
nominal_cols = [
    "sex",
    "SMK_stat_type_cd",
]

# Continuous measurements that are standardised before modelling.
numeric_cols = [
    "age",
    "height",
    "weight",
    "waistline",
    "sight_left",
    "sight_right",
    "SBP",
    "DBP",
    "BLDS",
    "tot_chole",
    "HDL_chole",
    "LDL_chole",
    "triglyceride",
    "hemoglobin",
    "serum_creatinine",
    "SGOT_AST",
    "SGOT_ALT",
    "gamma_GTP",
]

In [16]:
# Per-group preprocessing. Each encoder is wrapped in a one-step Pipeline so the
# step names ('scaler', 'ordinal', 'onehot') remain addressable as nested
# parameters of the ColumnTransformer.

# Standardise numeric features (zero mean, unit variance).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    # Encode categories that were not seen during fit as -1 instead of raising
    # at transform time. This mirrors handle_unknown='ignore' on the one-hot
    # encoder, so a level missing from a training fold (or appearing only at
    # inference time) does not fail the pipeline.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_transformer = Pipeline(steps=[
    # Unknown categories are encoded as an all-zero row.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. Columns that appear in none of
# the three lists are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('nom', nominal_transformer, nominal_cols)
    ])

In [17]:
def _with_preprocessing(classifier):
    """Return a two-step pipeline: the shared preprocessor, then `classifier`."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])


# One pipeline per candidate model; both use the same preprocessing definition
# and the same step names, so grid keys use the 'classifier__' prefix for both.
xgb_pipeline = _with_preprocessing(XGBClassifier())
knn_pipeline = _with_preprocessing(KNeighborsClassifier())

In [20]:
# Hyper-parameter grids. The 'classifier__' prefix addresses the pipeline step
# named 'classifier'.

# XGBoost: 3 x 3 = 9 candidates. max_depth=None leaves the library default.
xgb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 3, 7],
}

# KNN: 2 x 2 = 4 candidates. The neighbour counts are the values produced by
# range(5, 25, 10), written out explicitly.
knn_param_grid = {
    'classifier__n_neighbors': [5, 15],
    'classifier__weights': ['uniform', 'distance'],
}

# Both searches use the same protocol: 10-fold cross-validation scored by
# ROC AUC, with candidate fits parallelised across all available cores.
_search_options = {'n_jobs': -1, 'cv': 10, 'scoring': 'roc_auc'}

xgb_search = GridSearchCV(estimator=xgb_pipeline, param_grid=xgb_param_grid, **_search_options)
knn_search = GridSearchCV(estimator=knn_pipeline, param_grid=knn_param_grid, **_search_options)

In [19]:
# Run the XGBoost grid search on the full training data, then keep the winning
# pipeline (refit by GridSearchCV) together with the parameters that selected it.
xgb_search.fit(X, y)
best_xgb_model, best_xgb_params = xgb_search.best_estimator_, xgb_search.best_params_



KeyboardInterrupt: 

In [None]:
# Report the selected XGBoost pipeline and its hyper-parameters, one per line.
print(
    f"Best XGBoost model: {best_xgb_model}",
    f"Best XGBoost parameters: {best_xgb_params}",
    sep="\n",
)

In [None]:
# Run the KNN grid search on the full training data, then keep the winning
# pipeline (refit by GridSearchCV) together with the parameters that selected it.
knn_search.fit(X, y)
best_knn_model, best_knn_params = knn_search.best_estimator_, knn_search.best_params_

In [None]:
# Report the selected KNN pipeline and its hyper-parameters, one per line.
print(
    f"Best KNN model: {best_knn_model}",
    f"Best KNN parameters: {best_knn_params}",
    sep="\n",
)

In [None]:
import joblib
# Persist both fitted pipelines (preprocessing + classifier) to the working
# directory so they can be reloaded later with joblib.load.
joblib.dump(value=best_xgb_model, filename='xgb_model.joblib')
joblib.dump(value=best_knn_model, filename='knn_model.joblib')