In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Use smaller training set for hyperparameter tuning

In [3]:
# Load the smaller dataset that is used only for the hyperparameter search.
df = pd.read_csv("smoking_drinking_hyperparams.csv")

# Split off the target column; its 'Y'/'N' labels are mapped to 1/0.
y_train = df['DRK_YN'].map({'Y': 1, 'N': 0})
X_train = df.drop(columns=['DRK_YN'])

## Preprocessing Pipeline

In [4]:
# Feature groups consumed by the ColumnTransformer. Each column must appear in
# exactly one group, otherwise it is encoded twice and ends up duplicated in the
# feature matrix.

# Columns routed to the OrdinalEncoder.
ordinal_cols = [
    "hear_left",
    "hear_right",
    "urine_protein",
]

# Columns routed to the OneHotEncoder.
# "SMK_stat_type_cd" used to be listed under ordinal_cols as well, which sent the
# same column through both encoders. It is kept here only.
# NOTE(review): if the smoking-status codes are meant as an ordered scale, move
# the column to ordinal_cols instead -- but keep it in a single list.
nominal_cols = [
    "sex",
    "SMK_stat_type_cd",
]

# Columns routed to the StandardScaler.
numeric_cols = [
    "age",
    "height",
    "weight",
    "waistline",
    "sight_left",
    "sight_right",
    "SBP",
    "DBP",
    "BLDS",
    "tot_chole",
    "HDL_chole",
    "LDL_chole",
    "triglyceride",
    "hemoglobin",
    "serum_creatinine",
    "SGOT_AST",
    "SGOT_ALT",
    "gamma_GTP",
]

In [5]:
# Numeric features are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Ordinal features are integer-encoded. A category that was not present when the
# encoder was fitted (e.g. a rare level missing from one cross-validation training
# fold) is encoded as -1 instead of raising, mirroring the tolerant
# handle_unknown='ignore' setting of the one-hot encoder below.
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal features are one-hot encoded; unseen categories become an all-zero row.
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. Columns not listed in any group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('nom', nominal_transformer, nominal_cols)
    ])

In [6]:
# Candidate models for the search: identical preprocessing, different final
# 'classifier' step. Both classifiers start from their library defaults.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])

In [7]:
# Hyperparameter candidates. Keys use the '<step>__<parameter>' form so that the
# values are forwarded to the 'classifier' step of each pipeline.
xgb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 3, 7],
}

knn_param_grid = {
    'classifier__n_neighbors': [*range(5, 25, 10)],  # evaluates to [5, 15]
    'classifier__weights': ['uniform', 'distance'],
}

# Exhaustive 5-fold search per model, scored on accuracy, using all CPU cores.
xgb_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
knn_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [8]:
# Run the XGBoost grid search on the tuning data, then keep the winning
# parameter combination and the corresponding best estimator.
xgb_search.fit(X_train, y_train)
best_xgb_params = xgb_search.best_params_
best_xgb_model = xgb_search.best_estimator_

In [20]:
# Report the hyperparameters selected by the XGBoost grid search.
message = f"Best XGBoost parameters: {best_xgb_params}"
print(message)

Best XGBoost parameters: {'classifier__max_depth': 3, 'classifier__n_estimators': 300}


In [10]:
# Run the KNN grid search on the tuning data, then keep the winning
# parameter combination and the corresponding best estimator.
knn_search.fit(X_train, y_train)
best_knn_params = knn_search.best_params_
best_knn_model = knn_search.best_estimator_

In [11]:
# Report the hyperparameters selected by the KNN grid search.
message = f"Best KNN parameters: {best_knn_params}"
print(message)

Best KNN parameters: {'classifier__n_neighbors': 15, 'classifier__weights': 'distance'}


# Fit models with best parameters to entire training dataset

In [21]:
# Load the full training set; note that this rebinds `df`, replacing the tuning data.
df = pd.read_csv("smoking_drinking_train.csv")

# Split off the target column; its 'Y'/'N' labels are mapped to 1/0.
y = df['DRK_YN'].map({'Y': 1, 'N': 0})
X = df.drop(columns=['DRK_YN'])

In [22]:
# Fresh, unfitted preprocessing objects for the final models.

# Numeric features are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Ordinal features are integer-encoded. A category that was not present in the
# training data is encoded as -1 instead of raising, so the saved pipeline does
# not fail at prediction time on an unseen level; this mirrors the tolerant
# handle_unknown='ignore' setting of the one-hot encoder below.
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal features are one-hot encoded; unseen categories become an all-zero row.
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. Columns not listed in any group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('nom', nominal_transformer, nominal_cols)
    ])

In [23]:
# Final pipelines, configured from the grid-search results instead of hand-copied
# literals, so that changing the grid or the tuning data cannot leave these
# settings stale. The search result keys already use the 'classifier__<param>'
# form, so they can be applied to the pipeline directly.
# (Recorded run: max_depth=3, n_estimators=300 for XGBoost;
#  n_neighbors=15, weights='distance' for KNN.)
# NOTE(review): both pipelines reference the same `preprocessor` object, so fitting
# one pipeline also refits the preprocessor seen by the other.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
]).set_params(**best_xgb_params)

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
]).set_params(**best_knn_params)

In [24]:
# Train the final XGBoost pipeline (preprocessing + classifier) on the full training set.
xgb_pipeline.fit(X=X, y=y)

In [25]:
# Train the final KNN pipeline (preprocessing + classifier) on the full training set.
knn_pipeline.fit(X=X, y=y)

# Save the models

In [18]:
import joblib

In [26]:
# Persist both fitted pipelines (preprocessing included) to the working directory.
# The second call stays the last expression, so the cell still displays its result.
joblib.dump(value=xgb_pipeline, filename='xgb_pipeline.joblib')
joblib.dump(value=knn_pipeline, filename='knn_pipeline.joblib')

['knn_pipeline.joblib']