In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
from tqdm.notebook import tqdm
import logging
import time

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from bisect import bisect_left
from scipy.stats import rankdata
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import optuna
import warnings

# Lift pandas' display truncation for both columns and rows, then silence
# every warning for the rest of the session.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Shared experiment settings.
RANDOM_STATE = 5  # seed used for the fold shuffling below
REPEATS = 3       # not referenced by the cells visible here

# 10-fold stratified splitter, shuffled with the seed above.
cv_options = dict(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
sk = StratifiedKFold(**cv_options)

In [4]:
# Location of the training CSV, relative to the notebook's working directory.
# '/' is used as the separator because it resolves on both Windows and POSIX,
# whereas a backslash is only a path separator on Windows.
TRAIN_CSV_PATH = 'train.csv (1)/train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)
# test = pd.read_csv('../input/playground-series-s3e24/test.csv')

In [5]:
# Separate the label column from the predictors; 'id' is dropped from the
# feature frame together with the target.
TARGET = 'smoking'
y = train[TARGET]
X = train.drop(columns=['id', TARGET])

# X_test = test.drop('id', axis=1)

# Preprocessing

In [6]:
# Summarise the feature frame: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  159256 non-null  int64  
 1   height(cm)           159256 non-null  int64  
 2   weight(kg)           159256 non-null  int64  
 3   waist(cm)            159256 non-null  float64
 4   eyesight(left)       159256 non-null  float64
 5   eyesight(right)      159256 non-null  float64
 6   hearing(left)        159256 non-null  int64  
 7   hearing(right)       159256 non-null  int64  
 8   systolic             159256 non-null  int64  
 9   relaxation           159256 non-null  int64  
 10  fasting blood sugar  159256 non-null  int64  
 11  Cholesterol          159256 non-null  int64  
 12  triglyceride         159256 non-null  int64  
 13  HDL                  159256 non-null  int64  
 14  LDL                  159256 non-null  int64  
 15  hemoglobin       

In [7]:
# Columns treated as categorical (passed to CatBoost as `cat_features` in the
# cross-validation cell). The variable name is misspelled ("intial") but is
# kept as-is because that cell refers to it by this name.
intial_cat_features = [
    'dental caries',
    'hearing(left)',
    'hearing(right)',
    'Urine protein',
]

# Every remaining column, in the frame's original column order.
_cat_lookup = set(intial_cat_features)
num_features = list(filter(lambda name: name not in _cat_lookup, X.columns))

In [8]:
# for feat in intial_cat_features:
#     X[feat] = X[feat].astype(str)

In [9]:
# X.info()

In [10]:
# Show the (rows, columns) shape of the feature frame.
X.shape

(159256, 22)

# Cross-Validation

In [12]:
%%time

# Initialize lists for scores
hist_scores, lgbm_scores, xgb_scores, cat_scores = [], [], [], []
ada_scores, grad_scores, logreg_scores, ridge_scores = [], [], [], []
gnb_scores, lda_scores = [], []
     
for i, (train_idx, test_idx) in enumerate(sk.split(X, y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    print('----------------------------------------------------------')

    ##########################
    ## HistGradientBoosting ##
    ##########################

    start_time = time.time()
    hist = HistGradientBoostingClassifier(random_state=5).fit(X_train, y_train)
    
    hist_train_pred = hist.predict_proba(X_train)[:, 1]
    hist_pred = hist.predict_proba(X_test)[:, 1]
    hist_score = roc_auc_score(y_test, hist_pred)
    hist_scores.append(hist_score)
    
    print(f'Fold {i + 1} ==> HistGradientBoosting ROC-AUC score is ==> {hist_score}')
    print(f'{time.time() - start_time}')
          
    ##########
    ## LGBM ##
    ##########

    start_time = time.time()
    lgbm = LGBMClassifier(n_jobs=-1, random_state=5).fit(X_train, y_train)

    lgbm_train_pred = lgbm.predict_proba(X_train)[:, 1]
    lgbm_pred = lgbm.predict_proba(X_test)[:, 1]
    lgbm_score = roc_auc_score(y_test, lgbm_pred)
    lgbm_scores.append(lgbm_score)
    
    print(f'Fold {i + 1} ==> LGBM ROC-AUC score is ==> {lgbm_score}')
    print(f'{time.time() - start_time}')

    #############
    ## XGBoost ##
    #############

    start_time = time.time()
    xgb = XGBClassifier(seed=5).fit(X_train, y_train)

    xgb_train_pred = xgb.predict_proba(X_train)[:, 1]
    xgb_pred = xgb.predict_proba(X_test)[:, 1]
    xgb_score = roc_auc_score(y_test, xgb_pred)
    xgb_scores.append(xgb_score)
    
    print(f'Fold {i + 1} ==> XGBoost ROC-AUC score is ==> {xgb_score}')
    print(f'{time.time() - start_time}')

    ##############
    ## CatBoost ##
    ##############

    start_time = time.time()
    cat = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100, cat_features=intial_cat_features).fit(X_train, y_train)

    cat_train_pred = cat.predict_proba(X_train)[:, 1]
    cat_pred = cat.predict_proba(X_test)[:, 1]
    cat_score = roc_auc_score(y_test, cat_pred)
    cat_scores.append(cat_score)
    
    print(f'Fold {i + 1} ==> CatBoost ROC-AUC score is ==> {cat_score}')
    print(f'{time.time() - start_time}')

    ########################
    ## AdaBoostClassifier ##
    ########################
    
    start_time = time.time()
    ada = AdaBoostClassifier(random_state=5).fit(X_train, y_train)
    ada_pred = ada.predict_proba(X_test)[:, 1]
    ada_score = roc_auc_score(y_test, ada_pred)
    ada_scores.append(ada_score)

    print(f'Fold {i + 1} ==> AdaBoostClassifier ROC-AUC score is ==> {ada_score}')
    print(f'{time.time() - start_time}')
    
    ################################
    ## GradientBoostingClassifier ##
    ################################
    
    start_time = time.time()
    grad = GradientBoostingClassifier(random_state=5).fit(X_train, y_train)
    grad_pred = grad.predict_proba(X_test)[:, 1]
    grad_score = roc_auc_score(y_test, grad_pred)
    grad_scores.append(grad_score)

    print(f'Fold {i + 1} ==> GradientBoostingClassifier ROC-AUC score is ==> {grad_score}')
    print(f'{time.time() - start_time}')
    
    ########################
    ## LogisticRegression ##
    ########################
    
    start_time = time.time()
    logreg = LogisticRegression(random_state=5).fit(X_train, y_train)
    logreg_pred = logreg.predict_proba(X_test)[:, 1]
    logreg_score = roc_auc_score(y_test, logreg_pred)
    logreg_scores.append(logreg_score)

    print(f'Fold {i + 1} ==> LogisticRegression ROC-AUC score is ==> {logreg_score}')
    print(f'{time.time() - start_time}')
    
    #####################
    ## RidgeClassifier ##
    #####################
    
    start_time = time.time()
    ridge = RidgeClassifier(random_state=5).fit(X_train, y_train)
    ridge_pred = ridge.decision_function(X_test)
    ridge_score = roc_auc_score(y_test, ridge_pred)
    ridge_scores.append(ridge_score)

    print(f'Fold {i + 1} ==> RidgeClassifier ROC-AUC score is ==> {ridge_score}')
    print(f'{time.time() - start_time}')
    
    ################
    ## GaussianNB ##
    ################
    
    start_time = time.time()
    gnb = GaussianNB().fit(X_train, y_train)
    gnb_pred = gnb.predict_proba(X_test)[:, 1]
    gnb_score = roc_auc_score(y_test, gnb_pred)
    gnb_scores.append(gnb_score)

    print(f'Fold {i + 1} ==> GaussianNB ROC-AUC score is ==> {gnb_score}')
    print(f'{time.time() - start_time}')
    
    ################################
    ## LinearDiscriminantAnalysis ##
    ################################
    
    start_time = time.time()
    lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
    lda_pred = lda.predict_proba(X_test)[:, 1]
    lda_score = roc_auc_score(y_test, lda_pred)
    lda_scores.append(lda_score)

    print(f'Fold {i + 1} ==> LinearDiscriminantAnalysis ROC-AUC score is ==> {lda_score}')
    print(f'{time.time() - start_time}')

    print()
    
print('----------------------------------------------------------')
print(f'Average Hist Score is ==> {np.mean(hist_scores)}')
print(f'Average LGBM Score is ==> {np.mean(lgbm_scores)}')
print(f'Average XGB Score is ==> {np.mean(xgb_scores)}')
print(f'Average CAT Score is ==> {np.mean(cat_scores)}')
print(f'Average AdaBoost Score: {np.mean(ada_scores)}')
print(f'Average GradientBoosting Score: {np.mean(grad_scores)}')
print(f'Average Logistic Regression Score: {np.mean(logreg_scores)}')
print(f'Average Ridge Classifier Score: {np.mean(ridge_scores)}')
print(f'Average GaussianNB Score: {np.mean(gnb_scores)}')
print(f'Average LDA Score: {np.mean(lda_scores)}')

----------------------------------------------------------
Fold 1 ==> HistGradientBoosting ROC-AUC score is ==> 0.8604118601574998
3.4549896717071533
Fold 1 ==> LGBM ROC-AUC score is ==> 0.8600823579324062
1.5238573551177979
Fold 1 ==> XGBoost ROC-AUC score is ==> 0.8620344495329355
8.906520128250122
Fold 1 ==> CatBoost ROC-AUC score is ==> 0.865516997739018
115.4750566482544
Fold 1 ==> AdaBoostClassifier ROC-AUC score is ==> 0.8463182059672492
7.8600499629974365
Fold 1 ==> GradientBoostingClassifier ROC-AUC score is ==> 0.8540488991812272
31.642473936080933
Fold 1 ==> LogisticRegression ROC-AUC score is ==> 0.8127885027018991
1.1821153163909912
Fold 1 ==> RidgeClassifier ROC-AUC score is ==> 0.8254452910309256
0.11876678466796875
Fold 1 ==> GaussianNB ROC-AUC score is ==> 0.7862033159786309
0.10053730010986328
Fold 1 ==> LinearDiscriminantAnalysis ROC-AUC score is ==> 0.8254451468356507
0.48094940185546875

----------------------------------------------------------
Fold 2 ==> HistGrad