In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


<h1><b><center>Imports and configurations

In [2]:
# import basic and foundational libraries and modules
import os
import random
import numpy as np
import pandas as pd

Import scikit-learn modules and libraries required for the project

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

Setting Seed for re-produceability so that random values are not totally upside down on next executions

In [4]:
SEED = 42 # we could set up anything, just picking up a popular number when setting up the seed, but this doesn't have to be 42, it can be setup anything

# define function to set seed from numpy and python default
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)

<h1><b><center>Load the Training and the Test dataset

I am loading this data directly from Kaggle itself. It could be downloaded and worked locally to if planned such way

In [5]:
train = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e12/test.csv")

Quick EDA

In [6]:
# Data shape (rows and columns) of training and test data
print(f"There are {train.shape[0]} number of rows, and {train.shape[1]} columns in the training dataset.")
print(f"There are {test.shape[0]} number of rows, and {test.shape[1]} columns in the test dataset.")

There are 700000 number of rows, and 26 columns in the training dataset.
There are 300000 number of rows, and 25 columns in the test dataset.


In [7]:
# display complete values of rows and columns and the cell
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [8]:
# print the column names
train.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [9]:
# check the quick statistics

# check missing values on the train set
train.isnull().sum()

id                                    0
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0


In [10]:
# check the missing values on the test set
test.isnull().sum()

id                                    0
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0


In [11]:
# check data information
round(train.describe(), 2)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,349999.5,50.36,2.07,80.23,5.96,7.0,6.01,25.87,0.86,116.29,75.44,70.17,186.82,53.82,102.91,123.08,0.15,0.18,0.03,0.62
std,202072.74,11.66,1.05,51.2,1.46,0.9,2.02,2.86,0.04,11.01,6.83,6.94,16.73,8.27,19.02,24.74,0.36,0.39,0.17,0.48
min,0.0,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.68,91.0,51.0,42.0,117.0,21.0,51.0,31.0,0.0,0.0,0.0,0.0
25%,174999.75,42.0,1.0,49.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,175.0,48.0,89.0,106.0,0.0,0.0,0.0,0.0
50%,349999.5,50.0,2.0,71.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0,1.0
75%,524999.25,58.0,3.0,96.0,7.0,7.6,7.4,27.8,0.88,124.0,80.0,75.0,199.0,59.0,116.0,139.0,0.0,0.0,0.0,1.0
max,699999.0,89.0,9.0,747.0,9.9,9.9,16.5,38.4,1.05,163.0,104.0,101.0,289.0,90.0,205.0,290.0,1.0,1.0,1.0,1.0


In [12]:
# check data information of the categorical train data
round(train.describe(include='object'))

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
count,700000,700000,700000,700000,700000,700000
unique,3,5,4,5,3,4
top,Female,White,Highschool,Middle,Never,Employed
freq,363237,386153,344145,290557,494448,516170


<h1><b><center>Create Training Features (X) and target (y)

In [13]:
# identify the target col and ID column to be excluded since it is the unique identifier
target_col = "diagnosed_diabetes"
id_col = "id"

# training features
features = [c for c in train.columns if c not in [target_col, id_col]]
X = train[features]
y = train[target_col]
X_test = test[features]

Basic EDA on target column

In [14]:
train[target_col].value_counts(normalize=True)

diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64

In [15]:
# count of different column data types
train[features].dtypes.value_counts()

int64      13
object      6
float64     5
Name: count, dtype: int64

Checking if there is and leakage like columns

In [16]:
train[features].nunique().sort_values().head(20)

cardiovascular_history            2
family_history_diabetes           2
hypertension_history              2
smoking_status                    3
gender                            3
employment_status                 4
education_level                   4
income_level                      5
ethnicity                         5
alcohol_consumption_per_week      9
waist_to_hip_ratio               36
diastolic_bp                     54
heart_rate                       60
sleep_hours_per_day              69
hdl_cholesterol                  69
age                              71
systolic_bp                      71
diet_score                       99
ldl_cholesterol                 151
screen_time_hours_per_day       151
dtype: int64

In [17]:
train[features].nunique().sort_values(ascending=False).head(20)

physical_activity_minutes_per_week    565
bmi                                   231
triglycerides                         228
cholesterol_total                     154
screen_time_hours_per_day             151
ldl_cholesterol                       151
diet_score                             99
age                                    71
systolic_bp                            71
hdl_cholesterol                        69
sleep_hours_per_day                    69
heart_rate                             60
diastolic_bp                           54
waist_to_hip_ratio                     36
alcohol_consumption_per_week            9
ethnicity                               5
income_level                            5
education_level                         4
employment_status                       4
gender                                  3
dtype: int64

<h3><b>Split Categorical and Numerical Columns

Sometimes in the numerical columns, values may be categorical values, just represented as numerical so it is important that we make them categorical

In [35]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [36]:
num_cols

['age',
 'alcohol_consumption_per_week',
 'physical_activity_minutes_per_week',
 'diet_score',
 'sleep_hours_per_day',
 'screen_time_hours_per_day',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history']

In [37]:
cat_cols

['gender',
 'ethnicity',
 'education_level',
 'income_level',
 'smoking_status',
 'employment_status']

In [38]:
for col in cat_cols:
    X[col] = X[col].astype("category")
    X_test[col] = X_test[col].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

<h1><b><center>Baseline Model: Logistic Regression with Proprocessing in Pipeline</center></b></h1>

This gives us a strong, interpretable reference and checks your CV vs public LB alignment

In [39]:
# Numerical columns
numeric_transformer = Pipeline(steps=[
    ("imputer",  SimpleImputer(strategy="median")),
    ("scaler",   StandardScaler())
])

In [40]:
# categorical columns
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("oh", OneHotEncoder(handle_unknown='ignore'))
])

In [41]:
# Apply Preprocess
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols), 
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [42]:
# Logistic Regression model
log_reg = LogisticRegression(
    max_iter = 1000, 
    n_jobs=-1, 
    class_weight="balanced"
)

In [43]:
log_reg_pipeline = Pipeline(steps=[
    ("preprocess", preprocess), 
    ("model", log_reg)
])

<h3><b>Stratified K-fold Cross-Validation (with ROC-AUC)

In [26]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

oof_preds_lr = np.zeros(len(train))
test_preds_lr = np.zeros(len(test))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold} / {n_splits}")
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    log_reg_pipeline.fit(X_tr, y_tr)
    val_pred = log_reg_pipeline.predict_proba(X_val)[:, 1]
    oof_preds_lr[valid_idx], val_pred

    test_pred = log_reg_pipeline.predict_proba(X_test)[:, 1]
    test_preds_lr += test_pred / n_splits

    fold_auc = roc_auc_score(y_val, val_pred)
    print(f"AUC: {fold_auc:.5f}")

cv_auc_lr = roc_auc_score(y, oof_preds_lr)
print(f"\nOverall CV AUC (Logistic Regression): {cv_auc_lr:.5f}")

Fold 1 / 5
AUC: 0.69528
Fold 2 / 5
AUC: 0.69373
Fold 3 / 5
AUC: 0.69344
Fold 4 / 5
AUC: 0.69481
Fold 5 / 5
AUC: 0.69529

Overall CV AUC (Logistic Regression): 0.50000


<h3>Stronger model: LightGBM with k-fold

Focusing on Tree based model

In [44]:
oof_preds_lgb = np.zeros(len(train))
test_preds_lgb = np.zeros(len(test))

lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.02,
    "num_leaves": 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 30,
    "lambda_l1": 0.0,
    "lambda_l2": 0.0,
    "verbose": -1,
    "seed": SEED,
    "feature_pre_filter": False,
}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}/{n_splits}")
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # LGBM can handle numeric vs encoded cats; if you have real cat cols, you can pass them as categorical_feature
    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, free_raw_data=False)

    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=100),
        ],
    )

    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds_lgb[valid_idx] = val_pred

    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds_lgb += test_pred / n_splits

    fold_auc = roc_auc_score(y_val, val_pred)
    print(f"  Fold {fold} AUC: {fold_auc:.5f}")

cv_auc_lgb = roc_auc_score(y, oof_preds_lgb)
print(f"\nOverall CV AUC (LightGBM): {cv_auc_lgb:.5f}")

Fold 1/5
Training until validation scores don't improve for 200 rounds
[100]	train's auc: 0.708188	valid's auc: 0.707351
[200]	train's auc: 0.717715	valid's auc: 0.716709
[300]	train's auc: 0.722349	valid's auc: 0.720713
[400]	train's auc: 0.725212	valid's auc: 0.722747
[500]	train's auc: 0.727403	valid's auc: 0.723873
[600]	train's auc: 0.729191	valid's auc: 0.724621
[700]	train's auc: 0.730816	valid's auc: 0.725262
[800]	train's auc: 0.732237	valid's auc: 0.72561
[900]	train's auc: 0.73366	valid's auc: 0.726069
[1000]	train's auc: 0.735023	valid's auc: 0.72638
[1100]	train's auc: 0.736254	valid's auc: 0.726603
[1200]	train's auc: 0.737504	valid's auc: 0.726853
[1300]	train's auc: 0.738647	valid's auc: 0.727031
[1400]	train's auc: 0.739806	valid's auc: 0.727206
[1500]	train's auc: 0.740938	valid's auc: 0.727371
[1600]	train's auc: 0.74206	valid's auc: 0.727473
[1700]	train's auc: 0.743211	valid's auc: 0.727611
[1800]	train's auc: 0.744342	valid's auc: 0.727745
[1900]	train's auc: 0.74

<h3>Ensembling the models

In [45]:
oof_blend = 0.3 * oof_preds_lr + 0.7 * oof_preds_lgb
test_blend = 0.3 * test_preds_lr + 0.7 * test_preds_lgb

cv_auc_blend = roc_auc_score(y, oof_blend)
print(f"Blended CV AUC: {cv_auc_blend:.5f}")

Blended CV AUC: 0.72816


<h1><b>Create Submission File

In [46]:
sample_sub = pd.read_csv("/kaggle/input/playground-series-s5e12/sample_submission.csv")
sample_sub[target_col] = test_blend
sample_sub.to_csv("submission.csv", index=False)

In [3]:
import os

In [4]:
os.listdir("/kaggle/working")

['.virtual_documents']