In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


<h1><b><center>Imports and configurations</center></b></h1>

In [2]:
# import basic and foundational libraries and modules
import os
import random
import numpy as np
import pandas as pd

Import the scikit-learn modules and the LightGBM library required for the project

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

Set a seed for reproducibility, so that randomly generated values stay the same across executions

In [4]:
# Fixed seed used for reproducibility. 42 is only a popular convention;
# any other fixed value could be used instead.
SEED = 42


def set_seed(seed=SEED):
    """Seed Python's built-in ``random`` module and NumPy's global RNG.

    Parameters
    ----------
    seed : optional
        Value passed to both ``random.seed`` and ``np.random.seed``.
        Defaults to the notebook-wide ``SEED``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

<h1><b><center>Load the Training and the Test dataset</center></b></h1>

I am loading this data directly from Kaggle itself. It could also be downloaded and worked on locally, if preferred.

In [5]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e12/train.csv",
        "/kaggle/input/playground-series-s5e12/test.csv",
    )
)

Quick EDA

In [6]:
# Report the number of rows and columns in the training and test frames
shape_report = [
    f"There are {train.shape[0]} number of rows, and {train.shape[1]} columns in the training dataset.",
    f"There are {test.shape[0]} number of rows, and {test.shape[1]} columns in the test dataset.",
]
print("\n".join(shape_report))

There are 700000 number of rows, and 26 columns in the training dataset.
There are 300000 number of rows, and 25 columns in the test dataset.


In [7]:
# Lift pandas' display truncation: a value of None removes the cap on
# shown rows, shown columns and the width of each cell's contents
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [8]:
# Show the column labels of the training frame (displayed as the cell's output)
train.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [9]:
# Count the missing values in each column of the training set
train.isna().sum()

id                                    0
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0


In [10]:
# Count the missing values in each column of the test set
test.isna().sum()

id                                    0
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0


In [11]:
# Summary statistics of the training data, rounded to two decimal places
train.describe().round(2)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,349999.5,50.36,2.07,80.23,5.96,7.0,6.01,25.87,0.86,116.29,75.44,70.17,186.82,53.82,102.91,123.08,0.15,0.18,0.03,0.62
std,202072.74,11.66,1.05,51.2,1.46,0.9,2.02,2.86,0.04,11.01,6.83,6.94,16.73,8.27,19.02,24.74,0.36,0.39,0.17,0.48
min,0.0,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.68,91.0,51.0,42.0,117.0,21.0,51.0,31.0,0.0,0.0,0.0,0.0
25%,174999.75,42.0,1.0,49.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,175.0,48.0,89.0,106.0,0.0,0.0,0.0,0.0
50%,349999.5,50.0,2.0,71.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0,1.0
75%,524999.25,58.0,3.0,96.0,7.0,7.6,7.4,27.8,0.88,124.0,80.0,75.0,199.0,59.0,116.0,139.0,0.0,0.0,0.0,1.0
max,699999.0,89.0,9.0,747.0,9.9,9.9,16.5,38.4,1.05,163.0,104.0,101.0,289.0,90.0,205.0,290.0,1.0,1.0,1.0,1.0


In [12]:
# Summary of the object-dtype (categorical) training columns: count, number of
# unique values, most frequent value and its frequency.
# The table holds no float values, so no rounding is applied to it.
train.describe(include='object')

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
count,700000,700000,700000,700000,700000,700000
unique,3,5,4,5,3,4
top,Female,White,Highschool,Middle,Never,Employed
freq,363237,386153,344145,290557,494448,516170


<h1><b><center>Create Training Features (X) and target (y)</center></b></h1>

In [13]:
# Name the label column and the unique-identifier column; neither is used as a feature
target_col = "diagnosed_diabetes"
id_col = "id"

# Every other training column becomes a feature, kept in its original order
non_feature_cols = {target_col, id_col}
features = [col for col in train.columns if col not in non_feature_cols]

# Training feature matrix and target, plus the test frame restricted to the same features
X, y = train[features], train[target_col]
X_test = test[features]