<a href="https://www.kaggle.com/code/ishutrivedi/lightgbm-with-hyper-parameter-tuning?scriptVersionId=136248991" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

*Here we will be using LightGBM with WiDS Datathon 2021 dataset and then trying hyper-parameter tuning to find the best suited model.*

**Installation**

In [1]:
pip install lightgbm

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


**Import the Libraries**

In [2]:
import numpy as np 
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from numpy import isnan
from sklearn.preprocessing import MinMaxScaler
import lightgbm
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Load the Data

In [3]:
df = pd.read_csv("/kaggle/input/widsdatathon2021/TrainingWiDS2021.csv")
print(df.shape)
test = pd.read_csv("/kaggle/input/widsdatathon2021/UnlabeledWiDS2021.csv")
print(test.shape)

(130157, 181)
(10234, 180)


# EDA Using Pandas Profiling

In [4]:
#trainprofile = ProfileReport(df)
#trainprofile

# Data Pre-Processing

**Handling Categorical Columns**

In [5]:
#Find categorical columns
cols = df.columns
num_cols = df._get_numeric_data().columns
list(set(cols) - set(num_cols))

#one hot encoding for categorical columns
df = pd.get_dummies(df, columns=["ethnicity","icu_type","gender","icu_admit_source", "icu_stay_type", "hospital_admit_source"])
test = pd.get_dummies(test, columns=["ethnicity","icu_type","gender","icu_admit_source", "icu_stay_type", "hospital_admit_source"])
test['hospital_admit_source_ICU'] = 0
test['hospital_admit_source_Other'] = 0
test['hospital_admit_source_PACU'] = 0
test['hospital_admit_source_Acute Care/Floor'] = 0
test['hospital_admit_source_Observation'] = 0
print(df.shape)
print(test.shape)
temp = test["encounter_id"]

(130157, 214)
(10234, 213)


**Handling Missing Values using Imputer**

In [6]:
## Using SimpleInputer with different strategies
#imputer = SimpleImputer(strategy='mean') #roc_auc: 0.824
#imputer = SimpleImputer(strategy='median') #roc_auc: 0.820
#imputer = SimpleImputer(strategy='most_frequent') #roc_auc: 0.827
imputer = SimpleImputer(strategy='constant') #roc_auc: 0.822

# fit on the dataset
transformed_values = imputer.fit_transform(df)
transformed_values_test = imputer.fit_transform(test)
cols = df.columns
test_cols = test.columns

# print total missing
#print('Missing: %d' % sum(isnan(transformed_values).flatten()))

# convert array into dataframe 
df = pd.DataFrame(transformed_values, columns = cols)
test = pd.DataFrame(transformed_values_test, columns = test_cols)
print(test.shape)

(10234, 213)


**Normalization**

In [7]:

#Normalization for training data set
scaler = MinMaxScaler()
scaler.fit(df)
normalized = scaler.transform(df)
normalized_df = pd.DataFrame(data = normalized, columns = cols)
normalized_df.shape

#Normalization for test data set
scaler = MinMaxScaler()
scaler.fit(test)
normalized = scaler.transform(test)
test = pd.DataFrame(data = normalized, columns = test_cols)
print(test.shape)


(10234, 213)


# Defining Training Data

In [8]:
'''
df_bkp = normalized_df
train_Y = normalized_df['diabetes_mellitus']
train_X = normalized_df.drop(columns=['diabetes_mellitus'])
'''

train_Y = df['diabetes_mellitus']
train_X = df.drop(columns=['diabetes_mellitus'])
test = test[train_X.columns]
train_x, validation_x, train_y, validation_y = train_test_split(train_X, train_Y, test_size=0.20)
train_x.shape, validation_x.shape, train_y.shape, validation_y.shape

((104125, 213), (26032, 213), (104125,), (26032,))

# LightGBM

In [9]:
'''
parameters = { #0.857115
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 30,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 2,
    'max_depth': 3
}
'''
params8576_it535 = {'learning_rate': 0.15037582874165212, 'boosting_type': 'dart', 'objective': 'binary', 'metric': 'auc', 'sub_feature': 0.8404385681314072, 'num_leaves': 68, 'min_data': 10, 'max_depth': 7, 'verbose': 2}
params8556_it148 = {'learning_rate': 0.39830917424911405, 'boosting_type': 'dart', 'objective': 'binary', 'metric': 'auc', 'sub_feature': 0.6831231853371216, 'num_leaves': 81, 'min_data': 87, 'max_depth': 5, 'verbose': 2}
params8573_it225 = {'learning_rate': 0.08042319639877527, 'boosting': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'sub_feature': 0.355412108762454, 'num_leaves': 100, 'min_data': 83, 'max_depth': -1, 'verbose': 2}
params_bayes_best1 = {'n_estimators': 1000, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'subsample': 0.8766726297701315, 'subsample_freq': 21, 'learning_rate': 0.08490981197299435, 'feature_fraction': 0.43381916030656353, 'max_depth': 36, 'lambda_l1': 19.92219967865967, 'lambda_l2': 5.770832463575476, 'scale_pos_weight': 6, 'min_data_in_leaf': 304}

params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.008,
        'subsample': 1,
        'colsample_bytree': 0.2,
        'reg_alpha': 3,
        'reg_lambda': 1,
        'scale_pos_weight': 5,
        'n_estimators': 5000,
        'verbose': 1,
        'max_depth': -1,
        'seed':100, 
        'force_col_wise': True
}
import re
train_X = train_X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
validation_x = validation_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
d_train = lightgbm.Dataset(train_X, label=train_Y)
test_data = lightgbm.Dataset(validation_x, label=validation_y)

#model_lgbm = lightgbm.train(parameters,
 #                      d_train,
  #                     valid_sets=test_data,
   #                    num_boost_round=900,
    #                   early_stopping_rounds=50)
#model_lgbm = lightgbm.train(params8576_it535, d_train, 535)
model_lgbm = lightgbm.train(params8573_it225, d_train, 250)
#model_lgbm = lightgbm.train(params8556_it148, d_train, 148)
print("LightGBM : On validation set, the ROC AUC score is ",roc_auc_score(validation_y, model_lgbm.predict(validation_x)))
'''
d_train = lightgbm.Dataset(train_x, label=train_y)
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 50
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count
    d_train = lightgbm.Dataset(train_x, label=train_y) #Load in data
    params = {} #initialize parameters
    params['learning_rate'] = np.random.uniform(0, 1)
    params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
    params['objective'] = 'binary'
    params['metric'] = 'auc'
    params['sub_feature'] = np.random.uniform(0, 1)
    params['num_leaves'] = np.random.randint(20, 150)
    params['min_data'] = np.random.randint(10, 100)
    params['max_depth'] = np.random.randint(5, 20)
    params['verbose'] = 2
    iterations = np.random.randint(10, 1000)
    print(params, iterations)
    model_lgbm = lightgbm.train(params, d_train, iterations)
    print("LightGBM : On validation set, the ROC AUC score is ",roc_auc_score(validation_y, model_lgbm.predict(validation_x)))
'''

[LightGBM] [Info] Number of positive: 28151, number of negative: 102006
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.808104
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.446536
[LightGBM] [Debug] init for col-wise cost 0.061073 seconds, init for row-wise cost 0.164818 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 26483
[LightGBM] [Info] Number of data points in the train set: 130157, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216285 -> initscore=-1.287449
[LightGBM] [Info] Start training from score -1.287449
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 13
[LightGBM] [Debug] Trained

'\nd_train = lightgbm.Dataset(train_x, label=train_y)\ncount = 0 #Used for keeping track of the iteration number\n#How many runs to perform using randomly selected hyperparameters\niterations = 50\nfor i in range(iterations):\n    print(\'iteration number\', count)\n    count += 1 #increment count\n    d_train = lightgbm.Dataset(train_x, label=train_y) #Load in data\n    params = {} #initialize parameters\n    params[\'learning_rate\'] = np.random.uniform(0, 1)\n    params[\'boosting_type\'] = np.random.choice([\'gbdt\', \'dart\', \'goss\'])\n    params[\'objective\'] = \'binary\'\n    params[\'metric\'] = \'auc\'\n    params[\'sub_feature\'] = np.random.uniform(0, 1)\n    params[\'num_leaves\'] = np.random.randint(20, 150)\n    params[\'min_data\'] = np.random.randint(10, 100)\n    params[\'max_depth\'] = np.random.randint(5, 20)\n    params[\'verbose\'] = 2\n    iterations = np.random.randint(10, 1000)\n    print(params, iterations)\n    model_lgbm = lightgbm.train(params, d_train, i

# LGBMClassifier

In [10]:
'''
clf = lightgbm.LGBMClassifier(params8573_it225)
clf.fit(train_X, train_Y,
        eval_set=[(train_X, train_Y), (validation_x, validation_y)],
        verbose=100,
        early_stopping_rounds=None)
print("LightGBM : On validation set, the ROC AUC score is ",roc_auc_score(validation_y, clf.predict(validation_x)))
'''

'\nclf = lightgbm.LGBMClassifier(params8573_it225)\nclf.fit(train_X, train_Y,\n        eval_set=[(train_X, train_Y), (validation_x, validation_y)],\n        verbose=100,\n        early_stopping_rounds=None)\nprint("LightGBM : On validation set, the ROC AUC score is ",roc_auc_score(validation_y, clf.predict(validation_x)))\n'

# Pseudo Labelling

*Tried this approach but after making a submission post this, it was found that the model was overfitting. On training dataset roc_auc_score is 0.96 but submission score comes to ~0.79*

In [11]:
pseudo_labels = model_lgbm.predict(test)
augmented_test = test.copy(deep=True)
augmented_test["diabetes_mellitus"] = pseudo_labels.astype(int)
augmented_test.shape
train_new = df.append(augmented_test, ignore_index=True)
train_new.shape
train_newY = train_new["diabetes_mellitus"]
train_newX = train_new.drop(columns=['diabetes_mellitus'])
train_newX.shape, train_newY.shape
train_x, validation_x, train_y, validation_y = train_test_split(train_newX, train_newY,test_size=0.20)
train_x.shape, validation_x.shape, train_y.shape, validation_y.shape
train_newX = train_newX.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
validation_x = validation_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
d_train = lightgbm.Dataset(train_newX, label=train_newY)
test_data = lightgbm.Dataset(validation_x, label=validation_y)
model_lgbm = lightgbm.train(params8573_it225, d_train, 250)
#validation_y=validation_y.round(0)
#validation_y=validation_y.astype(int)
print("LightGBM : On validation set, the ROC AUC score is ",roc_auc_score(validation_y, model_lgbm.predict(validation_x)))

[LightGBM] [Info] Number of positive: 28151, number of negative: 112240
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.810506
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.442559
[LightGBM] [Debug] init for col-wise cost 0.062020 seconds, init for row-wise cost 0.179918 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 31851
[LightGBM] [Info] Number of data points in the train set: 140391, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.200519 -> initscore=-1.383057
[LightGBM] [Info] Start training from score -1.383057
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 100 and max_depth = 13
[LightGBM] [Debug] Trained

# Submission

In [12]:
test["diabetes_mellitus"] = model_lgbm.predict(test)

test["encounter_id"] = temp
test[["encounter_id","diabetes_mellitus"]].to_csv("submission.csv",index=False)