## A) Install library

In [3]:
#!pip install pycaret

In [4]:
#!pip install pandas-profiling

In [1]:
import pandas as pd
from pycaret.classification import * 

## B) Load dataset and analyze

In [2]:
data = pd.read_csv('Dataset.csv')

In [11]:
data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 186 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   encounter_id                   91713 non-null  int64  
 1   patient_id                     91713 non-null  int64  
 2   hospital_id                    91713 non-null  int64  
 3   hospital_death                 91713 non-null  int64  
 4   age                            87485 non-null  float64
 5   bmi                            88284 non-null  float64
 6   elective_surgery               91713 non-null  int64  
 7   ethnicity                      90318 non-null  object 
 8   gender                         91688 non-null  object 
 9   height                         90379 non-null  float64
 10  hospital_admit_source          70304 non-null  object 
 11  icu_admit_source               91601 non-null  object 
 12  icu_id                         91713 non-null

## C) Setting up environment (pycaret)

In [3]:
cat_features = ['ethnicity','gender','hospital_admit_source','icu_admit_source','icu_stay_type','icu_type',
                'apache_3j_bodysystem','apache_2_bodysystem'] 

In [5]:
clf = setup(data, target='hospital_death', categorical_features=cat_features,
            preprocess = True, normalize = True, profile = True,
            pca = True, pca_components = 0.95,
            feature_selection = True, feature_selection_threshold = 0.75,
            session_id = 123, n_jobs = -1, use_gpu = True, silent = True,
            log_experiment = True)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

PicklingError: ignored



## E) Compare Models

In [8]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [7]:
from datetime import datetime
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\nTime taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 3)))

In [9]:
%%time 
start_time = timer(None)
moedlss = compare_models(include = ['lr','rf','ada','gbc','lightgbm'] )
timer(start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9264,0.8737,0.2764,0.6885,0.3938,0.3622,0.4057,5.303
gbc,Gradient Boosting Classifier,0.9261,0.8643,0.2367,0.7269,0.3564,0.3279,0.3873,209.196
lr,Logistic Regression,0.9257,0.8713,0.2741,0.6756,0.3896,0.3575,0.3994,0.796
rf,Random Forest Classifier,0.9237,0.8395,0.1621,0.7913,0.2687,0.2466,0.3363,91.164
ada,Ada Boost Classifier,0.9203,0.8464,0.2682,0.59,0.3682,0.332,0.3619,40.652



Time taken: 0 hours 57 minutes and 58.644 seconds.
CPU times: user 44min 16s, sys: 11.6 s, total: 44min 28s
Wall time: 57min 58s


In [15]:
# All training functions (create_model, tune_model, ensemble_model, etc.) in PyCaret displays a score grid 
# but it doesn’t return the score grid. Hence you cannot store the score grid in an object like pandas.DataFrame. 
# However, there is a function called pull that allows you to do that
daa = pull()
print(type(daa))
daa

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9264,0.8737,0.2764,0.6885,0.3938,0.3622,0.4057,5.303
gbc,Gradient Boosting Classifier,0.9261,0.8643,0.2367,0.7269,0.3564,0.3279,0.3873,209.196
lr,Logistic Regression,0.9257,0.8713,0.2741,0.6756,0.3896,0.3575,0.3994,0.796
rf,Random Forest Classifier,0.9237,0.8395,0.1621,0.7913,0.2687,0.2466,0.3363,91.164
ada,Ada Boost Classifier,0.9203,0.8464,0.2682,0.59,0.3682,0.332,0.3619,40.652


## F) Create Model

In [16]:
gbc = create_model('lightgbm')
gbc_results = pull()
gbc_results

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9246,0.8648,0.2392,0.6856,0.3547,0.3244,0.3759
1,0.9265,0.8841,0.2734,0.6909,0.3918,0.3603,0.4047
2,0.9283,0.8705,0.2788,0.7243,0.4026,0.3724,0.421
3,0.926,0.8727,0.2932,0.6653,0.407,0.3738,0.4098
4,0.9234,0.8728,0.2572,0.6441,0.3676,0.3347,0.3752
5,0.9263,0.8645,0.2729,0.6909,0.3912,0.3598,0.4043
6,0.9285,0.8718,0.2837,0.7248,0.4077,0.3774,0.4249
7,0.9279,0.8827,0.3268,0.6741,0.4401,0.4065,0.4372
8,0.9266,0.8721,0.2603,0.7108,0.3811,0.3509,0.4016
9,0.9258,0.8812,0.2788,0.6739,0.3944,0.3621,0.4025


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9246,0.8648,0.2392,0.6856,0.3547,0.3244,0.3759
1,0.9265,0.8841,0.2734,0.6909,0.3918,0.3603,0.4047
2,0.9283,0.8705,0.2788,0.7243,0.4026,0.3724,0.421
3,0.926,0.8727,0.2932,0.6653,0.407,0.3738,0.4098
4,0.9234,0.8728,0.2572,0.6441,0.3676,0.3347,0.3752
5,0.9263,0.8645,0.2729,0.6909,0.3912,0.3598,0.4043
6,0.9285,0.8718,0.2837,0.7248,0.4077,0.3774,0.4249
7,0.9279,0.8827,0.3268,0.6741,0.4401,0.4065,0.4372
8,0.9266,0.8721,0.2603,0.7108,0.3811,0.3509,0.4016
9,0.9258,0.8812,0.2788,0.6739,0.3944,0.3621,0.4025


## G) Tune Model

In [17]:
tuned_gbc = tune_model(gbc)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9268,0.8576,0.2608,0.7108,0.3816,0.3514,0.402
1,0.9252,0.8828,0.2644,0.6743,0.3798,0.348,0.3918
2,0.9294,0.8752,0.3004,0.7229,0.4244,0.3936,0.4371
3,0.9271,0.8799,0.2986,0.6803,0.415,0.3824,0.4196
4,0.9254,0.874,0.2734,0.6696,0.3883,0.3559,0.3969
5,0.9259,0.8635,0.2711,0.6833,0.3882,0.3565,0.4001
6,0.9279,0.8734,0.2873,0.708,0.4087,0.3775,0.4216
7,0.9277,0.8861,0.3321,0.6679,0.4436,0.4096,0.4384
8,0.9276,0.8691,0.2801,0.7091,0.4015,0.3706,0.4165
9,0.9263,0.8789,0.2734,0.6878,0.3912,0.3597,0.4036


In [18]:
print(tuned_gbc)

LGBMClassifier(bagging_fraction=0.6, bagging_freq=2, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=41, min_child_weight=0.001, min_split_gain=0.9,
               n_estimators=260, n_jobs=-1, num_leaves=70, objective=None,
               random_state=123, reg_alpha=2, reg_lambda=3, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


## H) Plot Model

In [26]:
from pycaret.utils import enable_colab
from pycaret.utils import version
enable_colab()
print(version())
plot_model(tuned_gbc)
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn-dark')
import numpy as np

In [27]:
plot_model(tuned_gbc, 'confusion_matrix')

In [28]:
plot_model(tuned_gbc, 'feature')

In [29]:
plot_model(tuned_gbc, 'parameter')

Unnamed: 0,Parameters
boosting_type,gbdt
class_weight,
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1
min_child_samples,41
min_child_weight,0.001
min_split_gain,0.9
n_estimators,260


## I) Evaluate Model

In [30]:
%%time 
start_time = timer(None)
evaluate_model(tuned_gbc)
timer(start_time)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…


Time taken: 0 hours 0 minutes and 0.688 seconds.
CPU times: user 592 ms, sys: 7.72 ms, total: 600 ms
Wall time: 689 ms


## J) Interpret Model

In [33]:
# !pip install shap

Collecting shap
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[?25l[K     |▋                               | 10 kB 13.6 MB/s eta 0:00:01[K     |█▏                              | 20 kB 16.5 MB/s eta 0:00:01[K     |█▊                              | 30 kB 12.7 MB/s eta 0:00:01[K     |██▎                             | 40 kB 10.4 MB/s eta 0:00:01[K     |███                             | 51 kB 6.6 MB/s eta 0:00:01[K     |███▌                            | 61 kB 6.6 MB/s eta 0:00:01[K     |████                            | 71 kB 5.5 MB/s eta 0:00:01[K     |████▋                           | 81 kB 6.2 MB/s eta 0:00:01[K     |█████▏                          | 92 kB 5.9 MB/s eta 0:00:01[K     |█████▉                          | 102 kB 5.8 MB/s eta 0:00:01[K     |██████▍                         | 112 kB 5.8 MB/s eta 0:00:01[K     |███████                         | 122 kB 5.8 MB/s eta 0:00:01[K     |███████▌                        | 133 kB 5.8 MB/s 

In [34]:
import shap
interpret_model(tuned_gbc)

## K) Optimize Threshold

In [None]:
# %%time 
# start_time = timer(None)
# optimize_threshold(tuned_gbc, true_negative = 10, false_negative = -100)
# timer(start_time)

## L) Predict Model

In [35]:
%%time 
start_time = timer(None)
lr_predictions_holdout = predict_model(tuned_gbc)
timer(start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9253,0.8815,0.2833,0.6435,0.3934,0.3599,0.3947



Time taken: 0 hours 0 minutes and 2.037 seconds.
CPU times: user 3.59 s, sys: 11 ms, total: 3.61 s
Wall time: 2.04 s


## M) Finalize Model

In [36]:
finalize_model(tuned_gbc)



LGBMClassifier(bagging_fraction=0.6, bagging_freq=2, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=41, min_child_weight=0.001, min_split_gain=0.9,
               n_estimators=260, n_jobs=-1, num_leaves=70, objective=None,
               random_state=123, reg_alpha=2, reg_lambda=3, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## N) Save Model

In [37]:
#save trained model
save_model(tuned_gbc, 'tuned_gbc_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['ethnicity',
                                                             'gender',
                                                             'hospital_admit_source',
                                                             'icu_admit_source',
                                                             'icu_stay_type',
                                                             'icu_type',
                                                             'apache_3j_bodysystem',
                                                             'apache_2_bodysystem'],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='hospi

In [38]:
save_model(tuned_gbc, 'tuned_gbc_model.joblib')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['ethnicity',
                                                             'gender',
                                                             'hospital_admit_source',
                                                             'icu_admit_source',
                                                             'icu_stay_type',
                                                             'icu_type',
                                                             'apache_3j_bodysystem',
                                                             'apache_2_bodysystem'],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='hospi