In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import dalex as dx
from IPython.display import Image

from utils_features_selection import data_preprocess, col_miss  # unchanged functions

In [2]:
# simplified data_read_and_split() from utils_features_selection.py
def data_read():
    """Load the preprocessed data and return only the modelling columns.

    Columns whose missing share (``missing_count`` from ``col_miss`` divided by
    the number of rows) is above 0.2 are dropped, remaining gaps are filled
    with -1, and the selected columns are renamed to English.

    Returns
    -------
    The filled table restricted to the columns
    'LDH', 'Lymphocytes(%)', 'hs-CRP', 'Age' and 'Target', in that order.
    """
    # --- copied from data_read_and_split() ---
    frame_unna, _ = data_preprocess()  # second return value is not needed here
    miss_stats = col_miss(frame_unna)
    miss_stats['Missing_part'] = miss_stats['missing_count'] / len(frame_unna)
    keep_mask = miss_stats['Missing_part'] <= 0.2
    kept_cols = miss_stats.loc[keep_mask, 'col']
    filled = frame_unna[kept_cols].copy().fillna(-1)
    # --- end of copied part ---

    # simplified name_dict from features_selection() in Main_of_features_selection.py,
    # extended with Age and Target
    translations = {
        '乳酸脱氢酶': 'LDH',
        '淋巴细胞(%)': 'Lymphocytes(%)',
        '超敏C反应蛋白': 'hs-CRP',
        '年龄': 'Age',
        'Type2': 'Target',
    }
    renamed = filled.rename(columns=translations)

    # keep only the translated columns, in the order they are listed above
    return renamed.loc[:, list(translations.values())]

In [3]:
def data_exploration():
    """Draw a pair plot of the modelling columns, coloured by Target, and save it.

    The data come from ``data_read()``. The figure is written to
    'data_exploration.png' in the current working directory and then closed;
    nothing is returned.
    """
    data = data_read()
    # Change Target [0.0, 1.0] to [0, 1]
    data['Target'] = data['Target'].astype(int)
    # NOTE: rows with missing measurements are NOT removed here. data_read()
    # fills missing values with -1, so those rows appear at -1 in the plot.
    colors = ["#4378bf", "#f05a71"]
    # Pass the palette to this plot only, instead of sns.set_palette(), so the
    # global seaborn/matplotlib colour cycle is left untouched for later cells.
    g = sns.pairplot(data, hue='Target', corner=True, palette=colors)
    g._legend.set_bbox_to_anchor((0.75, 0.75))
    g.fig.set_size_inches(10.5, 10)
    # Save and close this specific figure rather than relying on pyplot's
    # implicit "current figure".
    g.fig.savefig('data_exploration.png', bbox_inches='tight', dpi=100)
    plt.close(g.fig)

In [4]:
# Generate the pair plot, then display the PNG that data_exploration() wrote to disk.
data_exploration()
Image(url= "data_exploration.png")

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


In [5]:
def ale():
    """Fit an XGBoost classifier on all rows and save its ALE plot to 'ale.png'.

    The model is trained on the full output of ``data_read()``; the accumulated
    profile is computed with dalex and the resulting figure is written to disk.
    Nothing is returned.
    """
    # no train/test split: the model is only explained here, not evaluated
    frame = data_read()
    y = frame.Target
    X = frame.drop('Target', axis=1)

    xgb_params = dict(
        max_depth=4,
        learning_rate=0.2,
        reg_lambda=1,
        n_estimators=150,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=0,
    )
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X, y)

    # build the explanation (accumulated profiles, not centered)
    explainer = dx.Explainer(model, X, y, label="Xgboost COVID19", verbose=False)
    ale_profile = explainer.model_profile(
        type='accumulated',
        N=None,
        center=False,
        random_state=0,
        verbose=False,
    )

    # render without showing, adjust the layout, and write the image
    fig = ale_profile.plot(
        title="Accumulated Local Effects",
        y_title="ALE of Target",
        vertical_spacing=0.08,
        show=False,
    )
    fig.update_layout(height=1000, width=1050, margin=dict(t=55, b=10, r=15, l=70))
    fig.write_image("ale.png")

In [6]:
# Fit the model, write the ALE figure to ale.png, then display that file.
ale()
Image(url= "ale.png")

In [7]:
def benchmark():
    """Run LazyClassifier on a 70/30 train/test split and return its score table.

    Returns the first object returned by ``LazyClassifier.fit``; the second
    one is discarded.
    """
    # imported locally, as in the original cell
    from lazypredict.Supervised import LazyClassifier
    from sklearn.model_selection import train_test_split

    frame = data_read()
    features = frame.drop('Target', axis=1)
    target = frame.Target

    # fixed seed so the split is reproducible
    split = train_test_split(features, target, test_size=.3, random_state=0)
    X_train, X_test, y_train, y_test = split

    lazy = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    scores, _ = lazy.fit(X_train, X_test, y_train, y_test)
    return scores

In [8]:
benchmark()  # easy task: many classifiers in the table below score close to 1.0

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 28.93it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,1.0,1.0,1.0,1.0,0.03
XGBClassifier,1.0,1.0,1.0,1.0,0.03
RandomForestClassifier,1.0,1.0,1.0,1.0,0.16
ExtraTreesClassifier,0.99,0.99,0.99,0.99,0.15
SVC,0.99,0.99,0.99,0.99,0.01
KNeighborsClassifier,0.99,0.99,0.99,0.99,0.03
GaussianNB,0.99,0.99,0.99,0.99,0.02
AdaBoostClassifier,0.99,0.99,0.99,0.99,0.09
QuadraticDiscriminantAnalysis,0.98,0.98,0.98,0.98,0.02
NuSVC,0.97,0.97,0.97,0.97,0.02


In [9]:
def save_explainer():
    """Fit the XGBoost model on all rows and dump a dalex Explainer to disk.

    The explainer is written to 'xgboost-explainer.pkl' in the current working
    directory. Nothing is returned.
    """
    # make a model (no need for splitting since we don't measure performance)
    data = data_read()
    X, y = data.drop('Target', axis=1), data.Target
    model = xgb.XGBClassifier(
        max_depth=4,
        learning_rate=0.2,
        reg_lambda=1,
        n_estimators=150,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=0,
    )
    model.fit(X, y)
    # save an Explainer
    explainer = dx.Explainer(model, X, y, label="Xgboost COVID19", verbose=False)
    # Use a context manager so the file is flushed and closed even if dump()
    # raises; the original left the handle open.
    with open('xgboost-explainer.pkl', 'wb') as fh:
        explainer.dump(fh)

In [10]:
save_explainer()  # writes xgboost-explainer.pkl; the modelStudio dashboard is then made in modelStudio-dashboard.R

  -> Residual function is local, thus has to be dropped.


In [11]:
Image(url= "model_explanation.png")  # screenshot of the modelStudio output; no cell shown here creates this file