In [1]:
# Imports
import numpy as np
import pandas as pd
from utils import DataLoader
from interpret.glassbox import LogisticRegression, ClassificationTree, ExplainableBoostingClassifier
from interpret import show
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Instantiate the project DataLoader, then run its load and preprocess steps.
data_loader = DataLoader()
data_loader.load_dataset()
data_loader.preprocess_data()

# Obtain the train/test partitions and report the shape of each feature matrix.
X_train, X_test, y_train, y_test = data_loader.get_data_split()
for features in (X_train, X_test):
    print(features.shape)

(4088, 21)
(1022, 21)


In [3]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4088 entries, 5041 to 1140
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender_Female                   4088 non-null   int64  
 1   gender_Male                     4088 non-null   int64  
 2   gender_Other                    4088 non-null   int64  
 3   ever_married_No                 4088 non-null   int64  
 4   ever_married_Yes                4088 non-null   int64  
 5   work_type_Govt_job              4088 non-null   int64  
 6   work_type_Never_worked          4088 non-null   int64  
 7   work_type_Private               4088 non-null   int64  
 8   work_type_Self-employed         4088 non-null   int64  
 9   work_type_children              4088 non-null   int64  
 10  Residence_type_Rural            4088 non-null   int64  
 11  Residence_type_Urban            4088 non-null   int64  
 12  smoking_status_Unknown          4088

In [4]:
# Class-imbalance check: print the number of training labels equal to 1,
# then the number equal to 0.
for label in (1, 0):
    print(np.sum(y_train == label))

199
3889


In [5]:
# Rebalance the training split with the loader's oversample helper.
# NOTE: X_train / y_train are rebound to the helper's output, so re-running
# this cell on its own feeds the already-resampled data back into oversample();
# re-run from the split cell to start from the original partition.
X_train, y_train = data_loader.oversample(X_train, y_train)
print("After oversampling: ", X_train.shape)

# Label counts after resampling: label 1 first, then label 0.
for label in (1, 0):
    print(np.sum(y_train == label))

Index(['gender_Female', 'gender_Male', 'gender_Other', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes', 'age',
       'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'],
      dtype='object')
Index(['gender_Female', 'gender_Male', 'gender_Other', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes', 'age',
       'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'],
    

In [7]:
# Inspect the oversampled training frame.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7778 entries, 0 to 7777
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender_Female                   7778 non-null   int64  
 1   gender_Male                     7778 non-null   int64  
 2   gender_Other                    7778 non-null   int64  
 3   ever_married_No                 7778 non-null   int64  
 4   ever_married_Yes                7778 non-null   int64  
 5   work_type_Govt_job              7778 non-null   int64  
 6   work_type_Never_worked          7778 non-null   int64  
 7   work_type_Private               7778 non-null   int64  
 8   work_type_Self-employed         7778 non-null   int64  
 9   work_type_children              7778 non-null   int64  
 10  Residence_type_Rural            7778 non-null   int64  
 11  Residence_type_Urban            7778 non-null   int64  
 12  smoking_status_Unknown          77

In [8]:
# Train interpret's glassbox logistic regression with an L1 penalty
# (liblinear solver) and a fixed random_state.
lr = LogisticRegression(
    feature_names=X_train.columns,
    penalty='l1',
    solver='liblinear',
    random_state=2021,
)
lr.fit(X_train, y_train)
print("Training finished.")

Training finished.


In [9]:
# Evaluate the logistic regression model on the test split:
# macro-averaged F1 first, then plain accuracy.
y_pred = lr.predict(X_test)

macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score {macro_f1}")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy {accuracy}")

F1 Score 0.5181145141312915
Accuracy 0.7377690802348337


In [10]:
# Local explanations from the logistic regression model, restricted to the
# leading [:100] slice of the test features and labels.
explain_X, explain_y = X_test[:100], y_test[:100]
lr_local = lr.explain_local(explain_X, explain_y, name='Logistic Regression')
show(lr_local)

In [12]:
# Global explanation of the fitted logistic regression model, displayed with
# interpret's show().
lr_global = lr.explain_global(name="Logistic Regression")
show(lr_global)

In [13]:
# Fit decision tree model.
# random_state is passed so the tree is seeded like the other two models in
# this notebook (both use 2021); without it, repeated runs are not guaranteed
# to build the same tree. Extra keyword arguments are forwarded to the
# underlying scikit-learn estimator, as with LogisticRegression here.
tree = ClassificationTree(random_state=2021)
tree.fit(X_train, y_train)
print("Training Finished.")

Training Finished.


In [14]:
# Evaluate the decision tree on the test split:
# macro-averaged F1 first, then plain accuracy.
y_pred = tree.predict(X_test)

macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score {macro_f1}")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy {accuracy}")

F1 Score 0.5259920468974536
Accuracy 0.761252446183953


In [22]:
# Local explanations from the decision tree, restricted to the leading [:100]
# slice of the test features and labels.
explain_X, explain_y = X_test[:100], y_test[:100]
tree_local = tree.explain_local(explain_X, explain_y, name='tree')
show(tree_local)

In [26]:
# Global explanation of the fitted decision tree, displayed with interpret's
# show().
tree_global = tree.explain_global(name="Tree")
show(tree_global)

In [24]:
# Fit an ExplainableBoostingClassifier (EBM) on the training data, with a
# fixed random_state.
ebm = ExplainableBoostingClassifier(random_state=2021)
ebm.fit(X_train, y_train)
print("Training Finished.")

Training Finished.


In [25]:
# Evaluate the EBM on the test split:
# macro-averaged F1 first, then plain accuracy.
y_pred = ebm.predict(X_test)

macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score {macro_f1}")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy {accuracy}")

F1 Score 0.5583304901626294
Accuracy 0.9256360078277887


In [29]:
# Local explanations from the EBM, restricted to the leading [:100] slice of
# the test features and labels.
explain_X, explain_y = X_test[:100], y_test[:100]
ebm_local = ebm.explain_local(explain_X, explain_y, name='EBM')
show(ebm_local)

In [31]:
# Global explanation of the fitted EBM, displayed with interpret's show().
ebm_global = ebm.explain_global(name="EBM")
show(ebm_global)