In [None]:
from utils import DataLoader
from interpret.glassbox import (LogisticRegression,
                                ClassificationTree, 
                                ExplainableBoostingClassifier)
from interpret import show
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, f1_score



In [51]:
data = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
data.columns = data.columns.str.strip()


In [52]:
print(data.columns.tolist())


['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']


In [None]:
print(data.dtypes)
print(data.nunique())   


id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64


In [54]:
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)


In [56]:
data.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi', 'stroke', 'gender_Male', 'gender_Other', 'ever_married_Yes',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')

In [57]:
print(set(categorical_cols) - set(data.columns))


{'work_type', 'smoking_status', 'gender', 'Residence_type', 'ever_married'}


In [None]:
# Assuming data is your DataFrame after one-hot encoding and dropping 'id'

# Fill missing bmi values
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Drop 'id' column if still present
data = data.drop(columns=['id'], errors='ignore')

# Split features and target
X = data.drop('stroke', axis=1)
y = data['stroke']

# Convert boolean columns to integers
bool_cols = X.select_dtypes(include=['bool']).columns
X[bool_cols] = X[bool_cols].astype(int)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample minority class
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(f"Before oversampling: {X_train.shape}, {y_train.shape}")
print(f"After oversampling: {X_train_res.shape}, {y_train_res.shape}")




Before oversampling: (4088, 16), (4088,)
After oversampling: (7802, 16), (7802,)
Training finished.


In [66]:
# Train model
from interpret.glassbox import LogisticRegression
lr = LogisticRegression(random_state=2021, feature_names=X_train_res.columns, penalty='l1', solver='liblinear')
lr.fit(X_train_res, y_train_res)
print("Training finished.")
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Training finished.
Accuracy: 0.7416829745596869
F1 Score: 0.27472527472527475


In [67]:
lr_local = lr.explain_local(X_test, y_test)
show(lr_local)

In [68]:
lr_global = lr.explain_global(name="Logistic Regression Global Explanation")
show(lr_global)

In [69]:
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished.")
y_pred = tree.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")


Training finished.
F1 Score 0.4843592330978809
Accuracy 0.9393346379647749


In [70]:
tree_local = tree.explain_local(X_test, y_test)
show(tree_local)


In [72]:
tree_golbal = tree.explain_global(name="Classification Tree Global explanationssss")
show(tree_golbal)


In [73]:
ebm = ExplainableBoostingClassifier(random_state=2222)
ebm.fit(X_train, y_train)
print("Training finished.")
y_pred = ebm.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")



Training finished.
F1 Score 0.4843592330978809
Accuracy 0.9393346379647749


In [None]:
ebm_local = ebm.explain_local(X_test, y_test,name="EBM Local Explanation")
show(ebm_local)

In [76]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)