# brain stroke prediction dataset

https://www.kaggle.com/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset


In [100]:
import pandas as pd
from pandas import DataFrame, Series
from pathlib import Path
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample, shuffle
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import OrdinalEncoder


In [101]:

def split_label(data: DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    data_categorical = data[categorical_features]

    enc = OrdinalEncoder()
    enc.fit(data_categorical)

    data[categorical_features] = DataFrame(enc.fit_transform(data_categorical))

    return data


def load_data(train_file, validation_file, test_file, headers, categorical_features):
    train_df = pd.read_csv(train_file, names=headers)
    validation_df = pd.read_csv(validation_file, names=headers)
    test_df = pd.read_csv(test_file, names=headers)

    encoded = [encode_int(df, categorical_features) for df in [train_df, validation_df, test_df]]

    return encoded[0], encoded[1], encoded[2]


In [102]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]

CATEGORICAL_FEATURES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

CONTINUOUS_FEATURES = [
    "age", 
    "avg_glucose_level",
    "bmi"
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

train_data, validation_data, test_data = load_data(
    train_file=train_data_file,
    validation_file=validation_data_file,
    test_file=test_data_file,
    headers=CSV_HEADER,
    categorical_features=CATEGORICAL_FEATURES,
)

x, y = split_label(train_data)
x_valid, y_valid = split_label(validation_data)
x_test, y_test = split_label(test_data)

print(
    f"Using {len(train_data)} samples for training, {len(validation_data)} for validation, and {len(test_data)} for test"
)

train_data

Using 6468 samples for training, 500 for validation, and 481 for test


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0.0,41.000000,0.0,1.0,0.0,1.0,0.0,186.540000,39.000000,0.0,0.0
1,0.0,27.000000,0.0,0.0,0.0,1.0,0.0,75.040000,24.500000,1.0,0.0
2,0.0,36.000000,0.0,0.0,0.0,1.0,1.0,216.960000,34.500000,1.0,0.0
3,0.0,62.000000,1.0,0.0,0.0,1.0,0.0,77.040000,33.800000,0.0,0.0
4,0.0,42.000000,0.0,0.0,0.0,1.0,0.0,139.770000,27.700000,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6463,0.0,50.638327,0.0,0.0,1.0,1.0,1.0,88.050509,28.336707,1.0,1.0
6464,0.0,58.914802,0.0,0.0,1.0,1.0,1.0,110.717952,32.562306,0.0,1.0
6465,1.0,60.669342,0.0,0.0,1.0,2.0,0.0,116.010097,33.596950,1.0,1.0
6466,0.0,76.129762,0.0,0.0,1.0,2.0,0.0,106.608535,28.368690,0.0,1.0


In [103]:
# RandomForest
rforest_clf = RandomForestClassifier()
rforest_clf.fit(x, y)

y_pred_rf = rforest_clf.predict(x_test)
auc = roc_auc_score(y_score=rforest_clf.predict_proba(x_test)[:, 1], y_true=y_test)


In [104]:
report = classification_report(y_pred=y_pred_rf, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_rf, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")
print(f"auc: {auc}")


              precision    recall  f1-score   support

         0.0       0.97      0.85      0.91       452
         1.0       0.22      0.66      0.32        29

    accuracy                           0.84       481
   macro avg       0.60      0.75      0.62       481
weighted avg       0.93      0.84      0.87       481

tn: 383 fp:69 fn: 10 tp: 19 
auc: 0.8365120537076595


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 1.00   | 0.97      | 352     |
| 1            | 0.00      | 0.00   | 0.00      | 29      |
| accuracy     |           |        | 0.94      | 481     |
| macro avg    | 0.47      | 0.50   | 0.48      | 481     |
| weighted avg | 0.88      | 0.94   | 0.91      | 481     |

tn: 451 fp:1 fn: 29 tp: 0

#### After Resampling


In [105]:
# DecisionTree
dtree_clf = DecisionTreeClassifier()
dtree_clf.fit(x, y)

y_pred_dt = dtree_clf.predict(x_test)
auc = roc_auc_score(y_score=dtree_clf.predict_proba(x_test)[:, 1], y_true=y_test)


In [106]:
report = classification_report(y_pred=y_pred_dt, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_dt, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")
print(f"auc: {auc}")


              precision    recall  f1-score   support

         0.0       0.97      0.85      0.91       452
         1.0       0.20      0.55      0.29        29

    accuracy                           0.84       481
   macro avg       0.58      0.70      0.60       481
weighted avg       0.92      0.84      0.87       481

tn: 386 fp:66 fn: 13 tp: 16 
auc: 0.7028532194079951


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 0.95   | 0.94      | 452     |
| 1            | 0.11      | 0.10   | 0.11      | 29      |
| accuracy     |           |        | 0.90      | 481     |
| macro avg    | 0.47      | 0.50   | 0.53      | 481     |
| weighted avg | 0.89      | 0.90   | 0.89      | 481     |

tn: 451 fp:1 fn: 29 tp: 0

#### After Resampling


In [107]:
import xgboost as xgb

# read in data
dtrain = xgb.DMatrix(data=x, label=y)
dvalid = xgb.DMatrix(data=x_valid, label=y_valid)
dtest = xgb.DMatrix(data=x_test, label=y_test)

# specify parameters via map
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
num_round = 10

bst = xgb.train(
    param,
    dtrain,
    num_round,
    evals=[(dvalid, "eval"), (dtrain, "train")],
    early_stopping_rounds=3,
)

# make prediction
y_pred_xgb = bst.predict(dtest)
auc = roc_auc_score(y_score=y_pred_xgb, y_true=y_test)


[0]	eval-logloss:0.67396	train-logloss:0.38434
[1]	eval-logloss:0.68269	train-logloss:0.33901
[2]	eval-logloss:0.69842	train-logloss:0.31090
[3]	eval-logloss:0.62332	train-logloss:0.28768
[4]	eval-logloss:0.64099	train-logloss:0.27312
[5]	eval-logloss:0.65499	train-logloss:0.25382
[6]	eval-logloss:0.63824	train-logloss:0.24247
[7]	eval-logloss:0.63296	train-logloss:0.23644
[8]	eval-logloss:0.63944	train-logloss:0.22906
[9]	eval-logloss:0.62221	train-logloss:0.22339


In [108]:
y_pred_xgb = np.round(y_pred_xgb)

report = classification_report(y_pred=y_pred_xgb, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_xgb, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")
print(f"auc: {auc}")


              precision    recall  f1-score   support

         0.0       0.98      0.75      0.85       452
         1.0       0.16      0.76      0.27        29

    accuracy                           0.75       481
   macro avg       0.57      0.76      0.56       481
weighted avg       0.93      0.75      0.82       481

tn: 340 fp:112 fn: 7 tp: 22 
auc: 0.844598718339945
