# brain stroke prediction dataset

https://www.kaggle.com/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset


In [129]:
import pandas as pd
from pandas import DataFrame, Series
from pathlib import Path
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample, shuffle

from sklearn.preprocessing import OrdinalEncoder


In [130]:
def load_data(train_file, validation_file, test_file, headers):
    train_df = pd.read_csv(train_file, names=headers)
    validation_df = pd.read_csv(validation_file, names=headers)
    test_df = pd.read_csv(test_file, names=headers)

    return test_df, validation_df, train_df

def split_label(data: DataFrame):
    x = data.copy().drop('stroke', axis=1)
    y = data["stroke"]  # labels

    return x, y

def encode_int(data):
    return data.replace(
        {
            "Male": 1,
            "Female": 0,
            "Yes": 1,
            "No": 0,
            "Urban": 1,
            "Rural": 0,
            "Private": 0,
            "Self-employed": 1,
            "Govt_job": 2,
            "children": 3,
            "Unknown": np.nan,
            "never smoked": 0,
            "formerly smoked": 1,
            "smokes": 2,
        },
    ) 

def encode_int_ordinal(data):
    enc = OrdinalEncoder()
    enc.fit(data)
    
    return DataFrame(enc.transform(data), columns=data.columns)

In [131]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

test_data, validation_data, train_data = load_data(
    train_file=train_data_file, validation_file=validation_data_file, test_file=test_data_file, headers=CSV_HEADER
)

test_data = encode_int_ordinal(test_data)
validation_data = encode_int_ordinal(validation_data)
train_data = encode_int_ordinal(train_data)

# concat validation and training data for now
train_data = pd.concat([train_data, validation_data])
validation_data = DataFrame()

x, y = split_label(train_data)
#x_valid, y_valid = split_label(validation_data)
x_test, y_test = split_label(test_data)

print(
    f"Using {len(train_data)} samples for training, {len(validation_data)} for validation, and {len(test_data)} for test"
)


Using 6470 samples for training, 0 for validation, and 481 for test


In [132]:
# RandomForest
rforest_clf = RandomForestClassifier()
rforest_clf.fit(x, y)

y_pred_rf = rforest_clf.predict(x_test)

In [133]:
report = classification_report(y_pred=y_pred_rf, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_rf, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")

              precision    recall  f1-score   support

         0.0       0.94      0.97      0.96       452
         1.0       0.20      0.10      0.14        29

    accuracy                           0.92       481
   macro avg       0.57      0.54      0.55       481
weighted avg       0.90      0.92      0.91       481

tn: 440 fp:12 fn: 26 tp: 3 


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 1.00   | 0.97      | 352     |
| 1            | 0.00      | 0.00   | 0.00      | 29      |
| accuracy     |           |        | 0.94      | 481     |
| macro avg    | 0.47      | 0.50   | 0.48      | 481     |
| weighted avg | 0.88      | 0.94   | 0.91      | 481     |

tn: 451 fp:1 fn: 29 tp: 0 


#### After Resampling


In [134]:
# DecisionTree
dtree_clf = DecisionTreeClassifier()
dtree_clf.fit(x, y)

y_pred_dt = dtree_clf.predict(x_test)


In [135]:
report = classification_report(y_pred=y_pred_dt, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_dt, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")


              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94       452
         1.0       0.19      0.24      0.21        29

    accuracy                           0.89       481
   macro avg       0.57      0.59      0.58       481
weighted avg       0.90      0.89      0.90       481

tn: 422 fp:30 fn: 22 tp: 7 


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 0.95   | 0.94      | 452     |
| 1            | 0.11      | 0.10   | 0.11      | 29      |
| accuracy     |           |        | 0.90      | 481     |
| macro avg    | 0.47      | 0.50   | 0.53      | 481     |
| weighted avg | 0.89      | 0.90   | 0.89      | 481     |

tn: 451 fp:1 fn: 29 tp: 0 

#### After Resampling


In [136]:
import xgboost as xgb

# read in data
dtrain = xgb.DMatrix(data=x, label=y)
dtest = xgb.DMatrix(data=x_test, label=y_test)

xgb.XGBClassifier

# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 10
bst = xgb.train(param, dtrain, num_round)

# make prediction
y_pred_xgb = bst.predict(dtest)

y_pred_xgb

array([3.24601610e-03, 1.47611015e-02, 4.32655262e-03, 4.60040063e-01,
       1.88531149e-02, 5.62408846e-03, 1.26156211e-02, 3.79655510e-02,
       7.17055053e-02, 3.84556800e-01, 2.45140679e-03, 1.43669592e-02,
       1.18708995e-03, 1.06191123e-03, 3.21757719e-02, 2.45140679e-03,
       1.57252373e-03, 2.05225959e-01, 1.24402368e-03, 1.06191123e-03,
       4.60634530e-01, 4.72987115e-01, 5.75710125e-02, 1.95654668e-03,
       8.39988352e-04, 1.18708995e-03, 8.31278227e-03, 1.13946795e-01,
       8.44849288e-01, 9.29742157e-02, 9.52079426e-03, 6.02040789e-04,
       2.79951021e-02, 2.47273128e-03, 1.84201360e-01, 5.14716983e-01,
       6.02040789e-04, 7.13593423e-01, 7.57074282e-02, 9.52876285e-02,
       5.17838076e-02, 1.06191123e-03, 1.81760132e-01, 2.53762245e-01,
       4.06424486e-04, 2.89317444e-02, 1.01197608e-01, 5.12269318e-01,
       2.01395322e-02, 2.19319831e-03, 3.83825670e-03, 7.99701214e-02,
       2.27952772e-03, 8.27586651e-01, 1.76128349e-03, 7.78734922e-01,
      

In [137]:
y_pred_xgb = np.round(y_pred_xgb)

report = classification_report(y_pred=y_pred_xgb, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_xgb, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")

              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94       452
         1.0       0.23      0.31      0.26        29

    accuracy                           0.89       481
   macro avg       0.59      0.62      0.60       481
weighted avg       0.91      0.89      0.90       481

tn: 421 fp:31 fn: 20 tp: 9 
