# brain stroke prediction dataset

https://www.kaggle.com/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset


In [65]:
import pandas as pd
from pandas import DataFrame, Series
from pathlib import Path
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample, shuffle

from sklearn.preprocessing import OrdinalEncoder


In [66]:
def load_data(train_file, test_file, headers):
    train_df = pd.read_csv(train_file, names=headers)
    test_df = pd.read_csv(test_file, names=headers)

    return test_df, train_df


def encode_int(data):
    return data.replace(
        {
            "Male": 1,
            "Female": 0,
            "Yes": 1,
            "No": 0,
            "Urban": 1,
            "Rural": 0,
            "Private": 0,
            "Self-employed": 1,
            "Govt_job": 2,
            "children": 3,
            "Unknown": np.nan,
            "never smoked": 0,
            "formerly smoked": 1,
            "smokes": 2,
        },
    ) 

def encode_int_ordinal(data):
    enc = OrdinalEncoder()
    enc.fit(data)
    
    return DataFrame(enc.transform(data), columns=data.columns)

In [67]:
from imblearn.combine import SMOTEENN

def resample(data: pd.DataFrame):
    """ oversample positive cases with SMOTE and undersample negative with EEN """
    #minority_df = data[data['stroke'] == 1]
    #majority_df = data[data['stroke'] == 0]

    X = data.drop(columns=['stroke'], axis=1)
    Y = data['stroke']

    smote_een = SMOTEENN(random_state=0, sampling_strategy='auto')

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)  # type: ignore
    x_resampled['stroke'] = y_resampled

    return x_resampled


In [85]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
test_data_file = str(test_data_path.absolute())

test_data, train_data = load_data(
    train_file=train_data_file, test_file=test_data_file, headers=CSV_HEADER
)

test_data = encode_int_ordinal(test_data)
train_data = encode_int_ordinal(train_data)

x = train_data.copy().drop('stroke', axis=1)
y = train_data["stroke"]  # labels

x_test = test_data.copy().drop('stroke', axis=1)
y_test = test_data["stroke"]

print(
    f"Using {len(train_data)} samples for training and {len(test_data)} for validation"
)


Using 6625 samples for training and 481 for validation


In [72]:
# RandomForest
rforest_clf = RandomForestClassifier()
rforest_clf.fit(x, y)

y_pred_rf = rforest_clf.predict(x_test)

In [90]:
report = classification_report(y_pred=y_pred_rf, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_rf, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")

              precision    recall  f1-score   support

         0.0       0.95      0.76      0.85       452
         1.0       0.09      0.38      0.15        29

    accuracy                           0.74       481
   macro avg       0.52      0.57      0.50       481
weighted avg       0.90      0.74      0.80       481

tn: 345 fp:107 fn: 18 tp: 11 


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 1.00   | 0.97      | 352     |
| 1            | 0.00      | 0.00   | 0.00      | 29      |
| accuracy     |           |        | 0.94      | 481     |
| macro avg    | 0.47      | 0.50   | 0.48      | 481     |
| weighted avg | 0.88      | 0.94   | 0.91      | 481     |

tn: 451 fp:1 fn: 29 tp: 0 


#### After Resampling


In [112]:
# DecisionTree
dtree_clf = DecisionTreeClassifier()
dtree_clf.fit(x, y)

y_pred_dt = dtree_clf.predict(x_test)


In [111]:
report = classification_report(y_pred=y_pred_dt, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_dt, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")


              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94       452
         1.0       0.18      0.21      0.19        29

    accuracy                           0.89       481
   macro avg       0.56      0.57      0.57       481
weighted avg       0.90      0.89      0.90       481

tn: 424 fp:28 fn: 23 tp: 6 


#### Before Resampling

classification_report:

| label        | precision | recall | f-1 score | support |
| ------------ | --------- | ------ | --------- | ------- |
| 0            | 0.94      | 0.95   | 0.94      | 452     |
| 1            | 0.11      | 0.10   | 0.11      | 29      |
| accuracy     |           |        | 0.90      | 481     |
| macro avg    | 0.47      | 0.50   | 0.53      | 481     |
| weighted avg | 0.89      | 0.90   | 0.89      | 481     |

tn: 451 fp:1 fn: 29 tp: 0 

#### After Resampling


In [113]:
import xgboost as xgb

# read in data
dtrain = xgb.DMatrix(data=x, label=y)
dtest = xgb.DMatrix(data=x_test, label=y_test)

xgb.XGBClassifier

# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 10
bst = xgb.train(param, dtrain, num_round)

# make prediction
y_pred_xgb = bst.predict(dtest)

y_pred_xgb

array([9.16132983e-03, 2.56651212e-02, 1.18807200e-02, 1.77822560e-01,
       5.38735054e-02, 7.51902023e-03, 1.10525859e-03, 6.17070049e-02,
       2.81591237e-01, 1.48238853e-01, 7.40195950e-03, 1.13786412e-02,
       7.40195950e-03, 3.03242006e-03, 1.68142300e-02, 5.18289581e-02,
       3.45356733e-04, 6.62667394e-01, 6.08515402e-04, 3.03242006e-03,
       5.65594256e-01, 8.56928229e-02, 3.54468487e-02, 3.46717285e-03,
       4.57183429e-04, 7.40195950e-03, 1.65244564e-02, 2.58539677e-01,
       3.60054851e-01, 2.09511623e-01, 5.58384098e-02, 5.35045401e-04,
       2.75469050e-02, 5.00949286e-03, 3.89814258e-01, 3.51326287e-01,
       6.08515402e-04, 6.54939890e-01, 6.68293312e-02, 2.02352136e-01,
       2.51868844e-01, 9.46639979e-04, 3.15161049e-01, 5.13294376e-02,
       1.25259132e-04, 1.13786412e-02, 2.23570783e-02, 4.21892703e-01,
       7.70949125e-02, 9.46639979e-04, 7.21146446e-03, 1.82904616e-01,
       5.57057466e-03, 1.82879776e-01, 8.97963904e-03, 4.52158600e-01,
      

In [110]:
y_pred_xgb = np.round(y_pred_xgb)

report = classification_report(y_pred=y_pred_xgb, y_true=y_test)
tn, fp, fn, tp = confusion_matrix(y_pred=y_pred_xgb, y_true=y_test).ravel()

print(report)
print(f"tn: {tn} fp:{fp} fn: {fn} tp: {tp} ")

              precision    recall  f1-score   support

         0.0       0.95      0.92      0.94       452
         1.0       0.20      0.31      0.24        29

    accuracy                           0.88       481
   macro avg       0.58      0.62      0.59       481
weighted avg       0.91      0.88      0.90       481

tn: 416 fp:36 fn: 20 tp: 9 
