# brain stroke prediction dataset

https://www.kaggle.com/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset


In [2]:
import pandas as pd
from pathlib import Path

def prepare_data():
    full_data = pd.read_csv(Path().resolve().joinpath("dataset/full_data.csv"))
    full_data[["hypertension", "heart_disease", "ever_married", "stroke"]] = full_data[
        ["hypertension", "heart_disease", "ever_married", "stroke"]
    ].replace(
        [1, 0], ["Yes", "No"]
    )  # prevent interfence with StringLookup layer

    full_data = full_data.drop_duplicates().dropna()

    data_shuffled = full_data.sample(frac=1).reset_index(drop=True)

    # create test dataset of 50 positive / 50 negative
    pos_rows = data_shuffled.loc[(full_data["stroke"] == "Yes")][:25]
    neg_rows = data_shuffled.loc[(full_data["stroke"] == "No")][:25]
    test_data = pd.concat([pos_rows, neg_rows])

    train_data = (
        pd.merge(data_shuffled, test_data, how="outer", indicator=True)
        .query('_merge=="left_only"')
        .drop("_merge", axis=1)
    )  # exclude test data from train dataset

    # balance training dataset
    train_pos = train_data[train_data["stroke"] == "Yes"]
    train_neg = train_data[train_data["stroke"] == "No"][:len(train_pos)]
    train_data = pd.concat([train_pos, train_neg])

    test_data.to_csv(Path().resolve().joinpath("dataset/test_data.csv"), index=False, header=False)
    train_data.to_csv(Path().resolve().joinpath("dataset/train_data.csv"), index=False, header=False)

    return test_data, train_data


def cleanup(data):
    return data.replace(
        {
            "Male": 1,
            "Female": 0,
            "Yes": 1,
            "No": 0,
            "Urban": 1,
            "Rural": 0,
            "Private": 0,
            "Self-employed": 1,
            "Govt_job": 2,
            "children": 3,
            "Unknown": 0,
            "never smoked": 0,
            "formerly smoked": 1,
            "smokes": 2,
        },
    ) 

In [3]:
# preproccess 

test_data, train_data = prepare_data()
data = cleanup(train_data)
test_data = cleanup(test_data)

x = data.loc[:, data.columns != 'stroke']  # strip labels
y = data["stroke"]  # labels

x_test = test_data.loc[:, test_data.columns != 'stroke']
y_test = test_data['stroke']

print(
    f"Using {len(train_data)} samples for training and {len(test_data)} for validation"
)

Using 490 samples for training and 50 for validation


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score


In [5]:
# RandomForest
rforest_clf = RandomForestClassifier()
rforest_clf.fit(x, y)

y_pred_rf = rforest_clf.predict(x_test)

In [6]:
report = classification_report(y_pred=y_pred_rf, y_true=y_test)
baccuracy = balanced_accuracy_score(y_pred=y_pred_rf, y_true=y_test)

print(report)
print(baccuracy)

              precision    recall  f1-score   support

           0       1.00      0.74      0.85        47
           1       0.20      1.00      0.33         3

    accuracy                           0.76        50
   macro avg       0.60      0.87      0.59        50
weighted avg       0.95      0.76      0.82        50

0.8723404255319149


In [7]:
# DecisionTree
dtree_clf = DecisionTreeClassifier()
dtree_clf.fit(x, y)

y_pred_dt = dtree_clf.predict(x_test)
#tree.plot_tree(dtree_clf)


In [11]:
report = classification_report(y_pred=y_pred_dt, y_true=y_test)
baccuracy = balanced_accuracy_score(y_pred=y_pred_dt, y_true=y_test)

print(report)
print(baccuracy)

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        47
           1       0.19      1.00      0.32         3

    accuracy                           0.74        50
   macro avg       0.59      0.86      0.58        50
weighted avg       0.95      0.74      0.81        50

0.8617021276595744
