In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

In [2]:
data = pd.read_csv('../data/healthcare-dataset-stroke-data.csv')

In [3]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data.shape

(5110, 12)

In [5]:
data.drop(data[data.gender == 'Other'].index, inplace=True)
data.drop(['id', 'avg_glucose_level'], axis=1, inplace=True)

In [6]:
# # Shuffle the date to see if this changes things: 
# # 
# data = data.sample(frac=1, random_state=42).reset_index(drop=True)
# # data = data.sample(frac=1, random_state=42)

# print(data.describe())
# print(data.head())

# data.stroke.plot(kind='line')

In [7]:
#data = data[data['stroke']==1]

Y = data[['stroke']]
X = data.drop('stroke', axis=1)

Y.head(25)

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [8]:
model = RandomForestClassifier()

categorical_cols = data.select_dtypes(include=[object]).columns

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop = 'if_binary'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough')

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

In [9]:
# X = X[:100]
# Y = Y[:100]

my_pipeline.fit(X, Y)

  return fit_method(estimator, *args, **kwargs)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:
predictions = my_pipeline.predict(X)
print(precision_score(Y, predictions))
print(recall_score(Y, predictions))

1.0
0.9919678714859438


In [13]:
# Mann, Mann, Mann, I cannot believe this! ;-)


# Try evaluation: 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# # Initialize and train the RandomForestClassifier
# clf = RandomForestClassifier(random_state=42)
# clf.fit(X_train, y_train)

my_pipeline.fit(X_train, y_train)

# Make predictions on the test set
predictions = my_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.9393346379647749
Confusion Matrix:
[[1440    4]
 [  89    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1444
           1       0.00      0.00      0.00        89

    accuracy                           0.94      1533
   macro avg       0.47      0.50      0.48      1533
weighted avg       0.89      0.94      0.91      1533



In [None]:
# Test predictions for small subset where stroke==1:
#stroketrue = pd.where(Y['stroke']==1)

#print(stroketrue)

print(Y)
print(Y['stroke']==1)

stroketrue = np.where(Y['stroke']==1)
print(stroketrue)

#X_stroketrue = X[stroketrue]
#X_stroketrue.shape

# predictions = my_pipeline.predict(X_stroketrue)

#print(precision_score(Y[stroketrue], predictions))
#print(recall_score(Y[stroketrue], predictions))


      stroke
0          0
1          0
2          0
3          0
4          0
...      ...
5104       0
5105       0
5106       0
5107       0
5108       0

[5109 rows x 1 columns]
0       False
1       False
2       False
3       False
4       False
        ...  
5104    False
5105    False
5106    False
5107    False
5108    False
Name: stroke, Length: 5109, dtype: bool
(array([  15,   52,   57,   96,  115,  128,  134,  148,  153,  159,  174,
        192,  203,  213,  229,  242,  243,  244,  259,  276,  286,  303,
        341,  375,  385,  386,  421,  451,  516,  519,  520,  535,  539,
        550,  560,  584,  590,  591,  602,  604,  612,  615,  682,  721,
        736,  754,  760,  774,  786,  797,  813,  820,  825,  830,  842,
        893,  901,  935,  940,  979, 1006, 1010, 1044, 1049, 1059, 1083,
       1102, 1117, 1132, 1144, 1155, 1167, 1175, 1189, 1198, 1201, 1214,
       1235, 1271, 1276, 1338, 1341, 1366, 1390, 1417, 1429, 1435, 1513,
       1518, 1545, 1570, 1586, 1612, 162

In [None]:
my_pipeline.steps[1][1].feature_importances_

array([4.33234960e-02, 2.13855371e-02, 1.45241855e-02, 1.16913857e-05,
       2.00347583e-02, 1.73963168e-02, 1.45016161e-03, 4.53104344e-02,
       1.73873872e-02, 1.81306573e-02, 2.08934027e-02, 2.04425521e-02,
       3.16718932e-01, 3.21534059e-02, 2.88269033e-02, 3.82010179e-01])

In [None]:
my_pipeline[:-1].get_feature_names_out()

array(['cat__gender_Male', 'cat__ever_married_Yes',
       'cat__work_type_Govt_job', 'cat__work_type_Never_worked',
       'cat__work_type_Private', 'cat__work_type_Self-employed',
       'cat__work_type_children', 'cat__Residence_type_Urban',
       'cat__smoking_status_Unknown',
       'cat__smoking_status_formerly smoked',
       'cat__smoking_status_never smoked', 'cat__smoking_status_smokes',
       'remainder__age', 'remainder__hypertension',
       'remainder__heart_disease', 'remainder__bmi'], dtype=object)

In [None]:
# just an attempt on the two most important features
Xtrain = X[['age','bmi']]
model.fit(Xtrain, Y)
print(precision_score(Y, model.predict(Xtrain)))
print(recall_score(Y, model.predict(Xtrain)))

  return fit_method(estimator, *args, **kwargs)


0.9014778325123153
0.7349397590361446


In [None]:
single_case = pd.DataFrame(['Male', 40.0, 0, 0, 1, 'Self-employed', 'Urban', 16, 'formerly smoked']).T
single_case.columns = X.columns
single_case = single_case.astype({'age': float, 'hypertension': int, 'heart_disease': int, 'bmi': float})
#single_case.hypertension.astype(int)
#single_case.heart_disease.astype(int)
#single_case.bmi.astype(float)
single_case.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          1 non-null      object 
 1   age             1 non-null      float64
 2   hypertension    1 non-null      int64  
 3   heart_disease   1 non-null      int64  
 4   ever_married    1 non-null      object 
 5   work_type       1 non-null      object 
 6   Residence_type  1 non-null      object 
 7   bmi             1 non-null      float64
 8   smoking_status  1 non-null      object 
dtypes: float64(2), int64(2), object(5)
memory usage: 204.0+ bytes


In [None]:
my_pipeline.predict(single_case)

ValueError: Found unknown categories [1] in column 1 during transform

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5109 entries, 0 to 5109
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          5109 non-null   object 
 1   age             5109 non-null   float64
 2   hypertension    5109 non-null   int64  
 3   heart_disease   5109 non-null   int64  
 4   ever_married    5109 non-null   object 
 5   work_type       5109 non-null   object 
 6   Residence_type  5109 non-null   object 
 7   bmi             4908 non-null   float64
 8   smoking_status  5109 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 528.2+ KB
