# Setup

In [16]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [113]:
data = pd.read_csv("/Users/jmadu1/Documents/healthcare-dataset-stroke-data.csv")
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [18]:
#declare X and y
X = data.drop(columns='stroke')
y = data['stroke']

In [19]:
data.rename(columns={"hypertension": "ATP2B1"})

Unnamed: 0,id,gender,age,ATP2B1,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Check for duplicates, null values and imbalances in target

In [20]:
print(f'The sum of all duplicated data is {data.duplicated().sum()}. \n')

print(f'Total null values per feature: \n {data.isnull().sum().sort_values(ascending=False)}')


The sum of all duplicated data is 0. 

Total null values per feature: 
 bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64


In [21]:
print("Percentage of null values per feature: \n")
data.isnull().sum().sort_values(ascending=False)/len(data)


Percentage of null values per feature: 



bmi                  0.039335
id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [22]:
#determine imbalances in target, if any (these affect results of dummy classifier)
data.stroke.value_counts(normalize=True)

0    0.951272
1    0.048728
Name: stroke, dtype: float64

95% of targets in dataset show patients are not likely to have a stroke. not good as this means the model itself wont predict things very accurately. hence dummy classifier score being so high (it literally reflects the skew).

# Conduct dummy test for baseline score, cross validate baseline model

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [24]:
from sklearn.dummy import DummyClassifier

baseline_model = DummyClassifier()
baseline_model.fit(X_train, y_train)
baseline_model.score(X_test, y_test) 

0.9517286366601435

In [25]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(baseline_model, X, y, cv=5)

In [26]:
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_score
0,0.002543,0.000766,0.951076
1,0.001095,0.00075,0.951076
2,0.001233,0.000947,0.951076
3,0.001141,0.000594,0.951076
4,0.001328,0.000746,0.952055


# Make preprocessor, integrate model to create full pipeline

In [27]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

num_transformer = make_pipeline(SimpleImputer(), RobustScaler())
cat_transformer_1 = OneHotEncoder(drop='if_binary')

preproc = make_column_transformer(
    (num_transformer, ['age', 'bmi','avg_glucose_level']),
    (cat_transformer_1, ['gender', 'ever_married','work_type','Residence_type','smoking_status']),
    remainder='passthrough'
)

In [28]:
#balance dataset then return to this, or consider XGBoost model again

from sklearn.ensemble import GradientBoostingClassifier

pipeline = make_pipeline(preproc, GradientBoostingClassifier())
pipeline

In [29]:
# Train Pipeline
pipeline.fit(X_train,y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Score model
pipeline.score(X_test,y_test)

0.9484670580560991

# Balance data

In [51]:
stroke_df = data[['stroke']]
stroke_df

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
...,...
5105,0
5106,0
5107,0
5108,0


In [32]:
neg_labels = stroke_df[stroke_df["stroke"] == 0]
neg_labels

Unnamed: 0,stroke
249,0
250,0
251,0
252,0
253,0
...,...
5105,0
5106,0
5107,0
5108,0


In [33]:
pos_labels = stroke_df[stroke_df["stroke"] == 1]
pos_labels

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
...,...
244,1
245,1
246,1
247,1


In [144]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [145]:
cleaned_df = data.copy()

In [146]:
train_df, test_df = train_test_split(cleaned_df, test_size=0.2)

In [147]:
train_labels = np.array(train_df.pop('stroke'))
train_labels

array([0, 0, 0, ..., 0, 0, 0])

In [148]:
test_labels = np.array(test_df.pop('stroke'))
test_labels

array([0, 0, 0, ..., 0, 0, 0])

In [149]:
bool_train_labels = train_labels != 0
bool_train_labels

array([False, False, False, ..., False, False, False])

In [150]:
train_features = np.array(train_df)
train_features

array([[56600, 'Female', 43.0, ..., 84.04, 30.6, 'Unknown'],
       [18283, 'Female', 51.0, ..., 81.38, 34.1, 'smokes'],
       [22282, 'Male', 52.0, ..., 116.62, nan, 'smokes'],
       ...,
       [38263, 'Female', 32.0, ..., 147.04, 35.7, 'Unknown'],
       [6304, 'Male', 48.0, ..., 79.2, 32.5, 'never smoked'],
       [13219, 'Male', 5.0, ..., 84.5, 15.8, 'Unknown']], dtype=object)

In [151]:
test_features = np.array(test_df)
test_features

array([[59359, 'Male', 79.0, ..., 105.93, 25.2, 'never smoked'],
       [33674, 'Female', 47.0, ..., 104.7, 20.7, 'smokes'],
       [59451, 'Male', 58.0, ..., 79.95, 25.9, 'never smoked'],
       ...,
       [67099, 'Male', 0.56, ..., 57.02, 20.7, 'Unknown'],
       [897, 'Male', 3.0, ..., 65.85, 17.0, 'Unknown'],
       [13223, 'Female', 53.0, ..., 86.39, 30.2, 'never smoked']],
      dtype=object)

In [152]:
pos_features = train_features[bool_train_labels]
neg_features = train_features[~bool_train_labels]

In [153]:
pos_df = pd.DataFrame(train_features[ bool_train_labels], columns=train_df.columns)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)

In [154]:
BUFFER_SIZE = 100000

def make_ds(features, labels):
    ds = tf.data.Dataset.from_tensor_slices((features, labels))#.cache()
    ds = ds.shuffle(BUFFER_SIZE).repeat()
    return ds

pos_ds = make_ds(pos_features, pos_labels)
neg_ds = make_ds(neg_features, neg_labels)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [128]:
for features, label in pos_ds.take(1):
    print("Features:\n", features.numpy())
    print()
    print("Label: ", label.numpy())

NameError: name 'pos_ds' is not defined