This notebook was created to test out making a pipeline

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

original_data = pd.read_csv("healthcare-dataset-stroke-data.csv")

#Copy the original data into a new one and drop the N/A values
new_data = original_data.dropna()
#Get rid of ID column
del new_data[new_data.columns[0]]
#Drop "Other" in gender
new_data = new_data.drop(new_data.index[new_data["gender"] == "Other"])

new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4908 non-null   object 
 1   age                4908 non-null   float64
 2   hypertension       4908 non-null   int64  
 3   heart_disease      4908 non-null   int64  
 4   ever_married       4908 non-null   object 
 5   work_type          4908 non-null   object 
 6   Residence_type     4908 non-null   object 
 7   avg_glucose_level  4908 non-null   float64
 8   bmi                4908 non-null   float64
 9   smoking_status     4908 non-null   object 
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.1+ KB


Dealing with imbalanced dataset by removing all patients under 21.

In [2]:
new_data = new_data[new_data.age > 20]

In [26]:
new_data_train, new_data_test = train_test_split(new_data, train_size = 0.8, random_state=1)
new_data_train, new_data_val = train_test_split(new_data_train, train_size = 0.8, random_state=1)

print(new_data_train.shape)
print(new_data_val.shape)
print(new_data_test.shape)


(2500, 11)
(626, 11)
(782, 11)


In [28]:
cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
num_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit_transform(new_data_train[cat_columns])

standard_scaler = StandardScaler()
standard_scaler.fit_transform(new_data_train[num_columns])

array([[ 0.22227603, -0.34868207,  3.94414647,  1.96923768,  1.14698193],
       [ 1.22949764, -0.34868207,  3.94414647,  0.36656723, -0.98550356],
       [ 0.87400766, -0.34868207, -0.25354028, -0.11017381, -1.90137874],
       ...,
       [-0.78494557, -0.34868207, -0.25354028, -0.43369955,  0.0670694 ],
       [-1.79216717, -0.34868207, -0.25354028, -0.30676805, -0.30201462],
       [-1.4366772 , -0.34868207, -0.25354028,  0.26591953,  0.85991657]])

Combining into the pipeline

In [29]:
num_pipeline = Pipeline([
    ("scale", StandardScaler())
])

one_hot_pipeline = Pipeline([
    ("one hot encode", OneHotEncoder())
])

preprocessing = ColumnTransformer([
    ("Numeric", num_pipeline, num_columns),
    ("One-Hot", one_hot_pipeline, cat_columns),
])


In [31]:
DTC_model = Pipeline([
    ('preprocessing', preprocessing),
    ('model', DecisionTreeClassifier())
])

DTC_model

In [30]:
X = new_data_train.drop('stroke', axis = 1)
y = new_data_train['stroke']

Make Decision Tree Classifier Model for testing

In [32]:
DTC_model.fit(X,y)

prediction = DTC_model.predict(new_data_val.drop('stroke', axis = 1))
print("Test using validation split")
print("Recall score:", recall_score(new_data_val['stroke'], prediction))
print(confusion_matrix(new_data_val['stroke'], prediction))
print("Test using test split")
test_prediction = DTC_model.predict(new_data_test.drop('stroke', axis = 1))
print("Recall score:", recall_score(new_data_test['stroke'], test_prediction))
print(confusion_matrix(new_data_test['stroke'], test_prediction))

Test using validation split
Recall score: 0.12195121951219512
[[552  33]
 [ 36   5]]
Test using test split
Recall score: 0.10638297872340426
[[696  39]
 [ 42   5]]
