This notebook was created to test out making a pipeline

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
original_data = pd.read_csv("healthcare-dataset-stroke-data.csv")

#Copy the original data into a new one and drop the N/A values
new_data = original_data.dropna()
#Get rid of ID column
del new_data[new_data.columns[0]]
#Drop "Other" in gender
new_data = new_data.drop(new_data.index[new_data["gender"] == "Other"])

In [2]:
new_data_train, new_data_test = train_test_split(new_data, train_size = 0.8, random_state=1)
new_data_train, new_data_val = train_test_split(new_data_train, train_size = 0.8, random_state=1)

new_data_test.info() 

#train 3140 
#val 786 
#test 982

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 1327 to 4412
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             982 non-null    object 
 1   age                982 non-null    float64
 2   hypertension       982 non-null    int64  
 3   heart_disease      982 non-null    int64  
 4   ever_married       982 non-null    object 
 5   work_type          982 non-null    object 
 6   Residence_type     982 non-null    object 
 7   avg_glucose_level  982 non-null    float64
 8   bmi                982 non-null    float64
 9   smoking_status     982 non-null    object 
 10  stroke             982 non-null    int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 92.1+ KB


In [3]:
import joblib

def transfer_model(pipeline):
    return joblib.dump(pipeline, 'pipeline_for_app.pkl')

In [4]:
cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
num_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit_transform(new_data_train[cat_columns])

standard_scaler = StandardScaler()
standard_scaler.fit_transform(new_data_train[num_columns])

array([[ 0.93841456, -0.31104929,  4.35889894, -0.49698966,  0.05966061],
       [ 1.0274205 , -0.31104929, -0.22941573,  2.15351315,  0.71481042],
       [ 0.13736111, -0.31104929, -0.22941573, -0.43050171, -1.0065636 ],
       ...,
       [ 0.27087002, -0.31104929,  4.35889894, -0.63469861, -0.81387248],
       [ 0.04835517, -0.31104929, -0.22941573, -0.39331353, -0.31287556],
       [-0.17415968,  3.21492458, -0.22941573, -1.05593923,  1.30572987]])

In [5]:
num_pipeline = Pipeline([
    ("scale", StandardScaler())
])

one_hot_pipeline = Pipeline([
    ("one hot encode", OneHotEncoder())
])

preprocessing = ColumnTransformer([
    ("Numeric", num_pipeline, num_columns),
    ("One-Hot", one_hot_pipeline, cat_columns),
])

DTC_model = Pipeline([
    ('preprocessing', preprocessing),
    ('model', DecisionTreeClassifier())
])

DTC_model

In [6]:
X = new_data_train.drop('stroke', axis = 1)
y = new_data_train['stroke']

DTC_model.fit(X,y)

prediction = DTC_model.predict(new_data_val.drop('stroke', axis = 1))

print(prediction)

print(recall_score(new_data_val['stroke'], prediction))

print(confusion_matrix(new_data_val['stroke'], prediction))

test_prediction = DTC_model.predict(new_data_test.drop('stroke', axis = 1))

print(recall_score(new_data_test['stroke'], test_prediction))

print(confusion_matrix(new_data_test['stroke'], test_prediction))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 