# Heart disease 
Model to predict occurence of heart failure/attack based on provided health metrics/data

**Model type: Decision tree**

**Added Improvements(XGBoost, Pipelines)**

In [2]:
import xgboost as xgb
import pandas as pd 

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 


In [3]:
# heart_data = pd.read_csv('heart.csv')
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
required_features = ['gender', 'age', 'hypertension', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'heart_disease']
heart_data = data[required_features]
heart_data

Unnamed: 0,gender,age,hypertension,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,heart_disease
0,Male,67.0,0,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,Yes,Self-employed,Rural,202.21,,never smoked,0
2,Male,80.0,0,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,Yes,Private,Urban,171.23,34.4,smokes,0
4,Female,79.0,1,Yes,Self-employed,Rural,174.12,24.0,never smoked,0
...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [12]:
heart_data['heart_disease'].value_counts()

heart_disease
0    4834
1     276
Name: count, dtype: int64

In [8]:
num_cols = ['age', 'bmi', 'hypertension', 'avg_glucose_level']
# cat_cols = ['RestingECG', 'ChestPainType', 'Sex', 'ExerciseAngina', 'ST_Slope']
cat_cols = ['gender', 'ever_married', 'Residence_type', 'work_type', 'smoking_status']

num_imputer = SimpleImputer(strategy='constant')

c_transformer = Pipeline(steps=[
    ('imputer', (SimpleImputer(strategy='most_frequent'))),
    ('onehot', (OneHotEncoder(handle_unknown='ignore', sparse=False))),
    ('enc', (OrdinalEncoder()))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_cols),
        ('cat', c_transformer, cat_cols)
        
    ]
)

In [9]:
x = heart_data.drop(columns=['heart_disease'])
y = heart_data['heart_disease']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

model = xgb.XGBClassifier(
                     objective='binary:logistic',
                     max_depth=5,
                     n_estimators=100,
                     random_state=42,
                     learning_rate=0.1,
                )

In [10]:
my_pipeline = Pipeline(steps=[
     ('preprocessor', preprocessor),
     ('model', model)
])

my_pipeline.fit(x_train, y_train)
heart_disease_prediction = my_pipeline.predict(x_test)
detection_accuracy = my_pipeline.score(x_test, y_test)

detection_accuracy



0.9471624266144814

In [None]:
# heart_data['Sex'] = (heart_data['Sex'] == 'M').astype(int)
# heart_data['ExerciseAngina'] = (heart_data['ExerciseAngina'] == 'Y').astype(int)
# heart_data['ST_Slope'] = (heart_data['ST_Slope'] == 'Up').astype(int)

# encoder = OneHotEncoder(handle_unknown='ignore')
# encoded = encoder.fit_transform(data[['RestingECG', 'ChestPainType']]).toarray()

# heart_data.update(encoded)

# heart_data


In [11]:
metrics_report = classification_report(y_test, heart_disease_prediction)

print(metrics_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.00      0.00      0.00        54

    accuracy                           0.95      1022
   macro avg       0.47      0.50      0.49      1022
weighted avg       0.90      0.95      0.92      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
