# Heart Disease Prediction Model

### Importing libraries

In [None]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

### Loading the data

In [None]:
# Absolute path to the raw CSV; adjust it when the file lives somewhere else.
DATA_PATH = "/content/heart_disease_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick look at the raw data before any preprocessing.
print("\nFirst 5 rows of dataset")
print(df.head())

print("\nInfo")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" after the report.
df.info()

print("\nDescription")
# include="all" summarises the text columns as well as the numeric ones.
print(df.describe(include="all"))

print("\nNull Values")
# Number of missing entries per column.
print(df.isnull().sum())


First 5 rows of dataset
    Age     Sex        Chest_pain  Blood_pressure  Cholesterol_level  \
0  56.0    Male      Asymptomatic           106.0              274.0   
1  48.0    Male  Non-anginal pain           133.0              206.0   
2  58.0  Female  Non-anginal pain           130.0              211.0   
3  68.0  Female   Atypical angina           134.0              219.0   
4  47.0    Male      Asymptomatic           149.0              248.0   

  Smoking_habit Exercise_habit Family_history Diabetes  Heart_Disease_Status  
0        Normal         Medium            Yes      Yes                     0  
1           Low         Medium             No       No                     1  
2           Low           High            Yes       No                     1  
3        Normal            Low            Yes       No                     1  
4           Low         Medium       Not sure       No                     1  

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entrie

### Encoding the categorical (text) columns as numbers

In [14]:
# Columns whose values are replaced by integer codes.
# NOTE(review): 'Heart_Disease_Status' already looks numeric (0/1) in the
# dataset preview; encoding it is kept for compatibility — confirm it is needed.
categorical_columns = [
    'Sex', 'Chest_pain', 'Smoking_habit', 'Exercise_habit',
    'Family_history', 'Diabetes', 'Heart_Disease_Status',
]

# Keep every fitted encoder instead of discarding it: the position of a label
# in `label_encoders[col].classes_` is the integer code stored in df[col], and
# that mapping is what any manually entered feature value has to follow.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

### Splitting the dataset for training and testing



In [15]:
# The label the model has to predict, and every remaining column as a feature.
y = df["Heart_Disease_Status"]
X = df.drop(columns="Heart_Disease_Status")

# Hold out 20% of the rows for testing and train on the other 80%.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Initial Model

In [None]:
# Baseline: an XGBoost classifier with library defaults, fitted on the
# training split. Only the seed is fixed so that reruns give the same model.
model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out rows it has never seen.
y_pred = model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", baseline_accuracy)

Accuracy:  0.6192


### Hyperparameter tuning the model

In [None]:
# Candidate values for each hyperparameter; the search samples combinations
# from these lists rather than trying every one.
para_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 9],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.5, 0.7, 0.9, 0.85, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.9, 0.85, 1.0],
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1, 10, 100],
    'reg_lambda': [0, 0.001, 0.01, 0.1, 1, 1.3, 0.5, 0.7]
}

# Use a separate name for the search template. RandomizedSearchCV clones the
# estimator it is given, so the object passed in is never fitted itself;
# rebinding `model` here would replace the fitted baseline with an unfitted
# classifier and break any later cell that evaluates `model`.
search_estimator = XGBClassifier(random_state=42)

# 100 sampled candidates x 10-fold cross-validation = 1000 fits.
# random_state pins which candidates are sampled so the search is repeatable.
rand_search = RandomizedSearchCV(
    search_estimator,
    para_grid,
    cv=10,
    scoring="accuracy",
    n_iter=100,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rand_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on all of X_train.
best_model = rand_search.best_estimator_
print("Best Model: ", best_model)
print("Best Score: ", rand_search.best_score_)
print("Best Params: ", rand_search.best_params_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Model:  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.2, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=5, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, ...)
Best Score:  0.630725
Best Params:  {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 100, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0

### Taking the input from the user and predicting the status of their heart disease (Yes/No)

In [16]:
def predict_user():
  try:
    print("Heart Disease Predictor")

    age = float(input("Enter Age: "))
    sex = float(input("Enter Sex (0 for female, 1 for male): "))
    print("Chest Pain Options: ")
    print("\nFor 'No' Enter 1")
    print("For 'Mostly' Enter 2")
    print("For 'Sometimes' Enter 3")
    print("For 'Always' Enter 0")
    chest_pain = float(input("Enter Chest Pain Type: "))
    bp = float(input("Enter your Blood Pressure: "))
    cholesterol = float(input("Enter your Cholesterol: "))
    smoking_habit = float(input("Enter your Smoking Habit (2 for Normal, 1 for Low, 0 for High): "))
    exercise_habit = float(input("Enter your Exercise Habit (2 for Medium, 1 for Low, 0 for High): "))
    fam_history = float(input("Do you have a family history of heart diseases? (2 for Yes, 0 for No, 1 for You'r not sure): "))
    diabetes = float(input("Do you have diabetes? (1 for Yes, 0 for No): "))

    user_df = pd.DataFrame([{
        'Age': age,
        'Sex': sex,
        'Chest_pain': chest_pain,
        'Blood_pressure': bp,
        'Cholesterol_level': cholesterol,
        'Smoking_habit': smoking_habit,
        'Exercise_habit': exercise_habit,
        'Family_history': fam_history,
        'Diabetes': diabetes
    }])

    prediction = best_model.predict(user_df)[0]

    print("\nPrediction:", "You have a heart disease" if prediction == 1 else "You do not have a heart disease")
  except Exception as e:
    print("Invalid Input")
predict_user()

Heart Disease Predictor
Enter Age: 58
Enter Sex (0 for female, 1 for male): 1
Chest Pain Options: 

For 'No' Enter 1
For 'Mostly' Enter 2
For 'Sometimes' Enter 3
For 'Always' Enter 0
Enter Chest Pain Type: 0
Enter your Blood Pressure: 150
Enter your Cholesterol: 240
Enter your Smoking Habit (2 for Normal, 1 for Low, 0 for High): 0
Enter your Exercise Habit (2 for Medium, 1 for Low, 0 for High): 1
Do you have a family history of heart diseases? (2 for Yes, 0 for No, 1 for You'r not sure): 2
Do you have diabetes? (1 for Yes, 0 for No): 1

Prediction: You have a heart disease


### Evaluating how good the model is

In [17]:

# Precision: of all the people the model flagged as having the disease, how many really do?
# Recall: of all the people who really have the disease, how many did the model catch?
# F1-Score: a single balanced score that combines precision and recall.
# ROC AUC: how well the model separates sick people from healthy people.


def evaluate_model(label, estimator, X_eval, y_eval):
    """Print and return the evaluation metrics of one fitted classifier.

    Args:
        label: Short description used in the printed heading (e.g. "best model").
        estimator: Fitted classifier exposing `predict` and `predict_proba`.
        X_eval: Feature rows to score.
        y_eval: True labels for `X_eval`.

    Returns:
        Tuple `(predictions, positive_probabilities, accuracy, report, roc_auc)`.
    """
    print(f"\nModel Evaluation Metrics using the {label}:")
    # Column 1 of predict_proba holds the probability of the positive class,
    # which is what the ROC AUC is computed from.
    positive_proba = estimator.predict_proba(X_eval)[:, 1]
    predictions = estimator.predict(X_eval)

    acc = accuracy_score(y_eval, predictions)
    report = classification_report(y_eval, predictions)
    auc = roc_auc_score(y_eval, positive_proba)

    print(f"\nAccuracy: {acc:.2f}")
    print(f"ROC AUC Score: {auc:.2f}")
    print("\nClassification Report:")
    print(report)
    return predictions, positive_proba, acc, report, auc


# `model` must still refer to the fitted baseline classifier at this point; if
# an earlier cell rebinds `model` to an unfitted estimator, this call will fail.
y_pred, y_pred_proba, accuracy, classification_rep, roc_auc = evaluate_model(
    "initial model", model, X_test, y_test
)

y_pred_1, y_pred_proba_1, accuracy_1, classification_rep_1, roc_auc_1 = evaluate_model(
    "best model", best_model, X_test, y_test
)


Model Evaluation Metrics using the initial model:

Accuracy: 0.62
ROC AUC Score: 0.66

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      5039
           1       0.62      0.62      0.62      4961

    accuracy                           0.62     10000
   macro avg       0.62      0.62      0.62     10000
weighted avg       0.62      0.62      0.62     10000


Model Evaluation Metrics using the best model:

Accuracy: 0.63
ROC AUC Score: 0.68

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      5039
           1       0.62      0.68      0.65      4961

    accuracy                           0.63     10000
   macro avg       0.63      0.63      0.63     10000
weighted avg       0.63      0.63      0.63     10000

