<a href="https://colab.research.google.com/github/jivanjyotigiri2003/ML-Projects/blob/main/ML_Model_for_Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heart Disease Prediction using various ML Algorithms
**Language used:- Python**

Importing necessary Dependencies from various libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# importing the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

Data Collection and Pre-processing

In [None]:
heart_data = pd.read_csv('/content/heart.csv')

In [None]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0


In [None]:
heart_data.shape

(1025, 14)

In [None]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [None]:
#checking for null values
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:
heart_data['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

1 ---> Defective Heart
0 ---> Healthy Heart

Splitting of Data

In [None]:
# The label is the 'target' column; the features are every other column.
Y = heart_data['target']
X = heart_data.drop(columns=['target'])

In [None]:
print(X)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  
0         2   2     3  
1         0   0     3  
2  

In [None]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


Splitting of Training and Testing Data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(1025, 13) (820, 13) (205, 13)


Feature Scaling:- Feature scaling is a preprocessing technique to bring different features of the dataset onto a similar scale. This process helps in improving the performance and training speed of machine learning algorithms.

In [None]:
# Continuous-valued columns to standardise. These are the columns at
# positions 3, 4, 7 and 9 of X; naming them avoids magic indices.
scaled_columns = ['trestbps', 'chol', 'thalach', 'oldpeak']

# Work on explicit copies so the assignments below only ever modify the
# train/test frames and never write back into (or warn about) X.
X_train = X_train.copy()
X_test = X_test.copy()

# trestbps, chol and thalach are int64; cast to float64 first so that
# storing the scaled (float) values is not an incompatible-dtype assignment.
X_train[scaled_columns] = X_train[scaled_columns].astype('float64')
X_test[scaled_columns] = X_test[scaled_columns].astype('float64')

# Fit the scaler on the training rows only, then reuse the same mean and
# standard deviation for the test rows.
sc = StandardScaler()
X_train[scaled_columns] = sc.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = sc.transform(X_test[scaled_columns])

In [None]:
print(X_train)

      age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
14     52    1   0 -0.219744 -0.830508    1        1  0.306472      1   
586    64    1   2 -0.395140  1.193573    0        1 -0.780497      1   
16     51    0   2  0.481840  1.174296    0        0 -0.302230      0   
462    52    1   3 -0.804398 -1.177493    0        0  1.784750      0   
186    40    1   0 -1.272121 -1.543756    0        0 -1.519636      1   
...   ...  ...  ..       ...       ...  ...      ...       ...    ...   
221    63    1   0 -0.102814  0.133340    0        0 -0.084837      0   
906    42    1   0  0.481840 -0.406415    0        1  1.263005      0   
1002   66    1   0 -1.155191 -0.676292    0        0 -0.737018      1   
966    58    1   0 -0.219744  0.229725    0        0 -0.823976      1   
191    56    1   1 -0.102814 -0.502800    0        0  0.610824      0   

       oldpeak  slope  ca  thal  
14   -0.075974      1   0     0  
586   0.594807      1   0     3  
16    0.343264      2

In [None]:
print(X_test)

     age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
845   56    1   0  0.014117 -1.216047    0        0 -1.910945      1   
542   62    0   0  0.481840  2.832115    0        0  0.349951      0   
147   41    1   0 -1.272121 -1.447371    0        0  0.393430      0   
711   35    1   0 -0.687467 -0.946170    0        1 -0.823976      1   
724   74    0   1 -0.687467  0.422495    0        0 -1.215284      1   
..   ...  ...  ..       ...       ...  ...      ...       ...    ...   
889   63    0   0  1.066494  3.082715    0        0  0.219515      0   
563   44    1   0 -1.155191  0.827311    0        0  0.176036      0   
654   41    0   1 -0.336675  1.135742    0        1  0.610824      0   
943   65    1   0 -0.687467 -1.350986    0        1 -0.389188      0   
734   52    1   0 -0.219744 -0.830508    1        1  0.306472      1   

      oldpeak  slope  ca  thal  
845  0.846350      1   1     1  
542  0.091721      1   0     2  
147 -0.914451      2   0     3  
711

K-fold Cross Validation:- The dataset is divided into k equal-sized folds (subsets). The model is trained on k-1 folds and validated on the remaining fold.
This process is repeated k times, with each fold being used exactly once as the validation set. The results are averaged to produce a single performance metric.

In [None]:
# list of models
# Candidate classifiers compared with default settings (apart from the
# arguments given explicitly here).
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GaussianNB(),
    XGBClassifier(),
]

In [None]:
def compare_models_cross_validation(model_list=None, features=None, labels=None, cv=5):
  """Print the k-fold cross-validation accuracy of each model.

  Parameters
  ----------
  model_list : iterable of estimators, optional
      Models to evaluate. Defaults to the notebook-level ``models`` list.
  features : array-like, optional
      Training features. Defaults to the notebook-level ``X_train``.
  labels : array-like, optional
      Training labels. Defaults to the notebook-level ``Y_train``.
  cv : int, default 5
      Number of folds passed to ``cross_val_score``.

  Returns
  -------
  None
      The per-fold scores and the mean accuracy (in percent, rounded to two
      decimals) are printed for every model.
  """
  # Fall back to the notebook globals so the original no-argument call
  # keeps working unchanged.
  if model_list is None:
    model_list = models
  if features is None:
    features = X_train
  if labels is None:
    labels = Y_train

  for model in model_list:

    cv_score = cross_val_score(model, features, labels, cv=cv)
    mean_accuracy = round(cv_score.mean() * 100, 2)

    print('Cross Validation accuracies for the', model, '=', cv_score)
    print('Accuracy score of the', model, '=', mean_accuracy, '%')
    print('---------------------------------------------------------------')

In [None]:
compare_models_cross_validation()

Cross Validation accuracies for the LogisticRegression(max_iter=1000) = [0.85365854 0.87195122 0.83536585 0.84756098 0.85365854]
Acccuracy score of the  LogisticRegression(max_iter=1000) = 85.24 %
---------------------------------------------------------------
Cross Validation accuracies for the SVC(kernel='linear') = [0.84146341 0.87804878 0.83536585 0.84146341 0.89634146]
Acccuracy score of the  SVC(kernel='linear') = 85.85 %
---------------------------------------------------------------
Cross Validation accuracies for the KNeighborsClassifier() = [0.80487805 0.81097561 0.85365854 0.79878049 0.83536585]
Acccuracy score of the  KNeighborsClassifier() = 82.07 %
---------------------------------------------------------------
Cross Validation accuracies for the RandomForestClassifier(random_state=0) = [0.9695122  0.98170732 0.98780488 0.97560976 0.99390244]
Acccuracy score of the  RandomForestClassifier(random_state=0) = 98.17 %
----------------------------------------------------------

**Hyperparameter tuning** is the process of optimizing the hyperparameters of a machine learning model to improve its performance. Hyperparameters are the parameters that are set before the learning process begins and cannot be learned from the data. Hyperparameter tuning can be done using GridSearchCV or RandomizedSearchCV.

Grid Search CV:      
->Exhaustively searches over a predefined hyperparameter space.    
->Evaluates all possible combinations of hyperparameter values.       
->Simple but can be computationally expensive, especially with a large number of hyperparameters or possible values.

RandomizedSearch CV:        
->Randomly samples hyperparameters from a specified distribution.        
->Often more efficient than grid search, especially when only a small number of hyperparameter combinations yield good results.

Hyperparameter Tuning using RandomizedSearchCV

In [None]:
# list of models
# Base estimators for the hyperparameter search. Their order must match the
# order of the entries in the hyperparameter dictionary defined below.
models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GaussianNB(),
    XGBClassifier(),
]

In [None]:
# Search space for each estimator, one entry per model and in the same order
# as the `models` list. Each inner dict maps a constructor argument name to
# the list of candidate values to try.
model_hyperparameters = {
    # LogisticRegression
    'log_reg_hyperparameters': {
        'C': [1, 5, 10, 20],
        'max_iter': [100, 500, 1000],
        'solver': ['lbfgs', 'saga', 'liblinear'],
    },
    # SVC
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20],
    },
    # KNeighborsClassifier
    'KNN_hyperparameters': {
        'n_neighbors': [3, 5, 10],
    },
    # RandomForestClassifier
    'random_forest_hyperparameters': {
        'n_estimators': [10, 20, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7],
        'random_state': [0],
        'max_features': ['sqrt', 'log2'],
    },
    # DecisionTreeClassifier
    'decision_tree_hyperparameters': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7],
    },
    # GaussianNB
    'naive_bayes_hyperparameters': {
        'var_smoothing': [1e-9, 1e-8, 1e-7],
    },
    # XGBClassifier
    'xgb_hyperparameters': {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 6, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2],
    },
}

In [None]:
type(model_hyperparameters)

dict

In [None]:
print(model_hyperparameters.keys())

dict_keys(['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters', 'decision_tree_hyperparameters', 'naive_bayes_hyperparameters', 'xgb_hyperparameters'])


In [None]:
model_keys = list(model_hyperparameters.keys())
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters', 'decision_tree_hyperparameters', 'naive_bayes_hyperparameters', 'xgb_hyperparameters']


Applying RandomizedSearchCV

In [None]:
def ModelSelection(list_of_models, hyperparameters_dictionary, n_iter=10, cv=5, random_state=0):
  """Tune each model with RandomizedSearchCV and tabulate the best results.

  Models and search spaces are paired by position: the first model uses the
  first entry of ``hyperparameters_dictionary``, and so on. Every search is
  fitted on the notebook-level ``X_train`` / ``Y_train``.

  Parameters
  ----------
  list_of_models : sequence of estimators
      Base estimators to tune.
  hyperparameters_dictionary : dict
      Maps a label to a ``{parameter name: candidate values}`` dict, ordered
      like ``list_of_models``.
  n_iter : int, default 10
      Maximum number of parameter settings sampled per model. It is capped
      at the size of the grid when every candidate set has a length, so
      small grids are searched exhaustively without a warning.
  cv : int, default 5
      Number of cross-validation folds.
  random_state : int or None, default 0
      Seed for the random sampling, so the reported table is reproducible.

  Returns
  -------
  pandas.DataFrame
      One row per model with the columns 'model used', 'highest score' and
      'best hyperparameters'.

  Raises
  ------
  ValueError
      If there are more models than hyperparameter entries.
  """
  if len(list_of_models) > len(hyperparameters_dictionary):
    raise ValueError('Each model needs a matching entry in hyperparameters_dictionary')

  result = []

  # Pair by position using the dictionary that was actually passed in,
  # rather than a separately maintained global list of keys.
  for model, params in zip(list_of_models, hyperparameters_dictionary.values()):

    print(model)
    print(params)
    print('---------------------------------')

    # Count the distinct combinations when every candidate set is a finite
    # collection; otherwise (e.g. a distribution object) keep n_iter as is.
    candidate_sets = list(params.values())
    if all(hasattr(values, '__len__') for values in candidate_sets):
      grid_size = 1
      for values in candidate_sets:
        grid_size *= len(values)
      search_iterations = min(n_iter, grid_size)
    else:
      search_iterations = n_iter

    classifier = RandomizedSearchCV(
        model,
        params,
        n_iter=search_iterations,
        cv=cv,
        random_state=random_state,
    )

    # fitting the data to classifier
    classifier.fit(X_train, Y_train)

    result.append({
        'model used' : model,
        'highest score' : classifier.best_score_,
        'best hyperparameters' : classifier.best_params_
    })

  result_dataframe = pd.DataFrame(result, columns = ['model used','highest score','best hyperparameters'])

  return result_dataframe

Model Evaluation

In [None]:
ModelSelection(models, model_hyperparameters)

LogisticRegression()
{'C': [1, 5, 10, 20], 'max_iter': [100, 500, 1000], 'solver': ['lbfgs', 'saga', 'liblinear']}
---------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
---------------------------------
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
---------------------------------




RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7], 'random_state': [0], 'max_features': ['sqrt', 'log2']}
---------------------------------
DecisionTreeClassifier(random_state=0)
{'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7]}
---------------------------------




GaussianNB()
{'var_smoothing': [1e-09, 1e-08, 1e-07]}
---------------------------------
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
{'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 6, 7, 9], 'learning_rate': [0.01, 0.1, 0.2]}
---------------------------------




Unnamed: 0,model used,highest score,best hyperparameters
0,LogisticRegression(),0.856098,"{'solver': 'liblinear', 'max_iter': 100, 'C': 1}"
1,SVC(),0.859756,"{'kernel': 'linear', 'C': 5}"
2,KNeighborsClassifier(),0.904878,{'n_neighbors': 3}
3,RandomForestClassifier(random_state=0),0.958537,"{'random_state': 0, 'n_estimators': 100, 'max_..."
4,DecisionTreeClassifier(random_state=0),0.957317,"{'max_depth': 7, 'criterion': 'gini'}"
5,GaussianNB(),0.829268,{'var_smoothing': 1e-09}
6,"XGBClassifier(base_score=None, booster=None, c...",0.980488,"{'n_estimators': 500, 'max_depth': 9, 'learnin..."


Here we find that the XGBoost classifier has the highest score, 0.98.

In [None]:
# Final model: XGBoost, the top-scoring entry in the search results table.
# NOTE(review): that table shows n_estimators=500 and max_depth=9 but the
# learning rate is truncated in the saved output - confirm it was 0.1.
model = XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.1,
)

In [None]:
model.fit(X_train,Y_train)

In [None]:
# Accuracy on the data the model was fitted on.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
print('Accuracy on training data',training_data_accuracy)

Accuracy on training data 1.0


In [None]:
# Accuracy on the held-out rows.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
# NOTE(review): a perfect test score is suspicious. In the printed frames,
# X_train row 14 and X_test row 734 show identical values, which suggests the
# dataset contains duplicate rows shared between the two splits - check with
# heart_data.duplicated().sum() before trusting this number.
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
print('Accuracy on testing data',testing_data_accuracy)

Accuracy on testing data 1.0


Precision, Recall and F1 Score

Precision = True Positives (TP)/(True Positives (TP)+False Positives (FP))

Recall= True Positives (TP)/(True Positives (TP)+False Negatives (FN))

F1 Score=(2×Precision×Recall)/(Precision+Recall)

In [None]:
def precision_recall_f1_score(true_labels, pred_labels):
  """Print precision, recall and F1 score for a set of predictions.

  Parameters
  ----------
  true_labels : array-like
      Ground-truth labels.
  pred_labels : array-like
      Labels predicted by the model.
  """
  scorers = (
      ('Precision =', precision_score),
      ('Recall =', recall_score),
      ('F1 Score =', f1_score),
  )

  # Compute all three metrics before printing anything.
  report = [(caption, scorer(true_labels, pred_labels)) for caption, scorer in scorers]

  for caption, value in report:
    print(caption, value)

In [None]:
# The helper's signature is (true_labels, pred_labels); passing predictions
# first would swap precision and recall.
precision_recall_f1_score(Y_train, X_train_prediction)
print('-----------------------------------------------------')
precision_recall_f1_score(Y_test, X_test_prediction)

Precision = 1.0
Recall = 1.0
F1 Score = 1.0
-----------------------------------------------------
Precision = 1.0
Recall = 1.0
F1 Score = 1.0


Prediction System

In [None]:
# Positions of the standardised columns (trestbps, chol, thalach, oldpeak).
scaled_features = [3, 4, 7, 9]

# One patient record, with values in the same column order as X_train.
input_data = np.array([[52, 1, 3, 152, 298, 1, 1, 178, 0, 1.2, 1, 0, 3]])

# Attach the training column names so the scaler and the model receive the
# same named features they were fitted on.
input_frame = pd.DataFrame(input_data, columns=X_train.columns)

# Scale the features with the statistics learned from the training data
input_frame.iloc[:, scaled_features] = sc.transform(input_frame.iloc[:, scaled_features])

prediction = model.predict(input_frame)
print(prediction)

# predict returns one label per input row; there is a single row here.
if prediction[0] == 1:
  print('The person has heart disease')
else:
  print('The person does not have heart disease')



[1]
The person has heart disease


