Importing the dependencies

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# importing the ML models

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Data Collection and Pre-processing

In [6]:
# Load the heart-disease dataset from CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — confirm
# the location before running this notebook in another environment.
heart_data = pd.read_csv("/content/heart_disease_data.csv")

In [7]:
# Preview the first rows to sanity-check the loaded columns and values.
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
# (rows, columns) of the loaded frame.
heart_data.shape

(303, 14)

In [9]:
# Count the rows per target value to see how balanced the classes are.
heart_data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,165
0,138


Splitting the data into features and target

In [10]:
# Features: every column except the label. Target: the "target" column itself.
X = heart_data.drop(columns=["target"])
Y = heart_data.loc[:, "target"]

In [11]:
# Inspect the feature frame (all columns of heart_data except "target").
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  
0        0   0     1  
1        0   0     2  
2        2   0    

In [12]:
# Inspect the target series.
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64


Model Selection

In [13]:
# Convert the feature frame and the target series to plain NumPy arrays.
X, Y = (np.asarray(part) for part in (X, Y))

Model Selection with cross_val_score

In [14]:
# Candidate classifiers that are compared under the same cross-validation setup.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel="linear"),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [15]:
def compare_different_models(candidate_models=None, features=None, labels=None, cv=5):
  """Print the mean cross-validation accuracy of each candidate model.

  Args:
    candidate_models: iterable of unfitted estimators to evaluate. Defaults to
      the notebook-level ``models`` list.
    features: feature matrix passed to ``cross_val_score``. Defaults to the
      notebook-level ``X``.
    labels: target vector passed to ``cross_val_score``. Defaults to the
      notebook-level ``Y``.
    cv: value forwarded as ``cross_val_score``'s ``cv`` argument. Defaults to 5.

  Returns:
    None. For every model one line is printed with the mean score expressed as
    a percentage rounded to two decimals, followed by a separator line.
  """
  # Fall back to the notebook globals only when an argument is omitted, so the
  # original zero-argument call keeps working unchanged.
  if candidate_models is None:
    candidate_models = models
  if features is None:
    features = X
  if labels is None:
    labels = Y

  for model in candidate_models:
    cv_score = cross_val_score(model, features, labels, cv=cv)
    # Mean fold score as a percentage, rounded for display.
    mean_accuracy = round(cv_score.mean() * 100, 2)
    print("The cross validation score for", model, " = " ,mean_accuracy)
    print("-----------------------------------------------------------")

In [16]:
# Print the mean 5-fold cross-validation accuracy of every model in `models`.
compare_different_models()

The cross validation score for LogisticRegression(max_iter=1000)  =  82.83
-----------------------------------------------------------
The cross validation score for SVC(kernel='linear')  =  82.83
-----------------------------------------------------------
The cross validation score for KNeighborsClassifier()  =  64.39
-----------------------------------------------------------
The cross validation score for RandomForestClassifier(random_state=0)  =  83.81
-----------------------------------------------------------


Select the model with the highest accuracy (here, the random forest at 83.81%)

Comparing the models with different hyperparameters using GridSearchCV

In [23]:
# Base estimator for the grid search and the final fit. Despite the name, this
# is a single random-forest model. The seed is fixed (same value as the forest
# in `models`) so the grid-search scores and the reported accuracies are
# reproducible across runs.
models_list = RandomForestClassifier(random_state=0)

In [24]:
# Search grid: the number of trees to try for the random forest.
hyperparameters = dict(n_estimators=[10, 20, 50, 100])

In [25]:
# 5-fold grid search over `hyperparameters`, using `models_list` as the base estimator.
classifier = GridSearchCV(
    estimator=models_list,
    param_grid=hyperparameters,
    cv=5,
)

In [26]:
# Run the grid search: every n_estimators candidate is scored with 5-fold CV.
# NOTE(review): this fits on the full X, Y — including the rows that the later
# train_test_split cell holds out as X_test — so the tuning has already seen
# the test data. Consider running the search on the training split only.
classifier.fit(X,Y)

In [27]:
# Raw grid-search results per candidate: fit/score timings plus per-split and mean test scores.
classifier.cv_results_

{'mean_fit_time': array([0.01796794, 0.03544621, 0.08190603, 0.19821897]),
 'std_fit_time': array([0.00080637, 0.00172215, 0.00518936, 0.04518651]),
 'mean_score_time': array([0.00202756, 0.00258231, 0.00448389, 0.01043887]),
 'std_score_time': array([9.63871088e-05, 2.01372515e-04, 1.66001966e-04, 2.79613652e-03]),
 'param_n_estimators': masked_array(data=[10, 20, 50, 100],
              mask=[False, False, False, False],
        fill_value=999999),
 'params': [{'n_estimators': 10},
  {'n_estimators': 20},
  {'n_estimators': 50},
  {'n_estimators': 100}],
 'split0_test_score': array([0.80327869, 0.81967213, 0.81967213, 0.85245902]),
 'split1_test_score': array([0.85245902, 0.8852459 , 0.86885246, 0.8852459 ]),
 'split2_test_score': array([0.78688525, 0.7704918 , 0.78688525, 0.81967213]),
 'split3_test_score': array([0.71666667, 0.85      , 0.8       , 0.85      ]),
 'split4_test_score': array([0.71666667, 0.78333333, 0.8       , 0.76666667]),
 'mean_test_score': array([0.77519126, 0.8

In [28]:
# Parameter setting that achieved the best mean cross-validated score.
classifier.best_params_

{'n_estimators': 100}

Train Test Split

In [32]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [33]:
# Show the shapes of the full, training and test feature matrices side by side.
print(*(part.shape for part in (X, X_train, X_test)))

(303, 13) (242, 13) (61, 13)


In [34]:
# Apply the hyperparameters chosen by the grid search before the final fit.
# Without this step the search result is never used and the model silently
# trains with the estimator's default settings.
models_list.set_params(**classifier.best_params_)
models_list.fit(X_train, Y_train)

Model Evaluation

In [36]:
# Accuracy on the training split.
X_train_prediction = models_list.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first. The
# value is the same either way for accuracy, but the order matters as soon as
# an asymmetric metric (precision, recall, ...) is substituted.
X_train_accuracy = accuracy_score(Y_train, X_train_prediction)
print(X_train_accuracy)

1.0


In [37]:
# Accuracy on the held-out test split.
# NOTE(review): compare this with the training accuracy — a large gap between
# the two suggests the model is overfitting the training rows.
X_test_prediction = models_list.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
X_test_accuracy = accuracy_score(Y_test, X_test_prediction)
print(X_test_accuracy)

0.7868852459016393


Building a predictive model

In [39]:
# Classify one hand-entered record with the fitted model.
# Author's note: standardizing the data might give better results.
# NOTE(review): scaling typically matters for distance- or margin-based models
# rather than tree ensembles — confirm it would help the model used here.

# One record with 13 values — presumably in the same order as the columns of X; verify.
input_data = (44,1,1,120,263,0,1,173,0,0,2,0,3)

# predict() is given a 2-D array, so the record is reshaped into a single row.
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = models_list.predict(input_data_reshaped)

# A predicted label of 0 is reported as "no heart disease"; anything else as disease.
print(
    "The person does not have heart disease"
    if prediction[0] == 0
    else "The person has heart disease"
)

The person has heart disease
