In [1]:
# importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split            # splitting data into 80-20
from sklearn.model_selection import GridSearchCV                # optimizing model parameters

In [2]:
# importing the dataset
data = pd.read_excel('Heart_Disease_Data.xlsx')
data.head()

Unnamed: 0,Age,Sex,CP,RestBP,Cholesteral,FBP,RestECG,Max_HR,ExAngina,Oldpeak,Slope,CA,Thal,Result
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Prelienary Analysis of given dataset

In [3]:
# checking for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          303 non-null    int64  
 1   Sex          303 non-null    int64  
 2   CP           303 non-null    int64  
 3   RestBP       303 non-null    int64  
 4   Cholesteral  303 non-null    int64  
 5   FBP          303 non-null    int64  
 6   RestECG      303 non-null    int64  
 7   Max_HR       303 non-null    int64  
 8   ExAngina     303 non-null    int64  
 9   Oldpeak      303 non-null    float64
 10  Slope        303 non-null    int64  
 11  CA           303 non-null    int64  
 12  Thal         303 non-null    int64  
 13  Result       303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
data[['Age','RestBP','Cholesteral','Max_HR','Oldpeak']].describe()

Unnamed: 0,Age,RestBP,Cholesteral,Max_HR,Oldpeak
count,303.0,303.0,303.0,303.0,303.0
mean,54.366337,131.623762,246.264026,149.646865,1.039604
std,9.082101,17.538143,51.830751,22.905161,1.161075
min,29.0,94.0,126.0,71.0,0.0
25%,47.5,120.0,211.0,133.5,0.0
50%,55.0,130.0,240.0,153.0,0.8
75%,61.0,140.0,274.5,166.0,1.6
max,77.0,200.0,564.0,202.0,6.2


In [5]:
# categorical variables
print(data.Sex.value_counts().sort_index())
print(data.CP.value_counts().sort_index())
print(data.FBP.value_counts().sort_index())
print(data.RestECG.value_counts().sort_index())
print(data.ExAngina.value_counts().sort_index())
print(data.Slope.value_counts().sort_index())
print(data.CA.value_counts().sort_index())
print(data.Thal.value_counts().sort_index())
print(data.Result.value_counts().sort_index())

0     96
1    207
Name: Sex, dtype: int64
0    143
1     50
2     87
3     23
Name: CP, dtype: int64
0    258
1     45
Name: FBP, dtype: int64
0    147
1    152
2      4
Name: RestECG, dtype: int64
0    204
1     99
Name: ExAngina, dtype: int64
0     21
1    140
2    142
Name: Slope, dtype: int64
0    175
1     65
2     38
3     20
4      5
Name: CA, dtype: int64
0      2
1     18
2    166
3    117
Name: Thal, dtype: int64
0    138
1    165
Name: Result, dtype: int64


In [6]:
# separating x's & y's
x = data.iloc[:,:-1]
y = data.Result

In [7]:
# splitting the data into train data(80%) & test data(20%)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=100)

In [8]:
y_train.count(),y_test.count()

(242, 61)

# Logistic Regression

In [9]:
Reg_model = LogisticRegression().fit(x_train,y_train)
Reg_accuracy = Reg_model.score(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# %accuracy
round(Reg_accuracy*100,2)

86.36

In [11]:
# % mis-classification
Reg_misclassification = 100 - round(Reg_accuracy*100,2)
round(Reg_misclassification,2)

13.64

In [12]:
# Actual Vs Predicted Matrix
Reg_pred = Reg_model.predict(x_train)
Reg_table = pd.crosstab(y_train, Reg_pred)
Reg_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,80,25
1,8,129


In [13]:
# Model performance on test data
Reg_accuracy_test = Reg_model.score(x_test,y_test)

In [14]:
# % accuracy
round(Reg_accuracy_test*100,2)

85.25

In [15]:
# % mis-classification
Reg_misclassification_test = 100 - round(Reg_accuracy_test*100,2)
round(Reg_misclassification_test,2)

14.75

In [16]:
# Actual Vs Predicted Matrix
Reg_pred_test = Reg_model.predict(x_test)
Reg_table_test = pd.crosstab(y_test, Reg_pred_test)
Reg_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26,7
1,2,26


# Classification Tree

In [17]:
# assuming minimum samples split to be 25
CT_model = tree.DecisionTreeClassifier(min_samples_split=25).fit(x_train,y_train)
CT_accuracy = CT_model.score(x_train,y_train)
round(CT_accuracy*100,2)

85.54

In [18]:
# Hyper parameter tuning 
parameters = [{'criterion':['gini','entropy'],'min_samples_split':[5,10,15,20,25,30,35]}]
# optimum search
search=GridSearchCV(CT_model, parameters, scoring='accuracy', cv=5, verbose=True, n_jobs=-1).fit(x_train,y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    3.4s finished


In [19]:
# optimum parameter values
search.best_params_

{'criterion': 'gini', 'min_samples_split': 10}

In [20]:
# specifying the optimum Classification Tree model
CT_model = tree.DecisionTreeClassifier(criterion='gini', min_samples_split=10).fit(x_train,y_train)
CT_accuracy = CT_model.score(x_train,y_train)
round(CT_accuracy*100,2)

92.56

In [21]:
CT_misclassification = 100 - round(CT_accuracy*100,2)
round(CT_misclassification,2)

7.44

In [22]:
# Actual Vs Predicted Matrix
CT_pred = CT_model.predict(x_train)
CT_table = pd.crosstab(y_train, CT_pred)
CT_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,12
1,6,131


In [23]:
# Model performance on test data
CT_accuracy_test = CT_model.score(x_test,y_test)
round(CT_accuracy_test*100,2)

77.05

In [24]:
CT_misclassification_test = 100 - round(CT_accuracy_test*100,2)
round(CT_misclassification_test,2)

22.95

In [25]:
# Actual Vs Predicted Matrix
CT_pred_test = CT_model.predict(x_test)
CT_table_test = pd.crosstab(y_test, CT_pred_test)
CT_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23,10
1,4,24


# Bagging

In [26]:
# assuming number of estimators as 500 & minimum samples split to be 25
Bag_model = RandomForestClassifier(n_estimators=500, min_samples_split=25, max_features=None).fit(x_train,y_train)
Bag_accuracy = Bag_model.score(x_train,y_train)
round(Bag_accuracy*100,2)

89.67

In [27]:
# Hyper parameter tuning 
parameters = [{'criterion':['gini','entropy'],'n_estimators':[100,200,300,400,500,600],'min_samples_split':[5,10,15,20,25,30,35]}]
# optimum search
search = GridSearchCV(Bag_model, parameters, scoring='accuracy', cv=5, verbose=True, n_jobs=-1).fit(x_train,y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:  1.6min finished


In [28]:
search.best_params_

{'criterion': 'entropy', 'min_samples_split': 10, 'n_estimators': 100}

In [29]:
# specifying the optimum bagging model
Bag_model = RandomForestClassifier(criterion='entropy', n_estimators=100, min_samples_split=10, max_features=None).fit(x_train,y_train)
Bag_accuracy = Bag_model.score(x_train,y_train)
round(Bag_accuracy*100,2)

95.87

In [30]:
Bag_misclassification = 100 - round(Bag_accuracy*100,2)
round(Bag_misclassification,2)

4.13

In [31]:
# Actual Vs Predicted Matrix
Bag_pred = Bag_model.predict(x_train)
Bag_table = pd.crosstab(y_train, Bag_pred)
Bag_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,98,7
1,3,134


In [32]:
# Model performance on test data
Bag_accuracy_test = Bag_model.score(x_test,y_test)
round(Bag_accuracy_test*100,2)

85.25

In [33]:
Bag_misclassification_test = 100 - round(Bag_accuracy_test*100,2)
round(Bag_misclassification_test,2)

14.75

In [34]:
# Actual Vs Predicted Matrix
Bag_pred_test = Bag_model.predict(x_test)
Bag_table_test = pd.crosstab(y_test, Bag_pred_test)
Bag_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,8
1,1,27


# Random Forest

In [35]:
# assuming number of estimators as 500 & minimum samples split to be 25
RF_model = RandomForestClassifier(n_estimators=500, min_samples_split=25, max_features='sqrt').fit(x_train,y_train)
RF_accuracy = RF_model.score(x_train,y_train)
round(RF_accuracy*100,2)

89.67

In [36]:
# Hyper parameter tuning 
parameters = [{'criterion':['gini','entropy'],'n_estimators':[100,200,300,400,500,600],'min_samples_split':[5,10,15,20,25,30,35]}]
# optimum search
search = GridSearchCV(RF_model, parameters, scoring='accuracy', cv=5, verbose=True, n_jobs=-1).fit(x_train,y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:  1.4min finished


In [37]:
search.best_params_

{'criterion': 'entropy', 'min_samples_split': 20, 'n_estimators': 200}

In [38]:
# specifying the optimum Random Forest model
RF_model = RandomForestClassifier(criterion='entropy', n_estimators=200, min_samples_split=20,max_features='sqrt').fit(x_train,y_train)
RF_accuracy = RF_model.score(x_train,y_train)
round(RF_accuracy*100,2)

91.32

In [39]:
RF_misclassification = 100 - round(RF_accuracy*100,2)
round(RF_misclassification,2)

8.68

In [40]:
# Actual Vs Predicted Matrix
RF_pred = RF_model.predict(x_train)
RF_table = pd.crosstab(y_train, RF_pred)
RF_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89,16
1,5,132


In [41]:
# Model performance on test data
RF_accuracy_test = RF_model.score(x_test,y_test)
round(RF_accuracy_test*100,2)

86.89

In [42]:
RF_misclassification_test = 100 - round(RF_accuracy_test*100,2)
round(RF_misclassification_test,2)

13.11

In [43]:
# Actual Vs Predicted Matrix
RF_pred_test = RF_model.predict(x_test)
RF_table_test = pd.crosstab(y_test, RF_pred_test)
RF_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26,7
1,1,27


# Naive Bayes 

In [44]:
NB_model = GaussianNB().fit(x_train,y_train)

In [45]:
NB_accuracy = NB_model.score(x_train,y_train)
round(NB_accuracy*100,2)

83.88

In [46]:
NB_misclassification = 100 - round(NB_accuracy*100,2)
round(NB_misclassification,2)

16.12

In [47]:
# Actual Vs Predicted Matrix
NB_pred = NB_model.predict(x_train)
NB_table = pd.crosstab(y_train, NB_pred)
NB_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,24
1,15,122


In [48]:
# Model performance on test data
NB_accuracy_test = NB_model.score(x_test,y_test)
round(NB_accuracy_test*100,2)

86.89

In [49]:
NB_misclassification_test = 100 - round(NB_accuracy_test*100,2)
round(NB_misclassification_test,2)

13.11

In [50]:
# Actual Vs Predicted Matrix
NB_pred_test = NB_model.predict(x_test)
NB_table_test = pd.crosstab(y_test, NB_pred_test)
NB_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,28,5
1,3,25


# K Nearest Neighbors

In [51]:
# assuming n neighbors as 5
KNN_model = KNeighborsClassifier(n_neighbors=5).fit(x_train,y_train)
KNN_accuracy = KNN_model.score(x_train,y_train)
round(KNN_accuracy*100,2)

75.21

In [52]:
# hyperparameter tuning
parameters = [{'n_neighbors':[3,4,5,6,7,8,9,10,11,12,13,14,15]}]
# optimum search
search = GridSearchCV(KNN_model, parameters, scoring='accuracy',cv=5,verbose=True).fit(x_train,y_train)

Fitting 5 folds for each of 13 candidates, totalling 65 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  65 out of  65 | elapsed:    0.4s finished


In [53]:
search.best_params_

{'n_neighbors': 15}

In [54]:
# specifying the optimum model
KNN_model = KNeighborsClassifier(n_neighbors=15).fit(x_train,y_train)
KNN_accuracy = KNN_model.score(x_train,y_train)
round(KNN_accuracy*100,2)

66.12

In [55]:
KNN_misclassification = 100 - round(KNN_accuracy*100,2)
round(KNN_misclassification,2)

33.88

In [56]:
# Actual Vs Predicted Matrix
KNN_pred = KNN_model.predict(x_train)
KNN_table = pd.crosstab(y_train, KNN_pred)
KNN_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50,55
1,27,110


In [57]:
# Model performance on test data
KNN_accuracy_test = KNN_model.score(x_test,y_test)
round(KNN_accuracy_test*100,2)

67.21

In [58]:
KNN_misclassification_test = 100 - round(KNN_accuracy_test*100,2)
round(KNN_misclassification_test,2)

32.79

In [60]:
# Actual Vs Predicted Matrix
KNN_pred_test = KNN_model.predict(x_test)
KNN_table_test = pd.crosstab(y_test, KNN_pred_test)
KNN_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,16
1,4,24


# Support Vector Machine

In [61]:
SVM_model = svm.SVC().fit(x_train,y_train)

In [62]:
# hyperparameter tuning
parameters = [{'kernel': ['linear','poly','rbf'],'C': [0.001,0.01,0.1,0,1,5,10,100],'degree':[1,2,3]}]
# optimum search
search = GridSearchCV(SVM_model, parameters, scoring='accuracy', cv=5, verbose=True).fit(x_train,y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\K.KALYANKUMAR\Downloads\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\K.KALYANKUMAR\Downloads\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\Users\K.KALYANKUMAR\Downloads\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 268, in _dense_fit
    self._probB, self.fit_status_ = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 191, in sklearn.svm._libsvm.fit
ValueError: C <= 0

Traceback (most recent call last):
  File "C:\Users\K.KALYANKUMAR\Downloads\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\K.KALYANKUMAR\Downloads\Anac

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  2.6min finished


In [63]:
search.best_params_

{'C': 10, 'degree': 1, 'kernel': 'linear'}

In [64]:
# specifying the optimum model
SVM_model = svm.SVC(kernel='linear', C=10, degree=1, probability=True).fit(x_train,y_train)               
SVM_accuracy = SVM_model.score(x_train,y_train)
round(SVM_accuracy*100,2)

86.36

In [65]:
SVM_misclassification = 100 - round(SVM_accuracy*100,2)
round(SVM_misclassification,2)

13.64

In [66]:
# Actual Vs Predicted Matrix
SVM_pred = SVM_model.predict(x_train)
SVM_table = pd.crosstab(y_train, SVM_pred)
SVM_table

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,82,23
1,10,127


In [67]:
# Model performance on test data
SVM_accuracy_test = SVM_model.score(x_test,y_test)
round(SVM_accuracy_test*100,2)

80.33

In [68]:
SVM_misclassification_test = 100 - round(SVM_accuracy_test*100,2)
round(SVM_misclassification_test,2)

19.67

In [69]:
# Actual Vs Predicted Matrix
SVM_pred_test = SVM_model.predict(x_test)
SVM_table_test = pd.crosstab(y_test, SVM_pred_test)
SVM_table_test

col_0,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,8
1,4,24
