In [98]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score

In [99]:
# Data preprocessing

df = pd.read_csv("heart.csv.xls")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [101]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [102]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [103]:
df_headers = np.array(df.columns)
print(df_headers)

['Age' 'Sex' 'ChestPainType' 'RestingBP' 'Cholesterol' 'FastingBS'
 'RestingECG' 'MaxHR' 'ExerciseAngina' 'Oldpeak' 'ST_Slope' 'HeartDisease']


In [104]:
print(pd.unique(df["Sex"]))

['M' 'F']


In [105]:
print(pd.unique(df["ChestPainType"]))
chestPain_map = dict()

for i in range(0,len(pd.unique(df["ChestPainType"]))):
  if pd.unique(df["ChestPainType"])[i] not in chestPain_map:
    chestPain_map[i] = pd.unique(df["ChestPainType"])[i]

print(chestPain_map)

for i in range(0,len(df["ChestPainType"])):
  for k,v in chestPain_map.items():
     if v == df["ChestPainType"][i]:
        df["ChestPainType"][i] = k

print(df["ChestPainType"])

['ATA' 'NAP' 'ASY' 'TA']
{0: 'ATA', 1: 'NAP', 2: 'ASY', 3: 'TA'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ChestPainType"][i] = k


0      0
1      1
2      0
3      2
4      1
      ..
913    3
914    2
915    2
916    0
917    1
Name: ChestPainType, Length: 918, dtype: object


In [106]:
print(pd.unique(df['RestingECG']))
restingECG_map = dict()

for i in range(0,len(pd.unique(df["RestingECG"]))):
  if pd.unique(df["RestingECG"])[i] not in restingECG_map:
    restingECG_map[i] = pd.unique(df["RestingECG"])[i]

print(restingECG_map)

for i in range(0,len(df["RestingECG"])):
  for k,v in restingECG_map.items():
     if v == df["RestingECG"][i]:
        df["RestingECG"][i] = k

print(df["RestingECG"])


['Normal' 'ST' 'LVH']
{0: 'Normal', 1: 'ST', 2: 'LVH'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["RestingECG"][i] = k


0      0
1      0
2      1
3      0
4      0
      ..
913    0
914    0
915    0
916    2
917    0
Name: RestingECG, Length: 918, dtype: object


In [107]:
print(pd.unique(df["ExerciseAngina"]))

['N' 'Y']


In [108]:
print(pd.unique(df["ST_Slope"]))
st_slope_map = dict()

for i in range(0,len(pd.unique(df["ST_Slope"]))):
  if pd.unique(df["ST_Slope"])[i] not in st_slope_map:
    st_slope_map[i] = pd.unique(df["ST_Slope"])[i]

print(st_slope_map)

for i in range(0,len(df["ST_Slope"])):
  for k,v in st_slope_map.items():
     if v == df["ST_Slope"][i]:
        df["ST_Slope"][i] = k

print(df["ST_Slope"])



['Up' 'Flat' 'Down']
{0: 'Up', 1: 'Flat', 2: 'Down'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ST_Slope"][i] = k


0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: ST_Slope, Length: 918, dtype: object


In [109]:
# Encoding the categorical variables

label_encoder_s = LabelEncoder()
label_encoder_e = LabelEncoder()
df["Sex"] = label_encoder_s.fit_transform(df["Sex"])
df["ExerciseAngina"] = label_encoder_e.fit_transform(df["ExerciseAngina"])

In [110]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values


In [111]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0


In [112]:
X.shape

(918, 11)

In [113]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 0)

In [114]:
# Feature Scaling

sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test =  sc_x.transform(X_test)

In [115]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0


# LOGISTIC REGRESSION

In [116]:
log_regressor = LogisticRegression()
log_regressor.fit(X_train,y_train)

y_log_pred = log_regressor.predict(X_test)



#K Nearest Neighbors


In [117]:
knn_classifier = KNeighborsClassifier(n_neighbors=15,metric = "minkowski")
knn_classifier.fit(X_train,y_train)

y_knn_pred = knn_classifier.predict(X_test)

# Naive Baye's


In [118]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train,y_train)

y_nb_pred = nb_classifier.predict(X_test)

# Support Vector Classifier


In [119]:
svc_classifier = SVC(kernel = "rbf")
svc_classifier.fit(X_train,y_train)

y_svc_pred = svc_classifier.predict(X_test)

#Decision Tree Classifier

In [120]:
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(X_train,y_train)

y_dt_pred = dt_classifier.predict(X_test)

# Random Forest Classifier

In [121]:
rf_classifier = RandomForestClassifier(n_estimators=100,random_state=0)
rf_classifier.fit(X_train,y_train)

y_rf_pred = rf_classifier.predict(X_test)


In [122]:
y_pred = [y_log_pred,y_knn_pred,y_nb_pred,y_dt_pred,y_rf_pred,y_svc_pred]
names = ["Logistic Regression","KNearest Neighbors Classifier","Naive Baye's Classifier","Decision Tree Classifier","Random Forest Classifier","Support Vecotr Classifier"]

for i in range(0,len(y_pred)):
  print("The confusion matrix for ",names[i]," is :\n",confusion_matrix(y_test,y_pred[i]))



The confusion matrix for  Logistic Regression  is :
 [[60 17]
 [11 96]]
The confusion matrix for  KNearest Neighbors Classifier  is :
 [[61 16]
 [16 91]]
The confusion matrix for  Naive Baye's Classifier  is :
 [[60 17]
 [14 93]]
The confusion matrix for  Decision Tree Classifier  is :
 [[60 17]
 [28 79]]
The confusion matrix for  Random Forest Classifier  is :
 [[60 17]
 [10 97]]
The confusion matrix for  Support Vecotr Classifier  is :
 [[61 16]
 [11 96]]


In [123]:
accuracy = []
for i in range(0,len(y_pred)):
  accuracy.append(accuracy_score(y_test,y_pred[i]))
  print("The accuracy score for ",names[i]," is :",accuracy_score(y_test,y_pred[i]))

The accuracy score for  Logistic Regression  is : 0.8478260869565217
The accuracy score for  KNearest Neighbors Classifier  is : 0.8260869565217391
The accuracy score for  Naive Baye's Classifier  is : 0.8315217391304348
The accuracy score for  Decision Tree Classifier  is : 0.7554347826086957
The accuracy score for  Random Forest Classifier  is : 0.8532608695652174
The accuracy score for  Support Vecotr Classifier  is : 0.8532608695652174
